Attachment: Source Code function [intensity_data, intensity_pos]=readintensity(filename) % readintensity - reads in intensity data coming out of GSEQ. The first row % of intensity data is dummie row. The second row contains sample id and % intensity value for sense and antisense % [intensity_data, intensity_pos]=readintensity(filename) % % Result: % intensity_data: returns (16544,n*8) matrix which stores all intensity % readings for n samples. % intensity_pos: stores the position of each row % % Parameters: % filename: intensity readings file name from GSEQ % % Created for Matlab R14 % version 0.9 beta (April, 2011) % Mike Xie % email: hongbo.michael.xie@gmail.com % % File history: % 0.1 (March, 2010) - created % 0.9 beta (April, 2011) - modified version % read in fid=fopen(filename,'r'); % get rid of first row line=fgetl(fid); % get the second row to find sample id, not sure if you need it, but keep % it as an option, in case you may need it later line=fgetl(fid); index=[findstr(line,char(9)) length(line)+1]; st=1; for i=3:8:length(index) templine=line(index(i-1)+1:index(i)-1); indextemp=findstr(templine,'_'); id_intensity{st}=templine(1:indextemp(1)-1); st=st+1; end % make is faster intensity_data=zeros(16544,length(id_intensity)*8); st=0; intensity_pos=[]; while feof(fid)==0 line=fgetl(fid); if isempty(findstr(line,'human_mtDNA_RCRS'))==0 st=st+1; index=findstr(line,char(9)); intensity_data(st,:)=str2num(line(index(2)+1:length(line))); templine=line(index(1)+1:index(2)-1); index1=findstr(templine,'-'); % store position, not sure if necessary since things are pretty % standard intensity_pos(st)=str2num(templine(index1(1)+1:length(templine))); end end fclose(fid); function qualitycontrol(qc,idlist_h) % qualitycontrol - identify quality issues with quality score for each % sample % qualitycontrol(qc,idlist_h) % Parameters: % qc: quality score from haploid model % idlist_h: id list % Created for Matlab R14 % version 0.9 beta (April, 2011) % Mike Xie % email: hongbo.michael.xie@gmail.com % % File history: % 0.1 (March, 2010) - created % 0.9 beta (April, 2011) - modified version data_gseq=qc(1:16544,:); figure imagesc(corrcoef(data_gseq)); xlabel('Sample id') ylabel('Sample id') title('Heatmap Correlation Coefficient ') pct_gseq=[]; tempdata=[]; % plot 25 moving window for i=1:length(idlist_h) st=1; k=zeros(16544-24,1); for j=25:16544 k(st)=mean(data_gseq(j-24:j,i)); st=st+1; end tempdata=[tempdata k]; end %plot 50 moving window tempdata1=[]; for i=1:length(idlist_h); st=1; k=zeros(16544-49,1); for j=50:16544 k(st)=mean(data_gseq(j-49:j,i)); st=st+1; end tempdata1=[tempdata1 k]; end figure; subplot(2,1,1),plot(tempdata); % plot quality score distribution xlabel('Genomic Position') ylabel('Quality Score') title('Quality Score Distribution') ylim([-5, max(max(tempdata))+5]); xlim([1 16544]); subplot(2,1,2),plot(tempdata1); xlim([1 16544]); ylim([-5, max(max(tempdata1))+5]); xlabel('Genomic Position') ylabel('Quality Score') title('Quality Score Distribution') figure; hist(data_gseq,20); xlabel('Quality Score') ylabel('# of base') title('Histograph of quality score ') figure [a,b]=size(qc); c=mod(b,4); if c==0 d=b/4; else d=ceil(b/4); end st=0; for i=1:b if st<20 st=st+1; subplot(5,4,st); hist(data_gseq(:,i),20); else st=1; figure; subplot(5,4,st); hist(data_gseq(:,i),20); end xlabel('Quality Score') ylabel('# of base') name=strrep(idlist_h(i).id,'_','-'); title(['Histograph-' name]); end function [finalcheck,idlist_h]=detectheteroplasmy(intensity_data,check_index,filename,idlist_h,removelist,refseq) % detectheteroplasmy - Detecting potential heteroplasmy sites using % intensity profiles of samples. It compares intensity profile of given % base position within sample itself and across all samples to identify % potential heteroplasmy loci. % finalcheck=detectheteroplasmy(intensity_data,check_index,filename,idlist_h,removelist,refseq) % % Result: % finalcheck: stores the sample index and position which are identified % as heteroplasmy bases % idlist_h: store the result into structure format % % Parameters: % intensity_data: read_in data from function readintensity(). % check_index: to reduce computational cost, we only consider loci % where there is a difference between GSEQ diploid model and haploid % model; this var stores all sample index and its positions as detected % difference between two GSEQ models % filename: files you want to store your heteroplasmy results % idlist_h: Sample information detected and parsed from previous steps. % such as SNPs, indels, etc; using post_clause _h is due to we mainly % use haploid model results from GSEQ % removelist: sample index you want to remove due to quality problems % refseq: the sanger reference sequences for output purpose % % This file does not require the Statistics Toolbox. % % Created for Matlab R14 % version 0.9 beta (April, 2011) % Mike Xie % email: hongbo.michael.xie@gmail.com % % File history: % 0.1 (March, 2010) - created % 0.9 beta (April, 2011) - modified version % store the column index of removed samples 8 column each sample, sense and % antisense removeblock=[]; for i=1:length(removelist) removeblock=[removeblock removelist(i)*8-7:removelist(i)*8]; end % find the dimension of the intensity data [a,b]=size(intensity_data); %find the column index of good samples index_good_data=setdiff(1:b,removeblock); % retrive data from good samples to start idg=intensity_data(:,index_good_data); % sample index for good samples index_good=setdiff(1:length(idlist_h),removelist); % size of good sample [c,d]=size(idg); % we want to modify raw data as equation 1, fdata will store the modified % data in order to detect heteroplasmy fdata=[]; % we convert the data according to equation 1 for j=1:length(index_good) % for one sample, get its intensity readings for 8 columns index=j*8-7:j*8; data=idg(:,index); % I want to store the converted data difff=[]; for i=1:a % sense log intensity ts1=log(data(i,1:4)/min(data(i,1:4))); %antisense tas1=log(data(i,5:8)/min(data(i,5:8))); % sum of intensity (product of intensity) t1=(ts1([4 3 2 1])+tas1)/2; [ta,tb]=sort(t1,'descend'); % convert as suggested in paper difff=[difff; ta(1)+ta(4)-ta(2)*2]; end fdata=[fdata difff]; end % store my result of heteroplasmy detection finalcheck=[]; fid=fopen(filename,'w'); base='acgt'; fprintf(fid,'%1s\t%1s\t%1s\t%1s\t%1s\t%1s\t%1s\t%1s\n','id','position','reference sequence','aboundant base','fraction %','minor base','fraction %','evidence'); for i=1:c s1=std(fdata(i,:)); m1=mean(fdata(i,:)); % you can also use other method to detect "outliers" index1=find(fdata(i,:)=thrsh calli=lower(nuct(b(1))); if strcmp(refcall,calli)==0 fprintf(fidcmp,'%1s\t%1i\t%1s\t%1s\t%6.4f\n',idlist_h(i).id,basepos,refcall,calli,calli_qual); end end fprintf(fid(i),'%1s\t%1s\t%6.4f\n',dummie{j},calli,calli_qual); if strcmp(refcall,calli)==0 & strcmp(calli,'n')==0 fprintf(fidsnp,'%1s\t%1i\t%1s\t%1s\t%6.4f\n',idlist_h(i).id,basepos,refcall,calli,calli_qual); snpcall{i}=[snpcall{i} basepos]; end end end fclose(fidsnp); fclose(fidcmp); for i=1:nfiles fclose(fid(i)); end function output(qc) % output - output results % qualitycontrol(qc,idlist_h) % Parameters: % qc: quality score from haploid model % Created for Matlab R14 % version 0.9 beta (April, 2011) % Mike Xie % email: hongbo.michael.xie@gmail.com % % File history: % 0.1 (March, 2010) - created % 0.9 beta (April, 2011) - modified version data_gseq=qc(1:16544,:); pct_gseq=[]; [c,d]=size(data_gseq); for i=1:d pct_gseq=[pct_gseq length(find(data_gseq(:,i)>=3))/16544*100]; end fid=fopen('chplist.txt','r'); [gene,st,ed]=textread('genelist1.txt','%s %u %u'); finalcall=[]; totalmiss=[]; stt=0; id=[]; while feof(fid)==0 line=fgetl(fid); stt=stt+1; id{stt}=line; [pos,ref,cal,qual]=textread(line,'%*u %*s %u %*u %s %*s %s %s','delimiter',char(9),'headerlines',2); index=find(pos>3106); pos(index)=pos(index)+1; cal_temp=[]; miss=zeros(length(gene),1); for i=1:length(cal) cal_temp=[cal_temp;cal{i}]; end totalmiss=[totalmiss;100-length(find(cal_temp=='n'))/length(cal_temp)*100]; for i=1:length(gene) index=find(pos>=st(i)-12 & pos<=ed(i)-12); for j=1:length(index) if strcmp(cal{index(j)},'n') miss(i)=miss(i)+1; end end miss(i)=100-miss(i)/length(index)*100; end finalcall=[finalcall miss]; end fclose(fid); fid=fopen('summaryresult.txt','w'); fprintf(fid,'%1s\t','gene'); for i=1:length(id)-1 fprintf(fid,'%1s\t',id{i}); end fprintf(fid,'%1s\n',id{length(id)}); fprintf(fid,'%1s\t','raw_totalpct'); for i=1:length(id)-1 fprintf(fid,'%6.4f\t',pct_gseq(i)); end fprintf(fid,'%6.4f\n',pct_gseq(length(id))); fprintf(fid,'%1s\t','totalpct'); for i=1:length(id)-1 fprintf(fid,'%6.4f\t',totalmiss(i)); end fprintf(fid,'%6.4f\n',totalmiss(length(id))); for i=1:length(gene) fprintf(fid,'%1s\t',gene{i}); for j=1:length(id)-1 fprintf(fid,'%6.4f\t',finalcall(i,j)); end fprintf(fid,'%6.4f\n',finalcall(i,j)); end fclose(fid); function [header,seq]=readfasta(txtfile) fid=fopen(txtfile,'rt'); header=fgetl(fid); seq=[]; while feof(fid)==0 seq=[seq fgetl(fid)]; end %strrep(seq,'//',''); fclose(fid); function [bc,qc,tilepos, idlist,qci,dummie]=readbasecall(filename) % readbasecall - read snp calls from GSEQ output % [bc,qc,tilepos, idlist,qci,dummie]=readbasecall(filename) % % Result: % bc: basce call from GSEQ % qc: quality scores % qci: index column for quality score readings % idlist: store the id list of samples, this will also be the main % var to store results % tilepos: Tiling Position % dummie: annotation for future output. ( I am lazy here) % % Parameters: % filename: input file name % Created for Matlab R14 % version 0.9 beta (April, 2011) % Mike Xie % email: hongbo.michael.xie@gmail.com % % File history: % 0.1 (March, 2010) - created % 0.9 beta (April, 2011) - modified version fid=fopen(filename,'r'); % get rid of the first line of file (junk) line=fgetl(fid); %get the second line of data to find out where to look line=fgetl(fid); index=[findstr(line,char(9)) length(line)+1]; %base call index for each sample bci=[]; %call quality column index for each sample qci=[]; st=0; for i=1:length(index)-1 % identify position of base call column if findstr(line(index(i)+1:index(i+1)-1),'_Call') bci=[bci i]; st=st+1; idlist(st).id=line(index(i)+1:index(i+1)-6); end % identify position of call quality column if findstr(line(index(i)+1:index(i+1)-1),'_Quality') qci=[qci i]; end % identify tiling position in order to find out where to check if findstr(line(index(i)+1:index(i+1)-1),'Tiling Pos') tileposi=i; end end % all base cals bc=[]; % call quality score qc=zeros(36942,length(idlist)); tilepos=[]; st=0; dummie_index=min(bci); while feof(fid)==0 % start to process data line=fgetl(fid); if isempty(findstr(line,'AFFX')) % skip affy control probe st=st+1; index=[findstr(line,char(9)) length(line)+1]; dummie{st}=line(1:index(dummie_index)-1); % set some dummie thing to take the readings bctemp=''; qctemp=[]; for i=1:length(bci) % accumulate quality score and base call bctemp=[bctemp line(index(bci(i))+1:index(bci(i)+1)-1)]; qctemp=[qctemp str2num(line(index(qci(i))+1:index(qci(i)+1)-1))]; end bc=[bc;bctemp]; qc(st,:)=qctemp; tilepos=[tilepos;str2num(line(index(tileposi)+1:index(tileposi+1)-1))]; end end fclose(fid);