Attachment: Source Code


function [intensity_data, intensity_pos]=readintensity(filename)


% readintensity - reads in intensity data coming out of GSEQ. The first row
% of intensity data is dummie row. The second row contains sample id and
% intensity value for sense and antisense

% [intensity_data, intensity_pos]=readintensity(filename)
%
% Result:
%     intensity_data: returns (16544,n*8) matrix which stores all intensity
%      readings for n samples.
%     intensity_pos: stores the position of each row
%
% Parameters:
%     filename: intensity readings file name from GSEQ
%
% Created for Matlab R14
% version 0.9 beta (April, 2011)
% Mike Xie
% email: hongbo.michael.xie@gmail.com
%
% File history:
%   0.1 (March, 2010) - created
%   0.9 beta (April, 2011) - modified version


% read in
fid=fopen(filename,'r');

% get rid of first row
line=fgetl(fid);

% get the second row to find sample id, not sure if you need it, but keep
% it as an option, in case you may need it later
line=fgetl(fid);
index=[findstr(line,char(9)) length(line)+1];
st=1;
for i=3:8:length(index)
    templine=line(index(i-1)+1:index(i)-1);
    indextemp=findstr(templine,'_');
    id_intensity{st}=templine(1:indextemp(1)-1);
    st=st+1;
end

% make is faster
intensity_data=zeros(16544,length(id_intensity)*8);
st=0;
intensity_pos=[];
while feof(fid)==0
    line=fgetl(fid);
    if isempty(findstr(line,'human_mtDNA_RCRS'))==0
        st=st+1;
        index=findstr(line,char(9));
        intensity_data(st,:)=str2num(line(index(2)+1:length(line)));
        
        templine=line(index(1)+1:index(2)-1);
        index1=findstr(templine,'-');
        % store position, not sure if necessary since things are pretty
        % standard
        intensity_pos(st)=str2num(templine(index1(1)+1:length(templine)));
    end
end
fclose(fid);


function qualitycontrol(qc,idlist_h)
% qualitycontrol - identify quality issues with quality score for each
% sample

% qualitycontrol(qc,idlist_h)

% Parameters:
%     qc: quality score from haploid model
%     idlist_h: id list

% Created for Matlab R14
% version 0.9 beta (April, 2011)
% Mike Xie
% email: hongbo.michael.xie@gmail.com
%
% File history:
%   0.1 (March, 2010) - created
%   0.9 beta (April, 2011) - modified version

data_gseq=qc(1:16544,:);
figure


imagesc(corrcoef(data_gseq));
xlabel('Sample id')
ylabel('Sample id')
title('Heatmap Correlation Coefficient ')

pct_gseq=[];
tempdata=[];
% plot 25 moving window
for i=1:length(idlist_h)
    st=1;
    k=zeros(16544-24,1);
    for j=25:16544
        k(st)=mean(data_gseq(j-24:j,i));
        st=st+1;
    end
    tempdata=[tempdata k];
end

%plot 50 moving window
tempdata1=[];
for i=1:length(idlist_h);
    st=1;
    k=zeros(16544-49,1);
    for j=50:16544
        k(st)=mean(data_gseq(j-49:j,i));
        st=st+1;
    end
    tempdata1=[tempdata1 k];
end
figure;
subplot(2,1,1),plot(tempdata);

% plot quality score distribution
xlabel('Genomic Position')
ylabel('Quality Score')
title('Quality Score Distribution')
ylim([-5, max(max(tempdata))+5]);
xlim([1 16544]);

subplot(2,1,2),plot(tempdata1);
xlim([1 16544]);
ylim([-5, max(max(tempdata1))+5]);
xlabel('Genomic Position')
ylabel('Quality Score')
title('Quality Score Distribution')
figure;
hist(data_gseq,20);
xlabel('Quality Score')
ylabel('# of base')
title('Histograph of quality score ')

figure
[a,b]=size(qc);
c=mod(b,4);
if c==0
    d=b/4;
else
    d=ceil(b/4);
end
st=0;
for i=1:b
    if st<20
        st=st+1;
        subplot(5,4,st);
        hist(data_gseq(:,i),20);
    else
        st=1;
        figure;
        subplot(5,4,st);
        hist(data_gseq(:,i),20);
    end
    xlabel('Quality Score')
    ylabel('# of base')
    name=strrep(idlist_h(i).id,'_','-');
    title(['Histograph-' name]);
end


function [finalcheck,idlist_h]=detectheteroplasmy(intensity_data,check_index,filename,idlist_h,removelist,refseq)


% detectheteroplasmy - Detecting potential heteroplasmy sites using
%  intensity profiles of samples. It compares intensity profile of given
%  base position within sample itself and across all samples to identify
%  potential heteroplasmy loci.

% finalcheck=detectheteroplasmy(intensity_data,check_index,filename,idlist_h,removelist,refseq)
%
% Result:
%     finalcheck: stores the sample index and position which are identified
%      as heteroplasmy bases
%     idlist_h: store the result into structure format
%
% Parameters:
%     intensity_data: read_in data from function readintensity().
%     check_index: to reduce computational cost, we only consider loci
%      where there is a difference between GSEQ diploid model and haploid
%      model; this var stores all sample index and its positions as detected
%      difference between two GSEQ models
%     filename: files you want to store your heteroplasmy results
%     idlist_h: Sample information detected and parsed from previous steps.
%      such as SNPs, indels, etc; using post_clause _h is due to we mainly
%      use haploid model results from GSEQ
%     removelist: sample index you want to remove due to quality problems
%     refseq: the sanger reference sequences for output purpose
%
%   This file does not require the Statistics Toolbox.
%
% Created for Matlab R14
% version 0.9 beta (April, 2011)
% Mike Xie
% email: hongbo.michael.xie@gmail.com
%
% File history:
%   0.1 (March, 2010) - created
%   0.9 beta (April, 2011) - modified version




% store the column index of removed samples 8 column each sample, sense and
% antisense
removeblock=[];
for i=1:length(removelist)
    removeblock=[removeblock removelist(i)*8-7:removelist(i)*8];
end

% find the dimension of the intensity data
[a,b]=size(intensity_data);

%find the column index of good samples
index_good_data=setdiff(1:b,removeblock);

% retrive data from good samples to start
idg=intensity_data(:,index_good_data);

% sample index for good samples
index_good=setdiff(1:length(idlist_h),removelist);

% size of good sample
[c,d]=size(idg);

% we want to modify raw data as equation 1, fdata will store the modified
% data in order to detect heteroplasmy
fdata=[];

% we convert the data according to equation 1
for j=1:length(index_good)
    
    % for one sample, get its intensity readings for 8 columns
    index=j*8-7:j*8;
    data=idg(:,index);
    
    
    % I want to store the converted data
    difff=[];
    for i=1:a
        
        % sense log intensity
        ts1=log(data(i,1:4)/min(data(i,1:4)));
        %antisense
        tas1=log(data(i,5:8)/min(data(i,5:8)));
        
        % sum of intensity (product of intensity)
        t1=(ts1([4 3 2 1])+tas1)/2;
        [ta,tb]=sort(t1,'descend');
        
        % convert as suggested in paper
        difff=[difff; ta(1)+ta(4)-ta(2)*2];
    end
    fdata=[fdata difff];
end

% store my result of heteroplasmy detection
finalcheck=[];

fid=fopen(filename,'w');
base='acgt';
fprintf(fid,'%1s\t%1s\t%1s\t%1s\t%1s\t%1s\t%1s\t%1s\n','id','position','reference sequence','aboundant base','fraction %','minor base','fraction %','evidence');


for i=1:c
    
    s1=std(fdata(i,:));
    m1=mean(fdata(i,:));
    % you can also use other method to detect "outliers"
    index1=find(fdata(i,:)<m1-3*s1);
    
    resultt=[];
    if isempty(index1)==0
        for j=1:length(index1)
            
            % find the original intensity data
            temp_s1=idg(i,index1(j)*8-7:index1(j)*8-4);
            temp_s=temp_s1([4 3 2 1]);
            temp_a=idg(i,index1(j)*8-3:index1(j)*8);
            
            [a,b]=sort(temp_s,'descend');
            [a1,b1]=sort(temp_a,'descend');
            
            % fundamentally we try to make sure that the top two base calls
            % with highest intensities are the same;
            % the difference of intensity between the highest and second
            % highest intensity is smaller than second highest and third
            % highest for both sense and antisense;
            % the overall converted data at given position has to be less than 0
            
            if isempty(setdiff(b(1:2),b1(1:2))) & fdata(i,index1(j))<0
                
                % follow the rule in paper, we need to speed up by only
                % consider calls different from haploid model and diploid
                % model
                index_sample=find(check_index(:,1)==index_good(index1(j)));
                index_data=find(check_index(:,2)==i);
                overlap_index=intersect(index_sample,index_data);
                if isempty(overlap_index)==0
                    % I want to estimate heteroplasmy ratio
                    t1=temp_s.*temp_a;
                    [ta,tb]=sort(t1);
                    r=sqrt((ta(4)/ta(3)));
                    
                    if i<=3106-12 % the 3107 deletion!
                        pos=i+12;
                    else
                        pos=i+13;
                    end
                    
                    % now here is some tricky term, if you want to be more
                    % conservative, you will use evidence from both strand,
                    % or you will use evidence from single strand to give
                    % more hints
                    if 1==1
                        if ((a(1)-a(2))<(a(2)-a(3)) & (a1(1)-a1(2))< (a1(2)-a1(3)))
                            idlist_h(index_good(index1(j))).hetero =unique([idlist_h(index_good(index1(j))).hetero pos]);
                            finalcheck=[finalcheck;index_good(index1(j)) pos tb(4)];
                            fprintf(fid,'%1s\t%1i\t%1s\t%1s\t%6.4f\t%1s\t%6.4f\t%1s\n',idlist_h(index_good(index1(j))).id,pos,refseq(pos),base(tb(4)),r/(1+r)*100,base(tb(3)),1/(1+r)*100,'double strands evidence');
                        else
                            if ((a(1)-a(2))<(a(2)-a(3)) | (a1(1)-a1(2))< (a1(2)-a1(3)))
                                idlist_h(index_good(index1(j))).hetero =unique([idlist_h(index_good(index1(j))).hetero pos]);
                                finalcheck=[finalcheck;index_good(index1(j)) pos tb(4)];
                                fprintf(fid,'%1s\t%1i\t%1s\t%1s\t%6.4f\t%1s\t%6.4f\t%1s\n',idlist_h(index_good(index1(j))).id,pos,refseq(pos),base(tb(4)),r/(1+r)*100,base(tb(3)),1/(1+r)*100,'single strands evidence');
                            end
                        end
                    end
                end
            end
        end
    end
end
fclose(fid);

function snpcall=improvebasecall(bc,qc,intensity_data,idlist_h,finalcheck, dummie,refseq)

% improvebasecall - using the method proposed in the paper to convert none
% calls to base call

% snpcall=improvebasecall(bc,qc,intensity_data,n,idlist_h,finalcheck, dummie,refseq)
%
% Result:
%     snpcall: newly identified base calls
%
% Parameters:
%     bc: basecall from GSEQ
%     qc: quality score from GSEQ
%     intensity_data: intensity data from GSEQ
%     n: number of samples
%     idlist_h: id list
%     finalcheck: list of heteroplasmy site
%     dummie: annotation from original site
%     refseq: reference sequence

% Created for Matlab R14
% version 0.9 beta (April, 2011)
% Mike Xie
% email: hongbo.michael.xie@gmail.com
%
% File history:
%   0.1 (March, 2010) - created
%   0.9 beta (April, 2011) - modified version

n=length(idlist_h);
for i=1:n
    snpcall{i}=[];
end

pos_list=[13:3106 3108:16544+13]; % this is mitoposition for each of the mitoprobe
comnvarf=[60:349 738:763 1426:1450 4757:4781 8848:8872 15314:15338 16057:16374];

% open files for output
nfiles=length(idlist_h);
fidt=fopen('chplist.txt','w');
for i=1:nfiles
    fileid=[idlist_h(i).id '_update.CHP'];
    fid(i)=fopen(fileid,'w');
    fprintf(fid(i),'%1s\n%1s\t%1s\t%1s\t%1s\t%1s\t%1s\t%1s\t%1s\n','Resequence Analysis','index','Fragment','Frag Pos','Tiling Pos','Ref','Heterozygosity',[idlist_h(i).id '_Call'],[idlist_h(i).id '_Quality'] );
    fprintf(fidt,'%1s\n',fileid);
end
fclose(fidt);

fidsnp=fopen('snpcalls.txt','w');
nuct='acgt';
st=0;
thrsh=0.2352; %1.265 %% if exp(thrsh); 0.2352 0.3576
%list newly identified snps
fidcmp=fopen('newfindings.txt','w');

for j=1:length(pos_list)
    
    datah=[];
    qualh=[];
    stt=0;
    refcall=refseq(pos_list(j));
    
    for i=1:nfiles    
        calli=bc(j,i);
        calli_qual=qc(j,i);
        % revert any calls with quality score less than 12 to be no-call
        if strcmp(calli,refcall)==0 & calli_qual<12
            calli='n';
        end
        
        %%% see if it belongs to addtional probes by
        index_germ=intersect(comnvarf,pos_list(j));
        
        if isempty(index_germ)==0
            indexsnp=find(idlist_h(i).snp==pos_list(j));
            if isempty(indexsnp)
                calli=refcall;
            else
                calli=idlist_h(i).snpcall(indexsnp);
            end
        end
        
        %%% ok otherwise we have to use intensity calls to make it
        temp_s1=intensity_data(j,i*8-7:i*8-4)/min(intensity_data(j,i*8-7:i*8-4));
        temp_a=intensity_data(j,i*8-3:i*8)./min(intensity_data(j,i*8-3:i*8));
        temp_s=temp_s1([4 3 2 1]);
        
        
        temp_t=sqrt(temp_a.*temp_s);
        [a,b]=sort(temp_t,'descend');
        kn=log(a(1)/a(2));
        
        %find if it belongs to heteroplasmy sites
        index_germ=intersect(find(finalcheck(:,1)==i),find(finalcheck(:,2)==pos_list(j)));
        
        if isempty(index_germ)==0    
              calli=nuct(finalcheck(index_germ,3));
        end
        
        basepos=pos_list(j);
        
        if strcmp(calli,'n') & kn>=thrsh
            calli=lower(nuct(b(1)));
            if strcmp(refcall,calli)==0
                fprintf(fidcmp,'%1s\t%1i\t%1s\t%1s\t%6.4f\n',idlist_h(i).id,basepos,refcall,calli,calli_qual);
            end
        end
        
        fprintf(fid(i),'%1s\t%1s\t%6.4f\n',dummie{j},calli,calli_qual);
        
        if strcmp(refcall,calli)==0 & strcmp(calli,'n')==0
            fprintf(fidsnp,'%1s\t%1i\t%1s\t%1s\t%6.4f\n',idlist_h(i).id,basepos,refcall,calli,calli_qual);
            snpcall{i}=[snpcall{i} basepos];
        end
    end
end


fclose(fidsnp);
fclose(fidcmp);

for i=1:nfiles
    fclose(fid(i));
end

function output(qc)
% output - output results

% qualitycontrol(qc,idlist_h)

% Parameters:
%     qc: quality score from haploid model

% Created for Matlab R14
% version 0.9 beta (April, 2011)
% Mike Xie
% email: hongbo.michael.xie@gmail.com
%
% File history:
%   0.1 (March, 2010) - created
%   0.9 beta (April, 2011) - modified version

data_gseq=qc(1:16544,:);
pct_gseq=[];
[c,d]=size(data_gseq);
for i=1:d
    pct_gseq=[pct_gseq length(find(data_gseq(:,i)>=3))/16544*100];
end



fid=fopen('chplist.txt','r');
[gene,st,ed]=textread('genelist1.txt','%s %u %u');
finalcall=[];

totalmiss=[];
stt=0;
id=[];
while feof(fid)==0
    line=fgetl(fid);
    stt=stt+1;
    id{stt}=line;
    [pos,ref,cal,qual]=textread(line,'%*u %*s %u %*u %s %*s %s %s','delimiter',char(9),'headerlines',2);
    index=find(pos>3106);
    pos(index)=pos(index)+1;
    cal_temp=[];
    miss=zeros(length(gene),1);
    for i=1:length(cal)
        cal_temp=[cal_temp;cal{i}];
    end
    totalmiss=[totalmiss;100-length(find(cal_temp=='n'))/length(cal_temp)*100];
    for i=1:length(gene)
        index=find(pos>=st(i)-12 & pos<=ed(i)-12);
        for j=1:length(index)
            if strcmp(cal{index(j)},'n')
                miss(i)=miss(i)+1;
            end
        end
        miss(i)=100-miss(i)/length(index)*100;
    end
    finalcall=[finalcall miss];
end
fclose(fid);

fid=fopen('summaryresult.txt','w');
fprintf(fid,'%1s\t','gene');
for i=1:length(id)-1
    fprintf(fid,'%1s\t',id{i});
end
fprintf(fid,'%1s\n',id{length(id)});

fprintf(fid,'%1s\t','raw_totalpct');
for i=1:length(id)-1
    fprintf(fid,'%6.4f\t',pct_gseq(i));
end
fprintf(fid,'%6.4f\n',pct_gseq(length(id)));

fprintf(fid,'%1s\t','totalpct');
for i=1:length(id)-1
    fprintf(fid,'%6.4f\t',totalmiss(i));
end
fprintf(fid,'%6.4f\n',totalmiss(length(id)));

for i=1:length(gene)
    fprintf(fid,'%1s\t',gene{i});
    for j=1:length(id)-1
        fprintf(fid,'%6.4f\t',finalcall(i,j));
    end
    fprintf(fid,'%6.4f\n',finalcall(i,j));
end
fclose(fid);

function [header,seq]=readfasta(txtfile)

fid=fopen(txtfile,'rt');
header=fgetl(fid);
seq=[];
while feof(fid)==0
    seq=[seq fgetl(fid)];
end
%strrep(seq,'//','');
fclose(fid);




function [bc,qc,tilepos, idlist,qci,dummie]=readbasecall(filename)
% readbasecall - read snp calls from GSEQ output

% [bc,qc,tilepos, idlist,qci,dummie]=readbasecall(filename)
%
% Result:
%     bc: basce call from GSEQ
%     qc: quality scores
%     qci: index column for quality score readings
%     idlist: store the id list of samples, this will also be the main
%     var to store results
%     tilepos: Tiling Position
%     dummie: annotation for future output. ( I am lazy here)
%
% Parameters:
%     filename: input file name

% Created for Matlab R14
% version 0.9 beta (April, 2011)
% Mike Xie
% email: hongbo.michael.xie@gmail.com
%
% File history:
%   0.1 (March, 2010) - created
%   0.9 beta (April, 2011) - modified version

fid=fopen(filename,'r');

% get rid of the first line of file (junk)
line=fgetl(fid);

%get the second line of data to find out where to look
line=fgetl(fid);
index=[findstr(line,char(9)) length(line)+1];

%base call index for each sample
bci=[];

%call quality column index for each sample
qci=[];

st=0;
for i=1:length(index)-1
    % identify position of base call column
    if findstr(line(index(i)+1:index(i+1)-1),'_Call')
        bci=[bci i];
        st=st+1;
        idlist(st).id=line(index(i)+1:index(i+1)-6);
        
    end
    % identify position of call quality column
    if findstr(line(index(i)+1:index(i+1)-1),'_Quality')
        qci=[qci i];
    end
    
    % identify tiling position in order to find out where to check
    if findstr(line(index(i)+1:index(i+1)-1),'Tiling Pos')
        tileposi=i;
    end
    
end

% all base cals
bc=[];

% call quality score
qc=zeros(36942,length(idlist));

tilepos=[];
st=0;
dummie_index=min(bci);
while feof(fid)==0
    % start to process data
    line=fgetl(fid);
    
    if isempty(findstr(line,'AFFX')) % skip affy control probe
        st=st+1;
        index=[findstr(line,char(9)) length(line)+1];
        dummie{st}=line(1:index(dummie_index)-1);
        % set some dummie thing to take the readings
        bctemp='';
        qctemp=[];
        
        for i=1:length(bci)
            % accumulate quality score and base call
            bctemp=[bctemp line(index(bci(i))+1:index(bci(i)+1)-1)];
            qctemp=[qctemp str2num(line(index(qci(i))+1:index(qci(i)+1)-1))];
        end
        bc=[bc;bctemp];
        qc(st,:)=qctemp;
        tilepos=[tilepos;str2num(line(index(tileposi)+1:index(tileposi+1)-1))];
    end
end

fclose(fid);