function [f0, tt] = track_f0(wave, ws, voicingThresh, silenceThresh) % Function [f0, tt] = track_f0(wave, ws, voicingThresh, silenceThresh) use % adaptive least squares (ALS) algorithm to find the fundamental frequency of wave at % different times. % % Input % wave: the sound wave data (assuming a sampling frequency of 44100Hz) % ws: analysis window size (in seconds). A good choice is 0.04s % voicingThresh: cost must be below this threshold to be considered as % a good sinusoidal fit. For Edinburgh data, it was chosen at % 0.0825 % silenceThresh: energy of the signal must be above this threshold to % be considered as voiced region. For Edinbugh data, it was % chosen at 1e2. The silenceThresh is more or less determined by % the energy level, therefore should be scaled accordingly to the % data being examined. % % % Returns: % f0: the estimated fundamental frequency % tt: the time stamp (in seconds) when the fundamental frequency is % estimated % % ALS algorithm: % http://www.cis.upenn.edu/~lsaul/papers/voice_nips02.pdf % % written by {feisha, burgoyne}@cis.upenn.edu, adapted from L. Saul's code. % % SAMPLING FREQUENCY AND DECIMATION RATE (USED TO SPEED UP PROCESSING) fs = 44100; decrate = 12; % fs = 22050; decrate = 6; % Alternative params settings sr = fs / decrate; % new sampling rate after decimation %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% % FILTERS persistent bf af fMin fMax nBand bl; % CLEAR ALL IF YOUR SAMPLING RATE IS % CHANGED EXTERNALLY if (isempty(bf)) if rem(fs, decrate) ~=0 error('The sampling rate is NOT the multiple of the decimation rate!'); end % fMin = [50 71 100 141 200 283 400 533]; fMax = [75 107 150 212 300 425 600 800]; % nBand = length(fMin); order = 4; ripple = 0.5; bf = zeros(nBand,2*order+1); af = zeros(nBand,2*order+1); for band=1:nBand passBand = [fMin(band)/2 fMax(band)]/(sr/2); [bf(band,:),af(band,:)] = cheby1(order,ripple,passBand); end; bl = fir1(50, 1000/(fs/2)); % LOW PASS FOR DECIMATION end; %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% % TRACK wave = reshape(wave,1,length(wave)); envA = filtfilt(bl, 1, wave); envA = envA(1:decrate:end); envB = max(0, envA); nFrame = length(envB); freqs = zeros(nBand,nFrame); costs = zeros(nBand,nFrame); for band=1:nBand sineWave = filtfilt(bf(band,:),af(band,:),envB); [freqs(band,:),costs(band,:)] = ... TrackBand(sineWave,sr, ws,fMin(band)); end; [cost,indx] = min(costs,[],1); % PITCH pitch = zeros(size(cost)); for band=1:nBand pitch(indx==band) = freqs(band,indx==band); end; % VOLUME volume = filter(ones(1,decrate)/decrate,1,wave.*wave); volume = volume(1:decrate:end); % VOICED/UNVOICED DETERMINATION voiced = find(costsilenceThresh); f0 = zeros(size(pitch)); f0(voiced) = pitch(voiced); % ALS fitting uses preceding wave data to compose an analysis window % therefore, we shift half of the window and interpolate tt = [0:nFrame-1]/sr- ws/2; uu = [0:nFrame-1]/sr; f0 = interp1(tt,f0,uu,'nearest'); tt = uu; return; %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% % Fit wave data to a sinusoid, returning estimated frequencies and % heuristic fitting costs function [f0,cost] = TrackBand(xx,samplingRate, windowSize,minF0) % SINUSOIDAL FIT nn = ceil(windowSize*samplingRate); % ANALYSIS WINDOW SIZE mm = [2*xx(1) xx(1:end-2)+xx(3:end) 2*xx(end)]; % XX(n-1) + XX(n+1) xm = cumsum(xx.*mm); m2 = cumsum(mm.*mm); x2 = cumsum(xx.*xx); xm = xm-[zeros(1,nn) xm(1:end-nn)]; % NUMERATOR OF EQ.(4) m2 = m2-[zeros(1,nn) m2(1:end-nn)]; % DENOMINATOR OF EQ.(4) x2 = x2-[zeros(1,nn) x2(1:end-nn)]; % XX(n) squared in the window aa = xm./(m2+realmin); aa(m2==0) = 0.5; % BELOW MINIMUM FREQUENCY? minP = 2*pi*minF0/samplingRate; aa(abs(aa)<0.5/cos(minP)) = 0.5; % PITCH f0 = acos(0.5./aa) * samplingRate / (2*pi); f0 = min(f0,samplingRate-f0); pp = 2*pi*f0./samplingRate; sp = sin(pp); cp = cos(pp); % COST cost = sqrt(abs(x2+aa.*aa.*m2-2*aa.*xm)./abs(m2+realmin)); cost = cost.*(cp.*cp*samplingRate)./(pi*sp+realmin); cost = cost./(f0+realmin); cost(f0==0) = inf; cost(m2==0) = inf; cost(x2==0) = inf; return;