% % ELEN E4810 - Digital Signal Processing % % Course Project - Removing vocals from commercial tracks % % Instructor: Prof. Dan Ellis (dpwe@columbia.edu) % % Authors: Jaime Peretzman (jp2642), Shrivathsa Bhargav (sb2784) % % Columbia University, Fall 2007 display(' '); display(' '); display('----------------------------------------------------------'); display(' Removing vocals from commercial tracks'); display('----------------------------------------------------------'); display(' '); display('Welcome to the ELEN E4810 - DSP Project'); display('By Jaime Peretzman and Shrivathsa Bhargav'); display('Columbia University, Fall 2007'); display(' '); display('Reading in wave file'); %Clip from The Beatles - Let It Be [sam,sr]=wavread('let_it_be.wav'); display('>>PRESS ENTER to hear original clip'); pause; soundsc(sam,sr); %Original Stereo Clip pause(10); info_channels = size(sam); display('Splitting stereo channels'); %Split the channels n = 1:info_channels(1,1); sam_l(n) = sam(n,1); %Left Channel sam_r(n) = sam(n,2); %Right Channel display('>>PRESS ENTER to plot the two channels'); pause; figure(1); subplot(2,1,1); plot(sam_l); title('Left Channel'); xlabel ('Number of Samples'); ylabel ('Normalized Amplitude'); axis ([0 500000 -1 1]) subplot(2,1,2); plot(sam_r); title('Right Channel'); xlabel ('Number of Samples'); ylabel ('Normalized Amplitude'); axis ([0 500000 -1 1]) display(' '); display('Bandstop filtering'); %BANDSTOP FILTERING %Find a good filter window wh = transpose(hanning(length(sam_l))); %Cutoff frequency, bandstop left f_cl = 300; w_cl = f_cl/(sr*0.5); %Cutoff frequency, bandstop right f_cr = 2500; w_cr = f_cr/(sr*0.5); display('>>PRESS ENTER to see the bandstop filter'); pause; display('Plotting bandstop filter characteristics'); %Create the bandstop filter of order 1000 (!) b_stop = fir1(1000, [w_cl, w_cr], 'stop'); freqz(b_stop); title('Filter characteristics'); display('>>PRESS ENTER to see and hear the bandstop-filtered sample'); pause; display('Plotting filtered sample'); filtered1 = conv(sam_l,b_stop); soundsc(filtered1,sr); freqz(filtered1); pause(10); display(' '); display('Stereo cancellation'); sam_no_vox = sam_l - sam_r; sam_l_fft = fft(sam_l); sam_r_fft = fft(sam_r); sam_no_vox_fft = sam_l_fft - sam_r_fft; sam_no_vox = ifft(sam_no_vox_fft); display('>>PRESS ENTER to see and hear the stereo-cancelled sample'); pause; display('Plotting stereo cancelled sample'); soundsc(sam_no_vox,sr); subplot(2,1,1); plot(n./sr,sam_l); title('Original left channel'); ylabel('Amplitude') xlabel('Time') subplot(2,1,2); plot(n./sr,sam_no_vox); title('Stereo cancelled left channel'); ylabel('Amplitude') xlabel('Time') pause(10); display(' '); display('Audio Blind Source Separation'); % fft_points is the number of samples per frame. This has to be large in % order to create enough points to mask in our spectogram fft_points = 8192; %This is the size of the window, in our case we use a hamming window, but %it would yield better results a blackman window window_size = 8192; %This is the offset between frames. This has to be small enough to get %more points in our spectogram but not cause noise in our final result. frame_size = fft_points./4; display('Performing Short-Time Fourier Transform (STFT)'); %%%STFT OF THE LEFT CHANNEL%%% sam_l_stft = stft(sam_l, fft_points, window_size, frame_size); %%%STFT OF THE RIGHT CHANNEL%%% sam_r_stft = stft(sam_r, fft_points, window_size, frame_size); %%%size(sam_l_stft)=(rows=4097 columns=236) In our diagram %Comparison between left and right channel through %DFT(left channel)/DFT(right channel) div_dft = sam_l_stft./sam_r_stft; mod_div_dft = (abs(div_dft)); %The modulus is gotten in order to get rid of %complex numbers and just have magnitudes to compare. display('>>PRESS ENTER to see spectrogram before masking vocals'); pause; display('Plotting spectrogram before masking vocals, and') %Spectogram before masking the vocals % figure(2) figure(1); subplot(2,1,1); imagesc(20*log10(abs(sam_l_stft))); %plots the spectogram title('Left Channel Spectogram - Before Masking'); ylabel('Frequency (Hz)') xlabel('Number of Frames') axis xy; caxis([-60 0]); colorbar; % provides decode of colors subplot(2,1,2); imagesc(20*log10(abs(sam_r_stft))); title('Right Channel Spectogram - Before Masking'); ylabel('Frequency (Hz)') xlabel('Number of Frames') axis xy; caxis([-60 0]); colorbar; display('Identifying masking coefficients, this could take a minute...'); A=0; a=0; b=a+0.005; i=1; %This while loop finds the number of DFT(left)/DFT(right) coefficients %inbetween fixed ranges. (e.g. between 0.995 and 1.000) while (b<2) A(i) = mean(sum(mod_div_dft>a & mod_div_dft>PRESS ENTER to see frequency of DFT ratios'); pause; %In this graph, it can be seen that the peak happens around a ratio of 1. %Extracting the coefficients around 1 will let us extract any mono track %this peak represents. % figure(3) a=0:0.005:1.995; figure(2); plot(a,A) title('Frequency of DFT ratios in the spectogram'); xlabel('Ratio between DFT in Left Channel and DFT in Right Channel') ylabel('Frequency of Coefficients in the whole array') display('Performing Binary Time Frequency Mask'); %This is a binary Time Frequency Mask (BTFM). This loop will search %throughout the whole spectogram for DFT(left)/DFT(right) inbetween 0.65 %and 1.35. In case it finds a value, it will be set the DFT coefficients of % both left and right channel to zero, otherwise the coefficients are not % changed at all for k=1:236; for m=1:4097; if (mod_div_dft(m,k)>0.65 & mod_div_dft(m,k)<1.35); sam_l_stft (m,k)=0; sam_r_stft (m,k)=0; m=m+1; else m=m+1; end end k=k+1; end display('>>PRESS ENTER to see spectrogram after masking vocals'); pause; %Spectogram with mask applied % figure(4) figure(2); subplot(2,1,1); imagesc(20*log10(abs(sam_l_stft))); %plots the spectogram title('Left Channel Spectogram - After Masking'); ylabel('Frequency (Hz)') xlabel('Number of Frames') axis xy; caxis([-60 0]); colorbar; % provides decode of colors subplot(2,1,2); imagesc(20*log10(abs(sam_r_stft))); title('Right Channel Spectogram - After Masking'); ylabel('Frequency (Hz)') xlabel('Number of Frames') axis xy; caxis([-60 0]); colorbar; display('Performing inverse-STFT'); %Inverse STFT sam_l_istft = istft(sam_l_stft, fft_points, window_size, frame_size); sam_r_istft = istft(sam_r_stft, fft_points, window_size, frame_size); display('>>PRESS ENTER to see original channels vs recovered channels'); pause; %Original Channels vs Recovered Channels % figure(5); figure(1); subplot(2,1,1); plot(sam_l); title('Left Channel'); xlabel ('Number of Samples'); ylabel ('Normalized Amplitude'); axis ([0 500000 -1 1]) subplot(2,1,2); plot(sam_l_istft); title('Recovered Left Channel'); xlabel ('number of Samples'); ylabel ('Normalized Amplitude'); axis ([0 500000 -1 1]); figure(2); subplot(2,1,1); plot(sam_r); title('Right Channel'); xlabel ('Number of Samples'); ylabel ('Normalized Amplitude'); axis ([0 500000 -1 1]) subplot(2,1,2); plot(sam_r_istft); title('Recovered Right Channel'); xlabel ('number of Samples'); ylabel ('Normalized Amplitude'); axis ([0 500000 -1 1]) display('>>PRESS ENTER to hear recovered sample'); pause; %display('Playing original clip'); %soundsc(sam,sr); display('Playing recovered sample'); sam_recovered=[sam_l_istft;sam_r_istft]; soundsc(sam_recovered.',sr); display(' '); display('Done! Thanks for listening. Goodbye!'); display(' ');