Revise code for calculation of MFCC's

Issue #492
QutEcoacoustics · May 27, 2021 · d53bf26 · d53bf26
1 parent fcb0a12
commit d53bf26
Show file tree

Hide file tree

Showing 3 changed files with 134 additions and 119 deletions.
diff --git a/src/AnalysisPrograms/SpectrogramGenerator/SpectrogramGenerator.Core.cs b/src/AnalysisPrograms/SpectrogramGenerator/SpectrogramGenerator.Core.cs
@@ -76,8 +76,12 @@ public static AudioToSonogramResult GenerateSpectrogramImages(
             // Default noiseReductionType = Standard
             var bgNoiseThreshold = config.BgNoiseThreshold;
 
+            // set pre-emphasis to the default value false.
+            bool doPreemphasis = false;
+
             // EXTRACT ENVELOPE and SPECTROGRAM FROM RECORDING SEGMENT
-            var dspOutput1 = DSP_Frames.ExtractEnvelopeAndFfts(recordingSegment, frameSize, frameStep);
+            // The output from this call to ExtractEnvelopeAndFfts is used only for standard spectrograms.
+            var dspOutput1 = DSP_Frames.ExtractEnvelopeAndFfts(recordingSegment, doPreemphasis, frameSize, frameStep);
 
             // This constructor initializes default values for Melscale and Mfcc spectrograms and other parameters.
             var sonoConfig = new SonogramConfig()
@@ -211,9 +215,13 @@ public static AudioToSonogramResult GenerateSpectrogramImages(
             // IMAGE 7) Cepstral Spectrogram
             if (@do.Contains(CepstralSpectrogram))
             {
-                // TODO at present noise reduction type must be set = Standard.
-                // ... but use the NoiseReductionParameter that is set in the config file.
-                //sonoConfig.NoiseReductionParameter = 0.0;
+                // The cepstrogram requires additional config settings. Cannot use previous spectrograms.
+                // Set up the config file.
+                // Use some defaults and get other parameters from config file.
+                sonoConfig.DoPreemphasis = config.DoPreemphasis;
+
+                // TODO CHECK IF THERE IS A NEED FOR NOISE REDUCTION
+                sonoConfig.NoiseReductionParameter = 0.0;
                 sonoConfig.NoiseReductionType = NoiseReductionType.Standard;
 
                 sonoConfig.mfccConfig.DoMelScale = true;
@@ -223,9 +231,7 @@ public static AudioToSonogramResult GenerateSpectrogramImages(
                 sonoConfig.mfccConfig.CcCount = 12;
                 sonoConfig.mfccConfig.IncludeDelta = config.IncludeDelta;
                 sonoConfig.mfccConfig.IncludeDoubleDelta = config.IncludeDoubleDelta;
-                images.Add(
-                    CepstralSpectrogram,
-                    GetCepstrogram(sonoConfig, recordingSegment, sourceRecordingName));
+                images.Add(CepstralSpectrogram, GetCepstrogram(sonoConfig, recordingSegment, sourceRecordingName));
             }
 
             // IMAGE 8) Octave-frequency scale Spectrogram
@@ -445,20 +451,26 @@ public static Image<Rgb24> GetMelScaleSpectrogram(
             return image;
         }
 
+        /// <summary>
+        /// Returns a cepstrogram image.
+        /// </summary>
         public static Image<Rgb24> GetCepstrogram(
-            SonogramConfig sonoConfig,
+            SonogramConfig config,
             AudioRecording recording,
             string sourceRecordingName)
         {
-            var cepstrogram = new SpectrogramCepstral(sonoConfig, recording.WavReader);
+            // Get the cepstrogram
+            var cepstrogram = new SpectrogramCepstral(config, recording.WavReader);
+
+            // Now prepare it as an image.
             var image = cepstrogram.GetImage();
             var titleBar = BaseSonogram.DrawTitleBarOfGrayScaleSpectrogram(
                     "CEPSTROGRAM " + sourceRecordingName,
                     image.Width,
                     ImageTags[CepstralSpectrogram]);
             var startTime = TimeSpan.Zero;
             var xAxisTicInterval = TimeSpan.FromSeconds(1);
-            TimeSpan xAxisPixelDuration = TimeSpan.FromSeconds(sonoConfig.WindowStep / (double)sonoConfig.SampleRate);
+            TimeSpan xAxisPixelDuration = TimeSpan.FromSeconds(config.WindowStep / (double)config.SampleRate);
             var labelInterval = TimeSpan.FromSeconds(5);
             image = BaseSonogram.FrameSonogram(image, titleBar, startTime, xAxisTicInterval, xAxisPixelDuration, labelInterval);
             return image;

diff --git a/src/AudioAnalysisTools/DSP/MFCCStuff.cs b/src/AudioAnalysisTools/DSP/MFCCStuff.cs
@@ -10,7 +10,7 @@ namespace AudioAnalysisTools.DSP
     public class MFCCStuff
     {
         /// <summary>
-        /// Converts spectral amplitudes directly to dB, normalising for window power and sample rate.
+        /// Converts amplitude spectra (in a spectrogram) directly to dB spectra, normalising for window power and sample rate.
         /// NOTE 1: The window contributes power to the signal which must subsequently be removed from the spectral power.
         /// NOTE 2: Spectral power must be normalised for sample rate. Effectively calculate freq power per sample.
         /// NOTE 3: The power in all freq bins except f=0 must be doubled because the power spectrum is an even function about f=0;
@@ -81,6 +81,80 @@ public class MFCCStuff
             return spectra;
         }
 
+        /// <summary>
+        /// This method is similar to the above DecibelSpectra() method,
+        /// except that the passed spectrogram matrix contains energy values, i.e. squared amplitude values.
+        /// This method is used when calculating mfcc's. The passed energy spectrogram is output from the mel-frequency filter bank,
+        /// and the energy values are converted directly to log-energy, normalising for window power and sample rate.
+        /// Note that the output is log-energy (log base 10), not decibels: decibels = 10 * log-energy.
+        /// NOTE 1: The window contributes power to the signal which must subsequently be removed from the spectral power.
+        /// NOTE 2: Spectral power must be normalised for sample rate. Effectively calculate freq power per sample.
+        /// NOTE 3: The power in all freq bins except f=0 must be doubled because the power spectrum is an even function about f=0;
+        ///         This is due to the fact that the spectrum actually consists of 512 + 1 values, the centre value being for f=0.
+        /// NOTE 5: THIS METHOD ASSUMES THAT THE LAST BIN IS THE NYQUIST FREQ BIN
+        /// NOTE 6: THIS METHOD ASSUMES THAT THE FIRST BIN IS THE MEAN or DC FREQ BIN.
+        /// </summary>
+        /// <param name="energyM">the energy spectrogram (squared amplitude values); rows are frames, columns are frequency bins.</param>
+        /// <param name="windowPower">value for window power normalisation.</param>
+        /// <param name="sampleRate">the sample rate, by which every energy value is divided.</param>
+        /// <param name="epsilon">floor threshold: energies below epsilon are replaced by the log of the normalised epsilon, which avoids log of zero.</param>
+        /// <returns>a spectrogram of log10 energy values (not decibels), with the same dimensions as energyM.</returns>
+        public static double[,] GetLogOfEnergySpectrogram(double[,] energyM, double windowPower, int sampleRate, double epsilon)
+        {
+            int frameCount = energyM.GetLength(0);
+            int binCount = energyM.GetLength(1);
+            double minLogEnergy = Math.Log10(epsilon / windowPower / sampleRate);
+            double minLogEnergy2 = Math.Log10(epsilon * 2 / windowPower / sampleRate);
+
+            double[,] spectra = new double[frameCount, binCount];
+
+            // DC bin (first column of matrix): log10 of the normalised energy, NOT doubled; values below epsilon take the floor minLogEnergy.
+            for (int i = 0; i < frameCount; i++)
+            {
+                if (energyM[i, 0] < epsilon)
+                {
+                    spectra[i, 0] = minLogEnergy;
+                }
+                else
+                {
+                    spectra[i, 0] = Math.Log10(energyM[i, 0] / windowPower / sampleRate);
+                }
+            }
+
+            // All bins between the first and the last: multiply energy by 2 to accommodate two spectral components, i.e. positive and negative freq.
+            for (int j = 1; j < binCount - 1; j++)
+            {
+                // foreach time step or frame
+                for (int i = 0; i < frameCount; i++)
+                {
+                    if (energyM[i, j] < epsilon)
+                    {
+                        spectra[i, j] = minLogEnergy2;
+                    }
+                    else
+                    {
+                        spectra[i, j] = Math.Log10(energyM[i, j] * 2 / windowPower / sampleRate);
+                    }
+                }
+            } //end of all freq bins
+
+            // Nyquist freq bin (last column of matrix): handled like the DC bin, i.e. NOT doubled.
+            for (int i = 0; i < frameCount; i++)
+            {
+                // values below epsilon take the same (undoubled) floor as the DC bin
+                if (energyM[i, binCount - 1] < epsilon)
+                {
+                    spectra[i, binCount - 1] = minLogEnergy;
+                }
+                else
+                {
+                    spectra[i, binCount - 1] = Math.Log10(energyM[i, binCount - 1] / windowPower / sampleRate);
+                }
+            }
+
+            return spectra;
+        }
+
         public static int[] VocalizationDetection(double[] decibels, double lowerDbThreshold, double upperDbThreshold, int k1k2delay, int syllableGap, int minPulse, int[] zeroCrossings)
         {
             int length = decibels.Length;
@@ -421,7 +495,7 @@ public static double InverseHerzTranform(double m, double c, double div)
         /// Uses Greg's MelIntegral
         /// The first step is to calculate the number of filters for the required frequency sub-band.
         /// </summary>
-        /// <param name="matrix">the sonogram.</param>
+        /// <param name="matrix">the spectrogram.</param>
         /// <param name="filterBankCount">number of filters over full freq range 0 Hz - Nyquist.</param>
         /// <param name="nyquist">max frequency in original spectra.</param>
         /// <param name="minFreq">min freq in the passed sonogram matrix.</param>
@@ -675,10 +749,15 @@ public static double[] DCT(double[] spectrum, double[,] cosines)
         //*********************************************** GET ACOUSTIC VECTORS
 
         /// <summary>
-        /// This method assumes that the supplied mfcc matrix DOES NOT contain dB values in column one.
-        /// These are added in from the supplied dB array.
+        /// This method assumes that the supplied mfcc matrix DOES NOT contain frame dB (log energy) values in column zero.
+        /// These are added in from the supplied array of frame log-energies.
         /// </summary>
-        public static double[,] AcousticVectors(double[,] mfcc, double[] dBNormed, bool includeDelta, bool includeDoubleDelta)
+        /// <param name="mfcc">A matrix of mfcc coefficients. Column zero is empty.</param>
+        /// <param name="frameDbNormed">log-energy values for the frames.</param>
+        /// <param name="includeDelta">Whether or not to add delta features.</param>
+        /// <param name="includeDoubleDelta">Whether or not to add double delta features.</param>
+        /// <returns>A matrix of complete mfcc values with additional deltas, frame energies etc.</returns>
+        public static double[,] AcousticVectors(double[,] mfcc, double[] frameDbNormed, bool includeDelta, bool includeDoubleDelta)
         {
             //both the matrix of mfcc's and the array of decibels have been normed in 0-1.
             int frameCount = mfcc.GetLength(0); //number of time frames
@@ -701,7 +780,7 @@ public static double[] DCT(double[] spectrum, double[,] cosines)
             // loop through the time frames and create feature vector for each frame.
             for (int t = 0; t < frameCount; t++)
             {
-                double[] fv = GetMfccFeatureVector(dBNormed, mfcc, t, includeDelta, includeDoubleDelta); //get feature vector for frame (t)
+                double[] fv = GetMfccFeatureVector(frameDbNormed, mfcc, t, includeDelta, includeDoubleDelta); //get feature vector for frame (t)
 
                 //transfer feature vector to the matrix of acoustic features.
                 for (int i = 0; i < dim; i++)