Skip to content

Commit

Permalink
Revise code for calculation of MFCC's
Browse files Browse the repository at this point in the history
Issue #492
  • Loading branch information
towsey committed May 27, 2021
1 parent fcb0a12 commit d53bf26
Show file tree
Hide file tree
Showing 3 changed files with 134 additions and 119 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -76,8 +76,12 @@ public static AudioToSonogramResult GenerateSpectrogramImages(
// Default noiseReductionType = Standard
var bgNoiseThreshold = config.BgNoiseThreshold;

// set pre-emphasis to the default value false.
bool doPreemphasis = false;

// EXTRACT ENVELOPE and SPECTROGRAM FROM RECORDING SEGMENT
var dspOutput1 = DSP_Frames.ExtractEnvelopeAndFfts(recordingSegment, frameSize, frameStep);
// The output from this call to ExtractEnvelopeAndFfts is used only for standard spectrograms.
var dspOutput1 = DSP_Frames.ExtractEnvelopeAndFfts(recordingSegment, doPreemphasis, frameSize, frameStep);

// This constructor initializes default values for Melscale and Mfcc spectrograms and other parameters.
var sonoConfig = new SonogramConfig()
Expand Down Expand Up @@ -211,9 +215,13 @@ public static AudioToSonogramResult GenerateSpectrogramImages(
// IMAGE 7) Cepstral Spectrogram
if (@do.Contains(CepstralSpectrogram))
{
// TODO at present noise reduction type must be set = Standard.
// ... but use the NoiseReductionParameter that is set in the config file.
//sonoConfig.NoiseReductionParameter = 0.0;
// The cepstrogram requires additional config settings. Cannot use previous spectrograms.
// Set up the config file.
// Use some defaults and get other parameters from config file.
sonoConfig.DoPreemphasis = config.DoPreemphasis;

// TODO CHECK IF THERE IS A NEED FOR NOISE REDUCTION
sonoConfig.NoiseReductionParameter = 0.0;
sonoConfig.NoiseReductionType = NoiseReductionType.Standard;

sonoConfig.mfccConfig.DoMelScale = true;
Expand All @@ -223,9 +231,7 @@ public static AudioToSonogramResult GenerateSpectrogramImages(
sonoConfig.mfccConfig.CcCount = 12;
sonoConfig.mfccConfig.IncludeDelta = config.IncludeDelta;
sonoConfig.mfccConfig.IncludeDoubleDelta = config.IncludeDoubleDelta;
images.Add(
CepstralSpectrogram,
GetCepstrogram(sonoConfig, recordingSegment, sourceRecordingName));
images.Add(CepstralSpectrogram, GetCepstrogram(sonoConfig, recordingSegment, sourceRecordingName));
}

// IMAGE 8) Octave-frequency scale Spectrogram
Expand Down Expand Up @@ -445,20 +451,26 @@ public static Image<Rgb24> GetMelScaleSpectrogram(
return image;
}

/// <summary>
/// Returns a cepstrogram image.
/// </summary>
public static Image<Rgb24> GetCepstrogram(
SonogramConfig sonoConfig,
SonogramConfig config,
AudioRecording recording,
string sourceRecordingName)
{
var cepstrogram = new SpectrogramCepstral(sonoConfig, recording.WavReader);
// Get the cepstrogram
var cepstrogram = new SpectrogramCepstral(config, recording.WavReader);

// Now prepare it as an image.
var image = cepstrogram.GetImage();
var titleBar = BaseSonogram.DrawTitleBarOfGrayScaleSpectrogram(
"CEPSTROGRAM " + sourceRecordingName,
image.Width,
ImageTags[CepstralSpectrogram]);
var startTime = TimeSpan.Zero;
var xAxisTicInterval = TimeSpan.FromSeconds(1);
TimeSpan xAxisPixelDuration = TimeSpan.FromSeconds(sonoConfig.WindowStep / (double)sonoConfig.SampleRate);
TimeSpan xAxisPixelDuration = TimeSpan.FromSeconds(config.WindowStep / (double)config.SampleRate);
var labelInterval = TimeSpan.FromSeconds(5);
image = BaseSonogram.FrameSonogram(image, titleBar, startTime, xAxisTicInterval, xAxisPixelDuration, labelInterval);
return image;
Expand Down
91 changes: 85 additions & 6 deletions src/AudioAnalysisTools/DSP/MFCCStuff.cs
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ namespace AudioAnalysisTools.DSP
public class MFCCStuff
{
/// <summary>
/// Converts spectral amplitudes directly to dB, normalising for window power and sample rate.
/// Converts amplitude spectra (in a spectrogram) directly to dB spectra, normalising for window power and sample rate.
/// NOTE 1: The window contributes power to the signal which must subsequently be removed from the spectral power.
/// NOTE 2: Spectral power must be normalised for sample rate. Effectively calculate freq power per sample.
/// NOTE 3: The power in all freq bins except f=0 must be doubled because the power spectrum is an even function about f=0;
Expand Down Expand Up @@ -81,6 +81,80 @@ public class MFCCStuff
return spectra;
}

/// <summary>
/// Converts an energy spectrogram (i.e. squared amplitude values) directly to a log-energy spectrogram,
/// normalising every value for window power and sample rate.
/// This method is similar to the DecibelSpectra() method, except that the passed matrix contains energy values rather than amplitudes.
/// It is used when calculating mfcc's: the passed energy spectrogram is output from the mel-frequency filter bank.
/// Note that the output is log-energy (log10), not decibels: decibels = 10 * log-energy.
/// Any energy value less than epsilon is replaced by the log of epsilon, normalised (and doubled) in the same way as a real value in that bin.
/// NOTE 1: The window contributes power to the signal which must subsequently be removed from the spectral power.
/// NOTE 2: Spectral power must be normalised for sample rate. Effectively calculate freq power per sample.
/// NOTE 3: The power in all freq bins except the first and last must be doubled because the power spectrum is an even function about f=0;
///         This is due to the fact that the spectrum actually consists of 512 + 1 values, the centre value being for f=0.
/// NOTE 4: THIS METHOD ASSUMES THAT THE LAST BIN IS THE NYQUIST FREQ BIN.
/// NOTE 5: THIS METHOD ASSUMES THAT THE FIRST BIN IS THE MEAN or DC FREQ BIN.
/// </summary>
/// <param name="energyM">the energy spectra; rows are frames (time steps), columns are frequency bins.</param>
/// <param name="windowPower">value for window power normalisation.</param>
/// <param name="sampleRate">the sampling rate, used to normalise the energy values.</param>
/// <param name="epsilon">small value to avoid log of zero; energies below it are floored.</param>
/// <returns>a spectrogram of log-energy (log10) values having the same dimensions as the passed matrix.</returns>
public static double[,] GetLogOfEnergySpectrogram(double[,] energyM, double windowPower, int sampleRate, double epsilon)
{
    int frameCount = energyM.GetLength(0);
    int binCount = energyM.GetLength(1);
    int lastBin = binCount - 1;

    // Floor values used whenever an energy value is below epsilon.
    // The first (DC) and last (Nyquist) bins are not doubled; every other bin is doubled.
    double minLogEnergy = Math.Log10(epsilon / windowPower / sampleRate);
    double minLogEnergy2 = Math.Log10(epsilon * 2 / windowPower / sampleRate);

    double[,] spectra = new double[frameCount, binCount];

    // Walk the matrix frame by frame so that memory is accessed in row-major order.
    // A matrix with zero bins simply yields an empty result.
    for (int i = 0; i < frameCount; i++)
    {
        for (int j = 0; j < binCount; j++)
        {
            // DC and Nyquist bins occur once in the full spectrum; all other bins
            // stand for both a positive and a negative frequency component.
            bool isEdgeBin = j == 0 || j == lastBin;
            double energy = energyM[i, j];

            if (energy < epsilon)
            {
                spectra[i, j] = isEdgeBin ? minLogEnergy : minLogEnergy2;
            }
            else
            {
                double binEnergy = isEdgeBin ? energy : energy * 2;
                spectra[i, j] = Math.Log10(binEnergy / windowPower / sampleRate);
            }
        }
    }

    return spectra;
}

public static int[] VocalizationDetection(double[] decibels, double lowerDbThreshold, double upperDbThreshold, int k1k2delay, int syllableGap, int minPulse, int[] zeroCrossings)
{
int length = decibels.Length;
Expand Down Expand Up @@ -421,7 +495,7 @@ public static double InverseHerzTranform(double m, double c, double div)
/// Uses Greg's MelIntegral
/// The first step is to calculate the number of filters for the required frequency sub-band.
/// </summary>
/// <param name="matrix">the sonogram.</param>
/// <param name="matrix">the spectrogram.</param>
/// <param name="filterBankCount">number of filters over full freq range 0 Hz - Nyquist.</param>
/// <param name="nyquist">max frequency in original spectra.</param>
/// <param name="minFreq">min freq in the passed sonogram matrix.</param>
Expand Down Expand Up @@ -675,10 +749,15 @@ public static double[] DCT(double[] spectrum, double[,] cosines)
//*********************************************** GET ACOUSTIC VECTORS

/// <summary>
/// This method assumes that the supplied mfcc matrix DOES NOT contain dB values in column one.
/// These are added in from the supplied dB array.
/// This method assumes that the supplied mfcc matrix DOES NOT contain frame dB (log energy) values in column zero.
/// These are added in from the supplied array of frame log-energies.
/// </summary>
public static double[,] AcousticVectors(double[,] mfcc, double[] dBNormed, bool includeDelta, bool includeDoubleDelta)
/// <param name="mfcc">A matrix of mfcc coefficients. Column zero is empty.</param>
/// <param name="frameDbNormed">log-energy values for the frames.</param>
/// <param name="includeDelta">Whether or not to add delta features.</param>
/// <param name="includeDoubleDelta">Whether or not to add double delta features.</param>
/// <returns>A matrix of complete mfcc values with additional deltas, frame energies etc.</returns>
public static double[,] AcousticVectors(double[,] mfcc, double[] frameDbNormed, bool includeDelta, bool includeDoubleDelta)
{
//both the matrix of mfcc's and the array of decibels have been normed in 0-1.
int frameCount = mfcc.GetLength(0); //number of time frames
Expand All @@ -701,7 +780,7 @@ public static double[] DCT(double[] spectrum, double[,] cosines)
// loop through the time frames and create feature vector for each frame.
for (int t = 0; t < frameCount; t++)
{
double[] fv = GetMfccFeatureVector(dBNormed, mfcc, t, includeDelta, includeDoubleDelta); //get feature vector for frame (t)
double[] fv = GetMfccFeatureVector(frameDbNormed, mfcc, t, includeDelta, includeDoubleDelta); //get feature vector for frame (t)

//transfer feature vector to the matrix of acoustic features.
for (int i = 0; i < dim; i++)
Expand Down
Loading

0 comments on commit d53bf26

Please sign in to comment.