diff --git a/src/feat/Makefile b/src/feat/Makefile index e987de55b38..af207402a15 100644 --- a/src/feat/Makefile +++ b/src/feat/Makefile @@ -6,18 +6,17 @@ include ../kaldi.mk TESTFILES = feature-mfcc-test feature-plp-test feature-fbank-test \ feature-functions-test pitch-functions-test feature-sdc-test \ - resample-test online-feature-test signal-test + resample-test online-feature-test signal-test wave-reader-test OBJFILES = feature-functions.o feature-mfcc.o feature-plp.o feature-fbank.o \ feature-spectrogram.o mel-computations.o wave-reader.o \ pitch-functions.o resample.o online-feature.o signal.o \ - feature-window.o + feature-window.o LIBNAME = kaldi-feat ADDLIBS = ../transform/kaldi-transform.a ../gmm/kaldi-gmm.a \ ../tree/kaldi-tree.a ../util/kaldi-util.a ../thread/kaldi-thread.a \ - ../matrix/kaldi-matrix.a ../base/kaldi-base.a + ../matrix/kaldi-matrix.a ../base/kaldi-base.a include ../makefiles/default_rules.mk - diff --git a/src/feat/online-feature-test.cc b/src/feat/online-feature-test.cc index 556160f8e53..aea89d0099d 100644 --- a/src/feat/online-feature-test.cc +++ b/src/feat/online-feature-test.cc @@ -21,22 +21,11 @@ #include "feat/online-feature.h" #include "feat/wave-reader.h" +#include "matrix/kaldi-matrix.h" #include "transform/transform-common.h" namespace kaldi { - -template static void AssertEqual(const Matrix &A, - const Matrix &B, - float tol = 0.001) { - KALDI_ASSERT(A.NumRows() == B.NumRows()&&A.NumCols() == B.NumCols()); - for (MatrixIndexT i = 0;i < A.NumRows();i++) - for (MatrixIndexT j = 0;j < A.NumCols();j++) { - KALDI_ASSERT(std::abs(A(i, j)-B(i, j)) < tol * std::max(1.0, - static_cast(std::abs(A(i, j))+std::abs(B(i, j))))); - } -} - void GetOutput(OnlineFeatureInterface *a, Matrix *output) { int32 dim = a->Dim(); diff --git a/src/feat/wave-reader-test.cc b/src/feat/wave-reader-test.cc new file mode 100644 index 00000000000..f9a71e8af34 --- /dev/null +++ b/src/feat/wave-reader-test.cc @@ -0,0 +1,217 @@ +// feat/wave-reader-test.cc + +// Copyright 2017 Smart Action LLC (kkm) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#include + +#include "base/kaldi-math.h" +#include "feat/wave-reader.h" +#include "matrix/kaldi-matrix.h" + +using namespace kaldi; + +// Ugly macros to package bytes in wave file order (low-endian). +#define BY(n,k) ((char)((uint32)(n) >> (8 * (k)) & 0xFF)) +#define WRD(n) BY(n,0), BY(n,1) +#define DWRD(n) BY(n,0), BY(n,1), BY(n,2), BY(n,3) + +static void UnitTestStereo8K() { + /* Reference file written with Adobe Audition (random data): +00000000 52 49 46 46 32 00 00 00 57 41 56 45 66 6d 74 20 |RIFF2...WAVEfmt | +00000010 12 00 00 00 01 00 02 00 40 1f 00 00 00 7d 00 00 |........@....}..| +00000020 04 00 10 00 00 00 64 61 74 61 0c 00 00 00 00 00 |......data......| +00000030 31 51 ff 21 f4 63 38 4c 26 60 |1Q.!.c8L&`| + */ + + const int hz = 8000; + const int byps = hz * 2 /* channels */ * 2 /* bytes/sample */; + const char file_data[] = { + 'R', 'I', 'F', 'F', + DWRD(50), // File length after this point. + 'W', 'A', 'V', 'E', + 'f', 'm', 't', ' ', + DWRD(18), // sizeof(struct WAVEFORMATEX) + WRD(1), // WORD wFormatTag; + WRD(2), // WORD nChannels; + DWRD(hz), // DWORD nSamplesPerSec; 40 1f 00 00 + DWRD(byps), // DWORD nAvgBytesPerSec; 00 7d 00 00 + WRD(4), // WORD nBlockAlign; + WRD(16), // WORD wBitsPerSample; + WRD(0), // WORD cbSize; + 'd', 'a', 't', 'a', + DWRD(12), // 'data' chunk length. + WRD(0), WRD(-1), + WRD(-32768), WRD(0), + WRD(32767), WRD(1) + }; + + const char expect_mat[] = "[ 0 -32768 32767 \n -1 0 1 ]"; + + // Read binary file data. + std::istringstream iws(std::string(file_data, sizeof file_data), + std::ios::in | std::ios::binary); + WaveData wave; + wave.Read(iws); + + // Read expected wave data. + std::istringstream ies(expect_mat, std::ios::in); + Matrix expected; + expected.Read(ies, false /* text */); + + AssertEqual(wave.SampFreq(), hz, 0); + AssertEqual(wave.Duration(), 3.0 /* samples */ / hz /* Hz */, 1E-6); + AssertEqual(wave.Data(), expected); +} + +static void UnitTestMono22K() { + /* Reference file written with Adobe Audition (random data): +00000000 52 49 46 46 30 00 00 00 57 41 56 45 66 6d 74 20 |RIFF0...WAVEfmt | +00000010 12 00 00 00 01 00 01 00 22 56 00 00 44 ac 00 00 |........"V..D...| +00000020 02 00 10 00 00 00 64 61 74 61 0a 00 00 00 25 36 |......data....%6| +00000030 cb 41 1b 4d 04 4e 62 3d |.A.M.Nb=| + */ + + const int hz = 22050; + const int byps = hz * 1 /* channels */ * 2 /* bytes/sample */; + const char file_data[] = { + 'R', 'I', 'F', 'F', + DWRD(48), // File length after this point. + 'W', 'A', 'V', 'E', + 'f', 'm', 't', ' ', + DWRD(18), // sizeof(struct WAVEFORMATEX) + WRD(1), // WORD wFormatTag; + WRD(1), // WORD nChannels; + DWRD(hz), // DWORD nSamplesPerSec; + DWRD(byps), // DWORD nAvgBytesPerSec; + WRD(2), // WORD nBlockAlign; + WRD(16), // WORD wBitsPerSample; + WRD(0), // WORD cbSize; + 'd', 'a', 't', 'a', + DWRD(10), // 'data' chunk length. + WRD(0), WRD(-1), WRD(-32768), WRD(32767), WRD(1) + }; + + const char expect_mat[] = "[ 0 -1 -32768 32767 1 ]"; + + // Read binary file data. + std::istringstream iws(std::string(file_data, sizeof file_data), + std::ios::in | std::ios::binary); + WaveData wave; + wave.Read(iws); + + // Read expected matrix. + std::istringstream ies(expect_mat, std::ios::in); + Matrix expected; + expected.Read(ies, false /* text */); + + AssertEqual(wave.SampFreq(), hz, 0); + AssertEqual(wave.Duration(), 5.0 /* samples */ / hz /* Hz */, 1E-6); + AssertEqual(wave.Data(), expected); +} + +static void UnitTestEndless1() { + const int hz = 8000; + const int byps = hz * 1 /* channels */ * 2 /* bytes/sample */; + const char file_data[] = { + 'R', 'I', 'F', 'F', + DWRD(0), // File length unknown + 'W', 'A', 'V', 'E', + 'f', 'm', 't', ' ', + DWRD(18), // sizeof(struct WAVEFORMATEX) + WRD(1), // WORD wFormatTag; + WRD(1), // WORD nChannels; + DWRD(hz), // DWORD nSamplesPerSec; + DWRD(byps), // DWORD nAvgBytesPerSec; + WRD(2), // WORD nBlockAlign; + WRD(16), // WORD wBitsPerSample; + WRD(0), // WORD cbSize; + 'd', 'a', 't', 'a', + DWRD(0), // 'data' chunk length unknown. + WRD(1), WRD(2), WRD(3) + }; + + const char expect_mat[] = "[ 1 2 3 ]"; + + // Read binary file data. + std::istringstream iws(std::string(file_data, sizeof file_data), + std::ios::in | std::ios::binary); + WaveData wave; + wave.Read(iws); + + // Read expected matrix. + std::istringstream ies(expect_mat, std::ios::in); + Matrix expected; + expected.Read(ies, false /* text */); + + AssertEqual(wave.Data(), expected); +} + +static void UnitTestEndless2() { + const int hz = 8000; + const int byps = hz * 1 /* channels */ * 2 /* bytes/sample */; + const char file_data[] = { + 'R', 'I', 'F', 'F', + DWRD(-1), // File length unknown + 'W', 'A', 'V', 'E', + 'f', 'm', 't', ' ', + DWRD(18), // sizeof(struct WAVEFORMATEX) + WRD(1), // WORD wFormatTag; + WRD(1), // WORD nChannels; + DWRD(hz), // DWORD nSamplesPerSec; + DWRD(byps), // DWORD nAvgBytesPerSec; + WRD(2), // WORD nBlockAlign; + WRD(16), // WORD wBitsPerSample; + WRD(0), // WORD cbSize; + 'd', 'a', 't', 'a', + DWRD(-1), // 'data' chunk length unknown. + WRD(1), WRD(2), WRD(3) + }; + + const char expect_mat[] = "[ 1 2 3 ]"; + + // Read binary file data. + std::istringstream iws(std::string(file_data, sizeof file_data), + std::ios::in | std::ios::binary); + WaveData wave; + wave.Read(iws); + + // Read expected matrix. + std::istringstream ies(expect_mat, std::ios::in); + Matrix expected; + expected.Read(ies, false /* text */); + + AssertEqual(wave.Data(), expected); +} + +static void UnitTest() { + UnitTestStereo8K(); + UnitTestMono22K(); + UnitTestEndless1(); + UnitTestEndless2(); +} + +int main() { + try { + UnitTest(); + std::cout << "LGTM\n"; + return 0; + } catch (const std::exception &e) { + std::cerr << e.what(); + return 1; + } +} diff --git a/src/feat/wave-reader.cc b/src/feat/wave-reader.cc index 389b461d86c..62fcd7601d8 100644 --- a/src/feat/wave-reader.cc +++ b/src/feat/wave-reader.cc @@ -30,53 +30,59 @@ namespace kaldi { -// static -void WaveData::Expect4ByteTag(std::istream &is, const char *expected) { - char tmp[5]; - tmp[4] = '\0'; - is.read(tmp, 4); - if (is.fail()) - KALDI_ERR << "WaveData: expected " << expected << ", failed to read anything"; - if (strcmp(tmp, expected)) - KALDI_ERR << "WaveData: expected " << expected << ", got " << tmp; -} +// A utility class for reading wave header. +struct WaveHeaderReadGofer { + std::istream &is; + bool swap; + char tag[5]; + + WaveHeaderReadGofer(std::istream &is) : is(is), swap(false) { + memset(tag, '\0', sizeof tag); + } -uint32 WaveData::ReadUint32(std::istream &is, bool swap) { - union { - char result[4]; - uint32 ans; - } u; - is.read(u.result, 4); - if (swap) - KALDI_SWAP4(u.result); - if (is.fail()) - KALDI_ERR << "WaveData: unexpected end of file."; - return u.ans; -} + void Expect4ByteTag(const char *expected) { + is.read(tag, 4); + if (is.fail()) + KALDI_ERR << "WaveData: expected " << expected + << ", failed to read anything"; + if (strcmp(tag, expected)) + KALDI_ERR << "WaveData: expected " << expected << ", got " << tag; + } + void Read4ByteTag() { + is.read(tag, 4); + if (is.fail()) + KALDI_ERR << "WaveData: expected 4-byte chunk-name, got read error"; + } -uint16 WaveData::ReadUint16(std::istream &is, bool swap) { - union { - char result[2]; - int16 ans; - } u; - is.read(u.result, 2); - if (swap) - KALDI_SWAP2(u.result); - if (is.fail()) - KALDI_ERR << "WaveData: unexpected end of file."; - return u.ans; -} + uint32 ReadUint32() { + union { + char result[4]; + uint32 ans; + } u; + is.read(u.result, 4); + if (swap) + KALDI_SWAP4(u.result); + if (is.fail()) + KALDI_ERR << "WaveData: unexpected end of file or read error"; + return u.ans; + } -// static -void WaveData::Read4ByteTag(std::istream &is, char *dest) { - is.read(dest, 4); - if (is.fail()) - KALDI_ERR << "WaveData: expected 4-byte chunk-name, got read errror"; -} + uint16 ReadUint16() { + union { + char result[2]; + int16 ans; + } u; + is.read(u.result, 2); + if (swap) + KALDI_SWAP2(u.result); + if (is.fail()) + KALDI_ERR << "WaveData: unexpected end of file or read error"; + return u.ans; + } +}; -// static -void WaveData::WriteUint32(std::ostream &os, int32 i) { +static void WriteUint32(std::ostream &os, int32 i) { union { char buf[4]; int i; @@ -90,7 +96,7 @@ void WaveData::WriteUint32(std::ostream &os, int32 i) { KALDI_ERR << "WaveData: error writing to stream."; } -void WaveData::WriteUint16(std::ostream &os, int16 i) { +static void WriteUint16(std::ostream &os, int16 i) { union { char buf[2]; int16 i; @@ -104,57 +110,54 @@ void WaveData::WriteUint16(std::ostream &os, int16 i) { KALDI_ERR << "WaveData: error writing to stream."; } - - -void WaveData::Read(std::istream &is, ReadDataType read_data) { - data_.Resize(0, 0); // clear the data. - - char tmp[5]; - tmp[4] = '\0'; - Read4ByteTag(is, &tmp[0]); - bool is_rifx = false; - if (!strcmp(tmp, "RIFX")) - is_rifx = true; - else if (strcmp(tmp, "RIFF")) - KALDI_ERR << "WaveData: expected RIFF or RIFX, got " << tmp; +void WaveInfo::Read(std::istream &is) { + WaveHeaderReadGofer reader(is); + reader.Read4ByteTag(); + if (strcmp(reader.tag, "RIFF") == 0) + reverse_bytes_ = false; + else if (strcmp(reader.tag, "RIFX") == 0) + reverse_bytes_ = true; + else + KALDI_ERR << "WaveData: expected RIFF or RIFX, got " << reader.tag; #ifdef __BIG_ENDIAN__ - bool swap = !is_rifx; -#else - bool swap = is_rifx; + reverse_bytes_ = !reverse_bytes_; #endif + reader.swap = reverse_bytes_; - uint32 riff_chunk_size = ReadUint32(is, swap); - Expect4ByteTag(is, "WAVE"); + uint32 riff_chunk_size = reader.ReadUint32(); + reader.Expect4ByteTag("WAVE"); uint32 riff_chunk_read = 0; riff_chunk_read += 4; // WAVE included in riff_chunk_size. - Expect4ByteTag(is, "fmt "); - uint32 subchunk1_size = ReadUint32(is, swap); - uint16 audio_format = ReadUint16(is, swap), - num_channels = ReadUint16(is, swap); - uint32 sample_rate = ReadUint32(is, swap), - byte_rate = ReadUint32(is, swap), - block_align = ReadUint16(is, swap), - bits_per_sample = ReadUint16(is, swap); - uint32 fmt_chunk_read = 16; + reader.Expect4ByteTag("fmt "); + uint32 subchunk1_size = reader.ReadUint32(); + uint16 audio_format = reader.ReadUint16(); + num_channels_ = reader.ReadUint16(); + uint32 sample_rate = reader.ReadUint32(), + byte_rate = reader.ReadUint32(), + block_align = reader.ReadUint16(), + bits_per_sample = reader.ReadUint16(); + samp_freq_ = static_cast(sample_rate); + uint32 fmt_chunk_read = 16; if (audio_format == 1) { if (subchunk1_size < 16) { - KALDI_ERR << "WaveData: expect PCM format data to have fmt chunk of at least size 16."; + KALDI_ERR << "WaveData: expect PCM format data to have fmt chunk " + << "of at least size 16."; } } else if (audio_format == 0xFFFE) { // WAVE_FORMAT_EXTENSIBLE - uint16 extra_size = ReadUint16(is, swap); + uint16 extra_size = reader.ReadUint16(); if (subchunk1_size < 40 || extra_size < 22) { KALDI_ERR << "WaveData: malformed WAVE_FORMAT_EXTENSIBLE format data."; } - ReadUint16(is, swap); // Unused for PCM. - ReadUint32(is, swap); // Channel map: we do not care. - uint32 guid1 = ReadUint32(is, swap), - guid2 = ReadUint32(is, swap), - guid3 = ReadUint32(is, swap), - guid4 = ReadUint32(is, swap); + reader.ReadUint16(); // Unused for PCM. + reader.ReadUint32(); // Channel map: we do not care. + uint32 guid1 = reader.ReadUint32(), + guid2 = reader.ReadUint32(), + guid3 = reader.ReadUint32(), + guid4 = reader.ReadUint32(); fmt_chunk_read = 40; // Support only KSDATAFORMAT_SUBTYPE_PCM for now. Interesting formats: @@ -164,27 +167,27 @@ void WaveData::Read(std::istream &is, ReadDataType read_data) { // ("00000007-0000-0010-8000-00aa00389b71", KSDATAFORMAT_SUBTYPE_MULAW) if (guid1 != 0x00000001 || guid2 != 0x00100000 || guid3 != 0xAA000080 || guid4 != 0x719B3800) { - KALDI_ERR << "WaveData: unknown/unsupported WAVE_FORMAT_EXTENSIBLE format."; + KALDI_ERR << "WaveData: unsupported WAVE_FORMAT_EXTENSIBLE format."; } } else { KALDI_ERR << "WaveData: can read only PCM data, format id in file is: " << audio_format; } - for (uint32 i = fmt_chunk_read; i < subchunk1_size; i++) is.get(); // use up extra data. + for (uint32 i = fmt_chunk_read; i < subchunk1_size; ++i) + is.get(); // use up extra data. - if (num_channels <= 0) + if (num_channels_ == 0) KALDI_ERR << "WaveData: no channels present"; - samp_freq_ = static_cast(sample_rate); - if (bits_per_sample != 8 && bits_per_sample != 16 && bits_per_sample != 32) - KALDI_ERR << "WaveData: bits_per_sample is " << bits_per_sample; - if (byte_rate != sample_rate * bits_per_sample/8 * num_channels) + if (bits_per_sample != 16) + KALDI_ERR << "WaveData: unsupported bits_per_sample = " << bits_per_sample; + if (byte_rate != sample_rate * bits_per_sample/8 * num_channels_) KALDI_ERR << "Unexpected byte rate " << byte_rate << " vs. " - << sample_rate <<" * " << (bits_per_sample/8) - << " * " << num_channels; - if (block_align != num_channels * bits_per_sample/8) + << sample_rate << " * " << (bits_per_sample/8) + << " * " << num_channels_; + if (block_align != num_channels_ * bits_per_sample/8) KALDI_ERR << "Unexpected block_align: " << block_align << " vs. " - << num_channels << " * " << (bits_per_sample/8); + << num_channels_ << " * " << (bits_per_sample/8); riff_chunk_read += 8 + subchunk1_size; // size of what we just read, 4 bytes for "fmt " + 4 @@ -193,38 +196,52 @@ void WaveData::Read(std::istream &is, ReadDataType read_data) { // We support an optional "fact" chunk (which is useless but which // we encountered), and then a single "data" chunk. - char next_chunk_name[4]; - Read4ByteTag(is, next_chunk_name); + reader.Read4ByteTag(); riff_chunk_read += 4; // Skip any subchunks between "fmt" and "data". Usually there will // be a single "fact" subchunk, but on Windows there can also be a // "list" subchunk. - while (strncmp(next_chunk_name, "data", 4) != 0) { + while (strcmp(reader.tag, "data") != 0) { // We will just ignore the data in these chunks. - uint32 chunk_sz = ReadUint32(is, swap); - if (chunk_sz != 4 && strncmp(next_chunk_name, "fact", 4) == 0) + uint32 chunk_sz = reader.ReadUint32(); + if (chunk_sz != 4 && strcmp(reader.tag, "fact") == 0) KALDI_WARN << "Expected fact chunk to be 4 bytes long."; for (uint32 i = 0; i < chunk_sz; i++) is.get(); riff_chunk_read += 4 + chunk_sz; // for chunk_sz (4) + chunk contents (chunk-sz) // Now read the next chunk name. - Read4ByteTag(is, next_chunk_name); + reader.Read4ByteTag(); riff_chunk_read += 4; } - if (strncmp(next_chunk_name, "data", 4)) + if (strcmp(reader.tag, "data")) KALDI_ERR << "WaveData: expected data chunk, got instead " - << next_chunk_name; + << reader.tag; - uint32 data_chunk_size = ReadUint32(is, swap); + uint32 data_chunk_size = reader.ReadUint32(); riff_chunk_read += 4; - if (std::abs(static_cast(riff_chunk_read) + - static_cast(data_chunk_size) - - static_cast(riff_chunk_size)) > 1) { - // we allow the size to be off by one without warning, because there is a + // Figure out if the file is going to be read to the end. Values as + // observed in the wild: + bool is_stream_mode = + riff_chunk_size == 0 + || riff_chunk_size == 0xFFFFFFFF + || data_chunk_size == 0 + || data_chunk_size == 0xFFFFFFFF + || data_chunk_size == 0x7FFFF000; // This value is used by SoX. + + if (is_stream_mode) + KALDI_VLOG(1) << "Read in RIFF chunk size: " << riff_chunk_size + << ", data chunk size: " << data_chunk_size + << ". Assume 'stream mode' (reading data to EOF)."; + + if (!is_stream_mode + && std::abs(static_cast(riff_chunk_read) + + static_cast(data_chunk_size) - + static_cast(riff_chunk_size)) > 1) { + // We allow the size to be off by one without warning, because there is a // weirdness in the format of RIFF files that means that the input may // sometimes be padded with 1 unused byte to make the total size even. KALDI_WARN << "Expected " << riff_chunk_size << " bytes in RIFF chunk, but " @@ -233,87 +250,60 @@ void WaveData::Read(std::istream &is, ReadDataType read_data) { << "(we do not support reading multiple data chunks)."; } - if (read_data == kLeaveDataUndefined) { - // we won't actually be reading the data- we'll just be faking that we read - // that data, so the caller can get the metadata. - // assume we'd read the same number of bytes that the data-chunk header - // says we'd read. - int32 num_bytes_read = data_chunk_size; - uint32 num_samp = num_bytes_read / block_align; - data_.Resize(num_channels, num_samp, kUndefined); - return; - } else { - KALDI_ASSERT(read_data == kReadData); - } + if (is_stream_mode) + samp_count_ = -1; + else + samp_count_ = data_chunk_size / block_align; +} - std::vector data_pointer_vec; - std::vector data_size_vec; - uint32 num_bytes_read = 0; - for (int32 remain_chunk_size = data_chunk_size; remain_chunk_size > 0; - remain_chunk_size -= kBlockSize) { - int32 this_block_size = remain_chunk_size; - if (kBlockSize < remain_chunk_size) - this_block_size = kBlockSize; - char *block_data_vec = new char[this_block_size]; - is.read(block_data_vec, this_block_size); - num_bytes_read += is.gcount(); - data_size_vec.push_back(is.gcount()); - data_pointer_vec.push_back(block_data_vec); - if (num_bytes_read < this_block_size) - break; - } +void WaveData::Read(std::istream &is) { + const uint32 kBlockSize = 1024 * 1024; - std::vector chunk_data_vec(num_bytes_read); - uint32 data_address = 0; - for (int i = 0; i < data_pointer_vec.size(); i++) { - memcpy(&(chunk_data_vec[data_address]), data_pointer_vec[i], - data_size_vec[i]); - delete[] data_pointer_vec[i]; - data_address += data_size_vec[i]; - } + WaveInfo header; + header.Read(is); - char *data_ptr = &(chunk_data_vec[0]); - if (num_bytes_read == 0 && num_bytes_read != data_chunk_size) { - KALDI_ERR << "WaveData: failed to read data chunk (read no bytes)"; - } else if (num_bytes_read != data_chunk_size) { - KALDI_ASSERT(num_bytes_read < data_chunk_size); - KALDI_WARN << "Read fewer bytes than specified in the header: " - << num_bytes_read << " < " << data_chunk_size; + data_.Resize(0, 0); // clear the data. + samp_freq_ = header.SampFreq(); + + std::vector buffer; + uint32 bytes_to_go = header.IsStreamed() ? kBlockSize : header.DataBytes(); + + // Once in a while header.DataBytes() will report an insane value; + // read the file to the end + while (is && bytes_to_go > 0) { + uint32 block_bytes = std::min(bytes_to_go, kBlockSize); + uint32 offset = buffer.size(); + buffer.resize(offset + block_bytes); + is.read(&buffer[offset], block_bytes); + uint32 bytes_read = is.gcount(); + buffer.resize(offset + bytes_read); + if (!header.IsStreamed()) + bytes_to_go -= bytes_read; } - if (data_chunk_size == 0) + if (is.bad()) + KALDI_ERR << "WaveData: file read error"; + + if (buffer.size() == 0) KALDI_ERR << "WaveData: empty file (no data)"; - uint32 num_samp = num_bytes_read / block_align; - data_.Resize(num_channels, num_samp); - for (uint32 i = 0; i < num_samp; i++) { - for (uint32 j = 0; j < num_channels; j++) { - switch (bits_per_sample) { - case 8: - data_(j, i) = *data_ptr; - data_ptr++; - break; - case 16: - { - int16 k = *reinterpret_cast(data_ptr); - if (swap) - KALDI_SWAP2(k); - data_(j, i) = k; - data_ptr += 2; - break; - } - case 32: - { - int32 k = *reinterpret_cast(data_ptr); - if (swap) - KALDI_SWAP4(k); - data_(j, i) = k; - data_ptr += 4; - break; - } - default: - KALDI_ERR << "bits per sample is " << bits_per_sample; // already checked this. - } + if (!header.IsStreamed() && buffer.size() < header.DataBytes()) { + KALDI_WARN << "Expected " << header.DataBytes() << " bytes of wave data, " + << "but read only " << buffer.size() << " bytes. " + << "Truncated file?"; + } + + uint16 *data_ptr = reinterpret_cast(&buffer[0]); + + // The matrix is arranged row per channel, column per sample. + data_.Resize(header.NumChannels(), + buffer.size() / header.BlockAlign()); + for (uint32 i = 0; i < data_.NumCols(); ++i) { + for (uint32 j = 0; j < data_.NumRows(); ++j) { + int16 k = *data_ptr++; + if (header.ReverseBytes()) + KALDI_SWAP2(k); + data_(j, i) = k; } } } diff --git a/src/feat/wave-reader.h b/src/feat/wave-reader.h index 0749022f7d7..d7e1e2dd5ae 100644 --- a/src/feat/wave-reader.h +++ b/src/feat/wave-reader.h @@ -61,11 +61,50 @@ namespace kaldi { /// (2^15-1)*[-1, 1], not the usual default DSP range [-1, 1]. const BaseFloat kWaveSampleMax = 32768.0; +/// This class reads and hold wave file header information. +class WaveInfo { + public: + WaveInfo() : samp_freq_(0), samp_count_(0), + num_channels_(0), reverse_bytes_(0) {} + + /// Is stream size unknown? Duration and SampleCount not valid if true. + bool IsStreamed() const { return samp_count_ < 0; } + + /// Sample frequency, Hz. + BaseFloat SampFreq() const { return samp_freq_; } + + /// Number of samples in stream. Invalid if IsStreamed() is true. + uint32 SampleCount() const { return samp_count_; } + + /// Approximate duration, seconds. Invalid if IsStreamed() is true. + BaseFloat Duration() const { return samp_count_ / samp_freq_; } + + /// Number of channels, 1 to 16. + int32 NumChannels() const { return num_channels_; } + + /// Bytes per sample. + size_t BlockAlign() const { return 2 * num_channels_; } + + /// Wave data bytes. Invalid if IsStreamed() is true. + size_t DataBytes() const { return samp_count_ * BlockAlign(); } + + /// Is data file byte order different from machine byte order? + bool ReverseBytes() const { return reverse_bytes_; } + + /// 'is' should be opened in binary mode. Read() will throw on error. + /// On success 'is' will be positioned at the beginning of wave data. + void Read(std::istream &is); + + private: + BaseFloat samp_freq_; + int32 samp_count_; // 0 if empty, -1 if undefined length. + uint8 num_channels_; + bool reverse_bytes_; // File endianness differs from host. +}; + /// This class's purpose is to read in Wave files. class WaveData { public: - enum ReadDataType { kReadData, kLeaveDataUndefined }; - WaveData(BaseFloat samp_freq, const MatrixBase &data) : data_(data), samp_freq_(samp_freq) {} @@ -74,7 +113,7 @@ class WaveData { /// Read() will throw on error. It's valid to call Read() more than once-- /// in this case it will destroy what was there before. /// "is" should be opened in binary mode. - void Read(std::istream &is, ReadDataType read_data = kReadData); + void Read(std::istream &is); /// Write() will throw on error. os should be opened in binary mode. void Write(std::ostream &os) const; @@ -108,13 +147,6 @@ class WaveData { static const uint32 kBlockSize = 1024 * 1024; // Use 1M bytes. Matrix data_; BaseFloat samp_freq_; - static void Expect4ByteTag(std::istream &is, const char *expected); - uint32 ReadUint32(std::istream &is, bool swap); - uint16 ReadUint16(std::istream &is, bool swap); - static void Read4ByteTag(std::istream &is, char *dest); - - static void WriteUint32(std::ostream &os, int32 i); - static void WriteUint16(std::ostream &os, int16 i); }; @@ -135,7 +167,7 @@ class WaveHolder { t.Write(os); // throws exception on failure. return true; } catch (const std::exception &e) { - KALDI_WARN << "Exception caught in WaveHolder object (writing). " + KALDI_WARN << "Exception caught in WaveHolder object (writing). " << e.what(); return false; // write failure. } @@ -159,12 +191,11 @@ class WaveHolder { bool Read(std::istream &is) { // We don't look for the binary-mode header here [always binary] try { - t_.Read(is); // throws exception on failure. + t_.Read(is); // Throws exception on failure. return true; } catch (const std::exception &e) { - KALDI_WARN << "Exception caught in WaveHolder object (reading). " - << e.what(); - return false; // write failure. + KALDI_WARN << "Exception caught in WaveHolder::Read(). " << e.what(); + return false; } } @@ -185,50 +216,30 @@ class WaveHolder { // it leaves the actual data undefined, it doesn't read it. class WaveInfoHolder { public: - typedef WaveData T; - - static bool Write(std::ostream &os, bool binary, const T &t) { - KALDI_ERR << "This holder type does not support writing."; - return true; - } - - void Copy(const T &t) { t_.CopyFrom(t); } + typedef WaveInfo T; + void Clear() { info_ = WaveInfo(); } + void Swap(WaveInfoHolder *other) { std::swap(info_, other->info_); } + const T &Value() { return info_; } static bool IsReadInBinary() { return true; } - void Clear() { t_.Clear(); } - - const T &Value() { return t_; } - - WaveInfoHolder &operator = (const WaveInfoHolder &other) { - t_.CopyFrom(other.t_); - return *this; - } - WaveInfoHolder(const WaveInfoHolder &other): t_(other.t_) {} - - WaveInfoHolder() {} - bool Read(std::istream &is) { try { - t_.Read(is, WaveData::kLeaveDataUndefined); // throws exception on failure. + info_.Read(is); // Throws exception on failure. return true; } catch (const std::exception &e) { - KALDI_WARN << "Exception caught in WaveHolder object (reading). " - << e.what(); - return false; // write failure. + KALDI_WARN << "Exception caught in WaveInfoHolder::Read(). " << e.what(); + return false; } } - void Swap(WaveInfoHolder *other) { - t_.Swap(&(other->t_)); - } - bool ExtractRange(const WaveInfoHolder &other, const std::string &range) { KALDI_ERR << "ExtractRange is not defined for this type of holder."; return false; } + private: - T t_; + WaveInfo info_; }; diff --git a/src/featbin/wav-to-duration.cc b/src/featbin/wav-to-duration.cc index 2eb95dc3fc1..8d043ed73bb 100644 --- a/src/featbin/wav-to-duration.cc +++ b/src/featbin/wav-to-duration.cc @@ -18,9 +18,9 @@ // limitations under the License. #include "base/kaldi-common.h" -#include "util/common-utils.h" #include "feat/feature-mfcc.h" #include "feat/wave-reader.h" +#include "util/common-utils.h" int main(int argc, char *argv[]) { try { @@ -71,13 +71,15 @@ int main(int argc, char *argv[]) { max_duration = std::max(max_duration, duration); num_done++; } - } - else { + } else { SequentialTableReader wav_reader(wav_rspecifier); for (; !wav_reader.Done(); wav_reader.Next()) { std::string key = wav_reader.Key(); - const WaveData &wave_data = wav_reader.Value(); - BaseFloat duration = wave_data.Duration(); + const WaveInfo &wave_info = wav_reader.Value(); + if (wave_info.IsStreamed()) + KALDI_ERR << "Error: member " << key << " has no duration in header. " + << "Check the source, and/or try --read-entire-file."; + BaseFloat duration = wave_info.Duration(); duration_writer.Write(key, duration); sum_duration += duration; @@ -99,4 +101,3 @@ int main(int argc, char *argv[]) { return -1; } } -