apache · sandeep-krishnamurthy · Dec 1, 2018 · Nov 17, 2018 · Nov 20, 2018 · Nov 20, 2018
diff --git a/example/gluon/urban_sounds/README.md b/example/gluon/urban_sounds/README.md
@@ -0,0 +1,65 @@
+# Urban Sounds classification in MXNet
+
+Urban Sounds Dataset:
+## Description
+  The dataset contains 8732 wav files, which are audio samples (<= 4s) of street sounds such as engine_idling, car_horn, children_playing, dog_barking, and so on.
+  The task is to classify these audio samples into one of the 10 labels.
+
+To be able to run this example:
+
+1. Download the dataset(train.zip, test.zip) required for this example from the location:
+**https://drive.google.com/drive/folders/0By0bAi7hOBAFUHVXd1JCN3MwTEU**
+
+
+2. Extract both zip archives into the **current directory**. After unzipping, you will have two new folders,
+   **Train** and **Test**, and two csv files, **train.csv** and **test.csv**.
+
+3. Install Apache MXNet on the machine. For instructions, see: **https://mxnet.incubator.apache.org/install/**
+
+4. Install Librosa using the command
+   `pip install librosa`.
+   For more details, refer to:
+   **https://librosa.github.io/librosa/install.html**
+
+
+For information on the current design of how the AudioFolderDataset is implemented, refer below:
+**https://cwiki.apache.org/confluence/display/MXNET/Gluon+-+Audio**
+
+## Usage 
+
+For training:
+
+- arguments
+  - train : The folder/directory that contains the audio(wav) files locally. Default = "./Train"
+  - csv: The file name of the csv file that contains audio file name to label mapping. Default = "train.csv"
+  - epochs : Number of epochs to train the model. Default = 30
+  - batch_size : The batch size for training. Default = 32
+
+
+###### default setting
+```
+python train.py
+``` 
+or
+
+###### manual setting
+```
+python train.py --train ./Train --csv train.csv --batch_size 32 --epochs 30 
+```
+
+For prediction:
+
+- arguments
+  - pred : The folder/directory that contains the audio(wav) files which are to be classified. Default = "./Test"
+
+
+###### default setting
+```
+python predict.py
+``` 
+or
+
+###### manual setting
+```
+python predict.py --pred ./Test
+```
diff --git a/example/gluon/urban_sounds/datasets.py b/example/gluon/urban_sounds/datasets.py
@@ -0,0 +1,173 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# coding: utf-8
+# pylint: disable=
+""" Audio Dataset container."""
+__all__ = ['AudioFolderDataset']
+
+import os
+import warnings
+from mxnet.gluon.data import Dataset
+from mxnet import ndarray as nd
+try:
+    import librosa
+except ImportError:
+    # Bind the name so that later availability checks are a plain `is None`
+    # test instead of raising NameError.
+    librosa = None
+    warnings.warn("librosa dependency could not be resolved or imported, "
+                  "could not load audio onto the numpy array. pip install librosa")
+
+
+class AudioFolderDataset(Dataset):
+    """A dataset for loading Audio files stored in a folder structure like::
+
+        root/children_playing/0.wav
+        root/siren/23.wav
+        root/drilling/26.wav
+        root/dog_barking/42.wav
+            OR
+        Files(wav) and a csv file that has file name and associated label
+
+    Parameters
+    ----------
+    root : str
+        Path to root directory.
+    train_csv: str, default None
+        Path of the csv file whose first column is the audio file name and whose
+        second column is the label. When None, labels are taken from the names of
+        the sub-folders of `root`.
+    file_format: str, default '.wav'
+        The format of the audio files(.wav)
+    skip_header: boolean, default False
+        While reading from csv file, whether to skip the first row to avoid reading in the header
+
+
+    Attributes
+    ----------
+    synsets : list
+        List of class names. `synsets[i]` is the name for the integer label `i`
+    items : list of tuples
+        List of all audio in (filename, label) pairs. In folder mode the label is an
+        int; in csv mode it is an NDArray of shape (1,) holding the label index.
+
+    """
+    def __init__(self, root, train_csv=None, file_format='.wav', skip_header=False):
+        if librosa is None:
+            # Best effort: the file listing is still built so the object stays
+            # usable; __getitem__ then returns file names instead of audio.
+            warnings.warn("pip install librosa to continue.")
+        self._root = os.path.expanduser(root)
+        self._exts = ['.wav']
+        if file_format.lower() not in self._exts:
+            raise RuntimeError("format {} not supported currently.".format(file_format))
+        self._format = file_format
+        self._train_csv = train_csv
+        self._list_audio_files(self._root, skip_rows=1 if skip_header else 0)
+
+    def _list_audio_files(self, root, skip_rows=0):
+        """Populates synsets - a map of index to label for the data items.
+        Populates the items of the dataset, making tuples of (filename, label).
+
+        In csv mode the synsets are additionally written to ./synset.txt
+        (unless that file already exists).
+        """
+        self.synsets = []
+        self.items = []
+        if self._train_csv is None:
+            self._list_from_folders(root)
+        else:
+            self._list_from_csv(root, skip_rows)
+            self._write_synsets()
+
+    def _list_from_folders(self, root):
+        """Uses every sub-folder of `root` as a class and its wav files as samples."""
+        for folder in sorted(os.listdir(root)):
+            path = os.path.join(root, folder)
+            if not os.path.isdir(path):
+                warnings.warn('Ignoring {}, which is not a directory.'.format(path))
+                continue
+            label = len(self.synsets)
+            self.synsets.append(folder)
+            for filename in sorted(os.listdir(path)):
+                file_name = os.path.join(path, filename)
+                ext = os.path.splitext(file_name)[1]
+                if ext.lower() not in self._exts:
+                    warnings.warn('Ignoring {} of type {}. Only support {}'
+                                  .format(filename, ext, ', '.join(self._exts)))
+                    continue
+                self.items.append((file_name, label))
+
+    def _list_from_csv(self, root, skip_rows):
+        """Reads (file name, label) rows from the csv file given as `train_csv`."""
+        label_index = {}
+        with open(self._train_csv, "r") as traincsv:
+            for row_number, line in enumerate(traincsv, start=1):
+                if row_number <= skip_rows:
+                    continue
+                if not line.strip():
+                    # Blank rows (e.g. a trailing newline) carry no sample.
+                    continue
+                fields = line.split(",")
+                if len(fields) < 2:
+                    warnings.warn('Ignoring row {} of {}: expected "file name,label".'
+                                  .format(row_number, self._train_csv))
+                    continue
+                filename = os.path.join(root, fields[0])
+                label = fields[1].strip()
+                if label not in label_index:
+                    label_index[label] = len(self.synsets)
+                    self.synsets.append(label)
+                # Suffix test (not substring) so that a '.wav' appearing elsewhere
+                # in the path does not suppress adding the extension.
+                if not filename.lower().endswith(self._format.lower()):
+                    filename = filename + self._format
+                self.items.append((filename, nd.array([label_index[label]]).reshape((1,))))
+
+    def _write_synsets(self):
+        """Writes one class name per line to ./synset.txt unless the file already exists."""
+        if os.path.exists("./synset.txt"):
+            warnings.warn("Synset file already exists in the current directory! Not generating synset.txt.")
+            return
+        with open("./synset.txt", "w") as synsets_file:
+            # Text mode already translates "\n" to the platform line separator.
+            synsets_file.writelines(item + "\n" for item in self.synsets)
+        print("Synsets is generated as synset.txt")
+
+    def __getitem__(self, idx):
+        """Retrieve the item (data, label) stored at idx in items.
+
+        The audio file is loaded with librosa and returned as an NDArray. If
+        librosa is not available, a warning is issued and the (filename, label)
+        pair is returned unchanged.
+        """
+        filename, label = self.items[idx]
+        if librosa is None:
+            warnings.warn("Dependency librosa is not installed! "
+                          "Cannot load the audio(wav) file into the numpy.ndarray.")
+            return filename, label
+        audio, _ = librosa.load(filename, res_type='kaiser_fast')
+        return nd.array(audio), label
+
+    def __len__(self):
+        """Retrieves the number of items in the dataset"""
+        return len(self.items)
+
+    def transform_first(self, fn, lazy=False):
+        """Returns a new dataset with the first element of each sample
+        transformed by the transformer function `fn`.
+
+        This is useful, for example, when you only want to transform data
+        while keeping label as is.
+        The default here is lazy=False so that all transforms are performed in
+        one shot and not during training. This is a performance consideration.
+
+        Parameters
+        ----------
+        fn : callable
+            A transformer function that takes the first element of a sample
+            as input and returns the transformed element.
+        lazy : bool, default False
+            If False, transforms all samples at once. Otherwise,
+            transforms each sample on demand. Note that if `fn`
+            is stochastic, you must set lazy to True or you will
+            get the same result on all epochs.
+
+        Returns
+        -------
+        Dataset
+            The transformed dataset.
+
+        """
+        return super(AudioFolderDataset, self).transform_first(fn, lazy=lazy)
diff --git a/example/gluon/urban_sounds/model.py b/example/gluon/urban_sounds/model.py
@@ -0,0 +1,33 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+"""This module builds an MLP with a configurable output layer (the number of units in the last layer).
+Users can pass any number of units for the last layer. Since this dataset has 10 labels,
+the default value of num_labels = 10
+"""
+import mxnet as mx
+from mxnet import gluon
+
+# Defining a neural network with number of labels
+def get_net(num_labels=10):
+    """Build and initialize the MLP classifier.
+
+    The network is two 256-unit ReLU Dense layers followed by a Dense output
+    layer with `num_labels` units. Parameters are initialized with Xavier.
+
+    Parameters
+    ----------
+    num_labels : int, default 10
+        Number of units in the output layer.
+
+    Returns
+    -------
+    gluon.nn.Sequential
+        The initialized network.
+    """
+    hidden_units = (256, 256)
+    model = gluon.nn.Sequential()
+    with model.name_scope():
+        for units in hidden_units:
+            model.add(gluon.nn.Dense(units, activation="relu"))
+        model.add(gluon.nn.Dense(num_labels))
+    initializer = mx.init.Xavier()
+    model.collect_params().initialize(initializer)
+    return model
diff --git a/example/gluon/urban_sounds/predict.py b/example/gluon/urban_sounds/predict.py
@@ -0,0 +1,91 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+""" Prediction module for Urban Sounds Classification
+"""
+import os
+import warnings
+import mxnet as mx
+from mxnet import nd
+from transforms import MFCC
+from model import get_net
+
+def predict(prediction_dir='./Test'):
+    """Run predictions on the audio files in the directory `prediction_dir`.
+
+    The class names are read from ./synset.txt and the trained weights from
+    ./net.params, both in the current directory. If librosa, the directory,
+    the synset file or the saved parameters are missing, a warning is issued
+    and the function returns without predicting. For every file, the file name
+    and the predicted class name are printed.
+
+    Parameters
+    ----------
+    prediction_dir: string, default ./Test
+        The directory that contains the audio files on which predictions are to be made
+
+    """
+
+    try:
+        import librosa
+    except ImportError:
+        warnings.warn("Librosa is not installed! please run the following command pip install librosa.")
+        return
+
+    # None (e.g. an unset command line option) and paths that are not
+    # directories are reported the same way as a missing directory.
+    if prediction_dir is None or not os.path.isdir(prediction_dir):
+        warnings.warn("The directory on which predictions are to be made is not found!")
+        return
+
+    # List the directory once; sub-directories are skipped because they cannot
+    # be loaded as audio. Sorting makes the output order deterministic.
+    candidates = (os.path.join(prediction_dir, item) for item in sorted(os.listdir(prediction_dir)))
+    full_file_names = [path for path in candidates if os.path.isfile(path)]
+    if not full_file_names:
+        warnings.warn("The directory on which predictions are to be made is empty! Exiting...")
+        return
+
+    # Loading synsets
+    if not os.path.exists('./synset.txt'):
+        warnings.warn("The synset or labels for the dataset do not exist. Please run the training script first.")
+        return
+
+    with open("./synset.txt", "r") as f:
+        synset = [l.rstrip() for l in f]
+    print("Trying to load the model with the saved parameters...")
+    if not os.path.exists("./net.params"):
+        warnings.warn("The model does not have any saved parameters... Cannot proceed! Train the model first")
+        return
+
+    # The output layer needs one unit per label in the synset file.
+    net = get_net(len(synset))
+    net.load_parameters("./net.params")
+    mfcc = MFCC()
+    print("\nStarting predictions for audio files in ", prediction_dir, " ....\n")
+    for filename in full_file_names:
+        # Argument kaiser_fast to res_type is faster than 'kaiser_best'. To reduce the load time, passing kaiser_fast.
+        X1, _ = librosa.load(filename, res_type='kaiser_fast')
+        transformed_test_data = mfcc(mx.nd.array(X1))
+        # Flatten the features into a single-sample batch for the MLP.
+        output = net(transformed_test_data.reshape((1, -1)))
+        prediction = nd.argmax(output, axis=1)
+        print(filename, " -> ", synset[int(prediction.asscalar())])
+
+
+if __name__ == '__main__':
+    # Command line entry point: `--pred/-p` selects the folder with the audio
+    # files to classify and defaults to ./Test, matching predict()'s default.
+    try:
+        import argparse
+    except ImportError:
+        warnings.warn("Argparse module not installed! passing default arguments.")
+        pred_dir = './Test'
+    else:
+        parser = argparse.ArgumentParser(description="Urban Sounds clsssification example - MXNet")
+        # Without an explicit default, omitting --pred would hand None to predict().
+        parser.add_argument('--pred', '-p', help="Enter the folder path that contains your audio files",
+                            type=str, default='./Test')
+        args = parser.parse_args()
+        pred_dir = args.pred
+    predict(prediction_dir=pred_dir)
+    print("Urban sounds classification Prediction DONE!")