From 0b203b420f0fe17637f540903bb82877415c9e79 Mon Sep 17 00:00:00 2001
From: Rudolf A Braun <rab014@gmail.com>
Date: Sat, 9 Nov 2019 12:36:00 +0000
Subject: [PATCH 1/3] [egs] split_scp.pl when using utt2dur will notice when
 certain splits are lacking data and will try and redistribute them more
 evenly.

---
 egs/wsj/s5/utils/split_scp.pl | 107 +++++++++++++++++++++++++++-------
 1 file changed, 85 insertions(+), 22 deletions(-)

diff --git a/egs/wsj/s5/utils/split_scp.pl b/egs/wsj/s5/utils/split_scp.pl
index 3ca14dbea7e..433cc5676dc 100755
--- a/egs/wsj/s5/utils/split_scp.pl
+++ b/egs/wsj/s5/utils/split_scp.pl
@@ -117,8 +117,12 @@
         @A = split;
         @A == 2 || die "Bad line $_ in utt2spk file $utt2dur_file";
         ($u,$d) = @A;
-        $utt2dur{$u} = $d;
         $dursum += $d;
+        $s = $utt2spk{$u};
+        if (!defined $spk2dur{$s}) {
+            $spk2dur{$s} = 0.0;
+        }
+        $spk2dur{$s} += $d;
     }
     open(I, "<$inscp") || die "Opening input scp file $inscp";
     @spkrs = ();
@@ -149,6 +153,7 @@
     }
     for($scpidx = 0; $scpidx < $numscps; $scpidx++) {
         $scparray[$scpidx] = []; # [] is array reference.
+        $scp2dur[$scpidx] = 0.0;
     }
     $splitdur = $dursum / $numscps;
     $dursum = 0.0;
@@ -156,38 +161,96 @@
     for my $spk (sort (keys %spk2utt)) {
         $scpcount[$scpidx] += $spk_count{$spk};
         push @{$scparray[$scpidx]}, $spk;
-        for my $utt (@{$spk2utt{$spk}}) {
-            $dur = $utt2dur{$utt};
-            $dursum += $dur;
-        }
+        $dur = $spk2dur{$spk};
+        $dursum += $dur;
         if ( $dursum >= $splitdur ) {
+            $scp2dur[$scpidx] = $dursum;
             $scpidx += 1;
             $dursum = 0.0;
         }
     }
 
-    # Because scpidx might not have gone up to numscps (because all utts from one
-    # speaker go into one split means a major imbalance will mean not all splits
-    # are filled), move one speaker inside scparray to the indices which don't have
-    # any.
-    if ( $scpidx + 1 < $numscps || @{$scparray[$scpidx]} == 0 ) {
-        $scpdone = $scpidx;
-        if ( @{$scparray[$scpidx]} == 0 ) {
-            $scpdone -= 1;
+    # Adjust split if necessary.
+    #
+    # It should be easy to understand that for datasets where for example
+    # one speaker is 90% of the data, it will be impossible to
+    # create equally sized splits.
+    #
+    # Not only that, but because speakers must go into the same split,
+    # one split will be much larger than average, resulting in other splits
+    # getting no speakers (and no data) at all. The below code tries
+    # to fix that situation in a general way so that there should be some
+    # data in each split.
+
+    # Indices which were already used and should be skipped
+    my %argmax_used;
+    for($i = 0; $i < int($numscps/2); $i++) {
+        # Iterate through scps and find difference between actual and target duration
+        $split_with_zero_exists = 0;
+        my @diffs;
+        for($j = 0; $j < $numscps; $j++) {
+            $surplus = $scp2dur[$j] - $splitdur;
+            push @diffs, $surplus;
+            $cnt = @{$scparray[$scpidx]};
+            if ($cnt == 0) {
+                $split_with_zero_exists = 1;
+            }
         }
-        for(; $scpidx < $numscps; $scpidx++) {
-            $i = 0;
-            for(; $i < $scpdone; $i++) {
-                $numspk = @{$scparray[$i]};
+
+        # Find min and max of surpluses.
+        $min = 1.0;
+        $max = -1.0;
+        $argmin = 0;
+        $argmax = 0;
+        for($j = 0; $j < $numscps; $j++) {
+            $surplus = $diffs[$j];
+            if ($surplus < $min) {
+                $min = $surplus;
+                $argmin = $j;
+            }
+            if ($surplus > $max && !exists($argmax_used{$j})) {
+                $numspk = @{$scparray[$j]};
                 if ($numspk > 1) {
-                    last;
+                    $max = $surplus;
+                    $argmax = $j;
                 }
             }
-            $spk = pop @{$scparray[$i]};
-            $scpcount[$i] -= $spk_count{$spk};
+        }
+
+        # Difference smaller than this is considered okay.
+        # +10 for weird cases of tiny datasets.
+        $min_surplus = $splitdur / 10.0 + 10.0;
+        if ($min > -$min_surplus && $max < $min_surplus && !$split_with_zero_exists) {
+            last;
+        }
+
+        $numspk = @{$scparray[$argmax]};
+        # Find speakers to move
+        for($j = 0; $j < $numspk;) {
+            my $s = $scparray[$argmax][$j];
+            $d = $spk2dur{$s};
+            # 100.0 to allow for some slack
+            if ($d < $max && $min + $d < 100.0) {
+                splice @{$scparray[$argmax]}, $j, 1;
+                $scpcount[$argmax] -= $spk_count{$s};
+                $scp2dur[$argmax] -= $d;
+                $max -= $d;
+                $numspk--;
 
-            push @{$scparray[$scpidx]}, $spk;
-            $scpcount[$scpidx] += $spk_count{$spk};
+                push @{$scparray[$argmin]}, $s;
+                $min += $d;
+                $scp2dur[$argmin] += $d;
+                $scpcount[$argmin] += $spk_count{$s};
+            } else {
+                $j++;
+            }
+            if ($j >= $numspk - 1) {
+                # Reached the end, should not use this argmax again
+                $argmax_used{$argmax} = 1;
+            }
+            if (($max < $min_surplus && $min > -$min_surplus) || $numspk == 1) {
+                last;
+            }
         }
     }
 

From 6f4ace12e202f5ded3548474d7ed3536ee97ef85 Mon Sep 17 00:00:00 2001
From: Rudolf A Braun <rab014@gmail.com>
Date: Mon, 11 Nov 2019 14:50:29 +0000
Subject: [PATCH 2/3] [egs] simplifying split_scp.pl, will die with error
 message when there is an imbalance in resulting split.

---
 egs/wsj/s5/utils/split_data.sh |  15 ++++-
 egs/wsj/s5/utils/split_scp.pl  | 119 ++++++++++-----------------------
 2 files changed, 49 insertions(+), 85 deletions(-)

diff --git a/egs/wsj/s5/utils/split_data.sh b/egs/wsj/s5/utils/split_data.sh
index a3105351a32..75577bd8d89 100755
--- a/egs/wsj/s5/utils/split_data.sh
+++ b/egs/wsj/s5/utils/split_data.sh
@@ -16,13 +16,18 @@
 # limitations under the License.
 
 split_per_spk=true
+allow_uneven_split=false
 if [ "$1" == "--per-utt" ]; then
   split_per_spk=false
   shift
 fi
+if [ "$1" == "--allow-uneven-split" ]; then
+    allow_uneven_split=true
+    shift
+fi
 
 if [ $# != 2 ]; then
-  echo "Usage: $0 [--per-utt] <data-dir> <num-to-split>"
+  echo "Usage: $0 [--per-utt] [--allow-uneven-split] <data-dir> <num-to-split>"
   echo "E.g.: $0 data/train 50"
   echo "It creates its output in e.g. data/train/split50/{1,2,3,...50}, or if the "
   echo "--per-utt option was given, in e.g. data/train/split50utt/{1,2,3,...50}."
@@ -30,6 +35,7 @@ if [ $# != 2 ]; then
   echo "This script will not split the data-dir if it detects that the output is newer than the input."
   echo "By default it splits per speaker (so each speaker is in only one split dir),"
   echo "but with the --per-utt option it will ignore the speaker information while splitting."
+  echo "To avoid crash caused by splitting imbalanced data use --allow-uneven-split"
   exit 1
 fi
 
@@ -118,7 +124,12 @@ fi
 which lockfile >&/dev/null && lockfile -l 60 $data/.split_lock
 trap 'rm -f $data/.split_lock' EXIT HUP INT PIPE TERM
 
-utils/split_scp.pl $utt2spk_opt $utt2dur_opt $data/utt2spk $utt2spks || exit 1
+even_split_opt=""
+if $allow_uneven_split; then
+    even_split_opt="--allow-uneven-split"
+fi
+
+utils/split_scp.pl $even_split_opt $utt2spk_opt $utt2dur_opt $data/utt2spk $utt2spks || exit 1
 
 for n in `seq $numsplit`; do
   dsn=$data/split${numsplit}${utt}/$n
diff --git a/egs/wsj/s5/utils/split_scp.pl b/egs/wsj/s5/utils/split_scp.pl
index 433cc5676dc..68098440262 100755
--- a/egs/wsj/s5/utils/split_scp.pl
+++ b/egs/wsj/s5/utils/split_scp.pl
@@ -30,6 +30,11 @@
 # this case, if there are more chunks than speakers (and in some other
 # circumstances), some of the resulting chunks will be empty and it will print
 # an error message and exit with nonzero status.
+# With the --utt2dur (and --utt2spk) option it will try to create chunks of
+# equal total duration. This can cause issues when there is a severe imbalance
+# in the data (extreme example: 90% of the data is one speaker), in which case
+# the script will stop with an error message. This behaviour can be overridden
+# with --allow-uneven-splits.
 # You will normally call this like:
 # split_scp.pl scp scp.1 scp.2 scp.3 ...
 # or
@@ -49,6 +54,7 @@
 $utt2spk_file = "";
 $utt2dur_file = "";
 $one_based = 0;
+$allow_uneven_splits = 0;
 
 for ($x = 1; $x <= 3 && @ARGV > 0; $x++) {
     if ($ARGV[0] eq "-j") {
@@ -70,6 +76,10 @@
         $one_based = 1;
         shift @ARGV;
     }
+    if ($ARGV[0] eq '--allow-uneven-splits') {
+        $allow_uneven_splits = 1;
+        shift @ARGV;
+    }
 }
 
 if ($num_jobs != 0 && ($num_jobs < 0 || $job_id - $one_based < 0 ||
@@ -84,8 +94,8 @@
 
 if(($num_jobs == 0 && @ARGV < 2) || ($num_jobs > 0 && (@ARGV < 1 || @ARGV > 2))) {
     die
-"Usage: split_scp.pl [--utt2spk=<utt2spk_file>] [--utt2dur=<utt2dur_file>] in.scp out1.scp out2.scp ...
-   or: split_scp.pl -j num-jobs job-id [--one-based] [--utt2spk=<utt2spk_file>] [--utt2dur=<utt2dur_file>] in.scp [out.scp]
+"Usage: split_scp.pl [--allow-uneven-splits] [--utt2spk=<utt2spk_file>] [--utt2dur=<utt2dur_file>] in.scp out1.scp out2.scp ...
+   or: split_scp.pl -j num-jobs job-id [--allow-uneven-splits] [--one-based] [--utt2spk=<utt2spk_file>] [--utt2dur=<utt2dur_file>] in.scp [out.scp]
  ... where 0 <= job-id < num-jobs, or 1 <= job-id <- num-jobs if --one-based.\n";
 }
 
@@ -158,99 +168,42 @@
     $splitdur = $dursum / $numscps;
     $dursum = 0.0;
     $scpidx = 0;
+    $dursum_current = 0.0;
     for my $spk (sort (keys %spk2utt)) {
         $scpcount[$scpidx] += $spk_count{$spk};
         push @{$scparray[$scpidx]}, $spk;
         $dur = $spk2dur{$spk};
         $dursum += $dur;
-        if ( $dursum >= $splitdur ) {
-            $scp2dur[$scpidx] = $dursum;
+        $dursum_current += $dur;
+        if ($dursum >= $splitdur * ($scpidx + 1) && $dursum_current > 10.0) {
+            $scp2dur[$scpidx] = $dursum_current;
             $scpidx += 1;
-            $dursum = 0.0;
-        }
-    }
-
-    # Adjust split if necessary.
-    #
-    # It should be easy to understand that for datasets where for example
-    # one speaker is 90% of the data, it will be impossible to
-    # create equally sized splits.
-    #
-    # Not only that, but because speakers must go into the same split,
-    # one split will be much larger than average, resulting in other splits
-    # getting no speakers (and no data) at all. The below code tries
-    # to fix that situation in a general way so that there should be some
-    # data in each split.
-
-    # Indices which were already used and should be skipped
-    my %argmax_used;
-    for($i = 0; $i < int($numscps/2); $i++) {
-        # Iterate through scps and find difference between actual and target duration
-        $split_with_zero_exists = 0;
-        my @diffs;
-        for($j = 0; $j < $numscps; $j++) {
-            $surplus = $scp2dur[$j] - $splitdur;
-            push @diffs, $surplus;
-            $cnt = @{$scparray[$scpidx]};
-            if ($cnt == 0) {
-                $split_with_zero_exists = 1;
+            $dursum_current = 0.0;
+            if ($scpidx >= $numscps) {
+                last;
             }
         }
+    }
 
-        # Find min and max of surpluses.
-        $min = 1.0;
-        $max = -1.0;
-        $argmin = 0;
-        $argmax = 0;
-        for($j = 0; $j < $numscps; $j++) {
-            $surplus = $diffs[$j];
-            if ($surplus < $min) {
-                $min = $surplus;
-                $argmin = $j;
-            }
-            if ($surplus > $max && !exists($argmax_used{$j})) {
-                $numspk = @{$scparray[$j]};
-                if ($numspk > 1) {
-                    $max = $surplus;
-                    $argmax = $j;
-                }
-            }
+    $smallest_dur = $splitdur;
+    $largest_dur = $splitdur;
+    for($scpidx = 0; $scpidx < $numscps; $scpidx++) {
+        $scpdur = $scp2dur[$scpidx];
+        if ($scpdur > $largest_dur) {
+            $largest_dur = $scpdur;
         }
-
-        # Difference smaller than this is considered okay.
-        # +10 for weird cases of tiny datasets.
-        $min_surplus = $splitdur / 10.0 + 10.0;
-        if ($min > -$min_surplus && $max < $min_surplus && !$split_with_zero_exists) {
-            last;
+        if ($scpdur < $smallest_dur) {
+            $smallest_dur = $scpdur;
         }
+    }
 
-        $numspk = @{$scparray[$argmax]};
-        # Find speakers to move
-        for($j = 0; $j < $numspk;) {
-            my $s = $scparray[$argmax][$j];
-            $d = $spk2dur{$s};
-            # 100.0 to allow for some slack
-            if ($d < $max && $min + $d < 100.0) {
-                splice @{$scparray[$argmax]}, $j, 1;
-                $scpcount[$argmax] -= $spk_count{$s};
-                $scp2dur[$argmax] -= $d;
-                $max -= $d;
-                $numspk--;
-
-                push @{$scparray[$argmin]}, $s;
-                $min += $d;
-                $scp2dur[$argmin] += $d;
-                $scpcount[$argmin] += $spk_count{$s};
-            } else {
-                $j++;
-            }
-            if ($j >= $numspk - 1) {
-                # Reached the end, should not use this argmax again
-                $argmax_used{$argmax} = 1;
-            }
-            if (($max < $min_surplus && $min > -$min_surplus) || $numspk == 1) {
-                last;
-            }
+    if ($allow_uneven_splits != 1) {
+        if (($smallest_dur < $largest_dur / 2 && $largest_dur > 3600) ||
+            $smallest_dur == 0.0) {
+            die "Trying to split data while taking duration into account leads to a " .
+                "severe imbalance in splits. This happens when there is a lot more data " .
+                "for some speakers than for others.\n" .
+                "You should use utils/data/modify_speaker_duration.sh to fix that.\n"
         }
     }
 

From 34deb91dbddd3231670e03a62291a39841004d33 Mon Sep 17 00:00:00 2001
From: Rudolf A Braun <rab014@gmail.com>
Date: Tue, 12 Nov 2019 21:24:45 +0000
Subject: [PATCH 3/3] [egs] Minor fixes to split_scp.pl

---
 egs/wsj/s5/utils/split_scp.pl | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/egs/wsj/s5/utils/split_scp.pl b/egs/wsj/s5/utils/split_scp.pl
index 68098440262..a25dbf55a5d 100755
--- a/egs/wsj/s5/utils/split_scp.pl
+++ b/egs/wsj/s5/utils/split_scp.pl
@@ -54,9 +54,9 @@
 $utt2spk_file = "";
 $utt2dur_file = "";
 $one_based = 0;
-$allow_uneven_splits = 0;
+$allow_uneven_split = 0;
 
-for ($x = 1; $x <= 3 && @ARGV > 0; $x++) {
+for ($x = 1; $x <= 4 && @ARGV > 0; $x++) {
     if ($ARGV[0] eq "-j") {
         shift @ARGV;
         $num_jobs = shift @ARGV;
@@ -76,8 +76,8 @@
         $one_based = 1;
         shift @ARGV;
     }
-    if ($ARGV[0] eq '--allow-uneven-splits') {
-        $allow_uneven_splits = 1;
+    if ($ARGV[0] eq '--allow-uneven-split') {
+        $allow_uneven_split = 1;
         shift @ARGV;
     }
 }
@@ -197,7 +197,7 @@
         }
     }
 
-    if ($allow_uneven_splits != 1) {
+    if ($allow_uneven_split != 1) {
         if (($smallest_dur < $largest_dur / 2 && $largest_dur > 3600) ||
             $smallest_dur == 0.0) {
             die "Trying to split data while taking duration into account leads to a " .