From 1f7ab70cd4dbb64e16bb6b38840490c2f2259cb0 Mon Sep 17 00:00:00 2001 From: Evgeny Pavlov Date: Thu, 24 Oct 2024 15:39:23 -0700 Subject: [PATCH] Disable bicleaner hard rules completely (#892) --- pipeline/bicleaner/bicleaner.sh | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/pipeline/bicleaner/bicleaner.sh b/pipeline/bicleaner/bicleaner.sh index de075533b..af7920372 100755 --- a/pipeline/bicleaner/bicleaner.sh +++ b/pipeline/bicleaner/bicleaner.sh @@ -49,12 +49,6 @@ else export scol=2 export tcol=1 fi - # disable hard rules for multilingual model - if [ ${model_source_lang} == "xx" ] || [ ${model_target_lang} == "xx" ]; then - export hardrules="--disable_hardrules" - else - export hardrules="" - fi #Export cuda visible devices if empty or not set if [ -z "${CUDA_VISIBLE_DEVICES:-}" ]; then @@ -76,7 +70,7 @@ else # to operate on the CPU very slowly. To guard against this wasting expensive # GPU time, always check that it can find GPUs. python3 -c "import tensorflow; exit(0) if tensorflow.config.list_physical_devices('GPU') else exit(9001)" - bicleaner-ai-classify ${hardrules} --scol ${scol} --tcol ${tcol} - - $1 + bicleaner-ai-classify --disable_hardrules --scol ${scol} --tcol ${tcol} - - $1 } export -f biclean # {%} is a 1-indexed job slot number from GNU parallel. We use that as the 1-indexed offset in CUDA_VISIBLE_ARRAY @@ -86,7 +80,7 @@ else else export BICLEANER_AI_THREADS=${threads} paste <(zstdmt -dc "${corpus_prefix}.${SRC}.zst") <(zstdmt -dc "${corpus_prefix}.${TRG}.zst") | - bicleaner-ai-classify ${hardrules} --scol ${scol} --tcol ${tcol} "${threads}" - - "${pack_dir}"/*.yaml | + bicleaner-ai-classify --disable_hardrules --scol ${scol} --tcol ${tcol} "${threads}" - - "${pack_dir}"/*.yaml | zstdmt >"${output_prefix}.scored.zst" fi