From 42be01dabf568e158bba20a4871e03795b1fc82e Mon Sep 17 00:00:00 2001 From: Alexander Chalk Date: Wed, 19 Jun 2019 08:21:42 -0400 Subject: [PATCH 01/11] Add fastText to CNN text classification examples --- .../examples/cnn-text-classification/README.md | 13 +++++++++++++ .../src/cnn_text_classification/data_helper.clj | 8 ++++++++ 2 files changed, 21 insertions(+) diff --git a/contrib/clojure-package/examples/cnn-text-classification/README.md b/contrib/clojure-package/examples/cnn-text-classification/README.md index f2ed939bee16..191bf6fe029e 100644 --- a/contrib/clojure-package/examples/cnn-text-classification/README.md +++ b/contrib/clojure-package/examples/cnn-text-classification/README.md @@ -49,6 +49,19 @@ and then run - `lein uberjar` - `java -Xms1024m -Xmx2048m -jar target/cnn-text-classification-0.1.0-SNAPSHOT-standalone.jar` +## Usage with fastText + +Using fastText instead of glove is fairly straightforward, as the pretrained embedding format is very similar. + +Download the wiki news 300d 1M pre-trained word vectors from the fastText [site](https://fasttext.cc/docs/en/english-vectors.html). + +Unzip the word vectors and place them in the `data/fastText` directory. + +Then you can run training on a subset of examples through the repl using: +``` +(train-convnet {:embedding-size 300 :batch-size 100 :test-size 100 :num-epoch 10 :max-examples 1000 :pretrained-embedding :fastText}) +``` + ## Usage with word2vec You can also use word2vec embeddings in order to train the text classification model. diff --git a/contrib/clojure-package/examples/cnn-text-classification/src/cnn_text_classification/data_helper.clj b/contrib/clojure-package/examples/cnn-text-classification/src/cnn_text_classification/data_helper.clj index 82ba13087a37..4c5a3d84699d 100644 --- a/contrib/clojure-package/examples/cnn-text-classification/src/cnn_text_classification/data_helper.clj +++ b/contrib/clojure-package/examples/cnn-text-classification/src/cnn_text_classification/data_helper.clj @@ -33,6 +33,8 @@ [embedding-size] (format "data/glove/glove.6B.%dd.txt" embedding-size)) +(def fasttext-file-path "data/fastText/wiki-news-300d-1M.vec") + (defn r-string "Reads a string from the given DataInputStream `dis` until a space or newline is reached." [dis] @@ -100,6 +102,10 @@ (println "Loading the glove pre-trained word embeddings from " glove-file-path) (into {} (read-text-embedding-pairs (io/reader glove-file-path)))) +(defn load-fasttext [fasttext-file-path] + (println "Loading the fastText pre-trained word embeddings from " fasttext-file-path) + (into {} (read-text-embedding-pairs (io/reader fasttext-file-path)))) + (defn clean-str [s] (-> s (string/replace #"^A-Za-z0-9(),!?'`]" " ") @@ -190,6 +196,8 @@ vocab-embeddings (case pretrained-embedding :glove (->> (load-glove (glove-file-path embedding-size)) (build-vocab-embeddings vocab embedding-size)) + :fastText (->> (load-fasttext fasttext-file-path) + (build-vocab-embeddings vocab embedding-size)) :word2vec (->> (load-word2vec-model w2v-file-path embedding-size {:vocab vocab}) (:word2vec) (build-vocab-embeddings vocab embedding-size)) From e4beba8ca2322ec2757d17a81de601f2d8fe6c73 Mon Sep 17 00:00:00 2001 From: Alexander Chalk Date: Sun, 23 Jun 2019 12:30:37 -0400 Subject: [PATCH 02/11] Update repl running instructions --- .../examples/cnn-text-classification/README.md | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/contrib/clojure-package/examples/cnn-text-classification/README.md b/contrib/clojure-package/examples/cnn-text-classification/README.md index 191bf6fe029e..ac0b2d579b44 100644 --- a/contrib/clojure-package/examples/cnn-text-classification/README.md +++ b/contrib/clojure-package/examples/cnn-text-classification/README.md @@ -29,8 +29,7 @@ You also must download the glove word embeddings. The suggested one to use is th ## Usage You can run through the repl with -`(train-convnet {:embedding-size 50 :batch-size 100 :test-size 100 :num-epoch 10 :max-examples 1000 :pretrained-embedding :glove})` - +`(train-convnet {:devs [(context/cpu 0)] :embedding-size 50 :batch-size 100 :test-size 100 :num-epoch 10 :max-examples 1000 :pretrained-embedding :glove})` or `JVM_OPTS="-Xmx1g" lein run` (cpu) @@ -59,7 +58,7 @@ Unzip the word vectors and place them in the `data/fastText` directory. Then you can run training on a subset of examples through the repl using: ``` -(train-convnet {:embedding-size 300 :batch-size 100 :test-size 100 :num-epoch 10 :max-examples 1000 :pretrained-embedding :fastText}) +(train-convnet {:devs [(context/cpu 0)] :embedding-size 300 :batch-size 100 :test-size 100 :num-epoch 10 :max-examples 1000 :pretrained-embedding :fastText}) ``` ## Usage with word2vec @@ -71,7 +70,7 @@ you'll need to unzip them and place them in the `contrib/clojure-package/data` d Then you can run training on a subset of examples through the repl using: ``` -(train-convnet {:embedding-size 300 :batch-size 100 :test-size 100 :num-epoch 10 :max-examples 1000 :pretrained-embedding :word2vec}) +(train-convnet {:devs [(context/cpu 0)] :embedding-size 300 :batch-size 100 :test-size 100 :num-epoch 10 :max-examples 1000 :pretrained-embedding :word2vec}) ``` Note that loading word2vec embeddings consumes memory and takes some time. @@ -79,7 +78,7 @@ You can also train them using `JVM_OPTS="-Xmx8g" lein run` once you've modified the parameters to `train-convnet` (see above) in `src/cnn_text_classification/classifier.clj`. In order to run training with word2vec on the complete data set, you will need to run: ``` -(train-convnet {:embedding-size 300 :batch-size 100 :test-size 1000 :num-epoch 10 :pretrained-embedding :word2vec}) +(train-convnet {:devs [(context/cpu 0)] :embedding-size 300 :batch-size 100 :test-size 1000 :num-epoch 10 :pretrained-embedding :word2vec}) ``` You should be able to achieve an accuracy of `~0.78` using the parameters above. From 5f421520a77ff4c26473d78b5f8328b5f690867f Mon Sep 17 00:00:00 2001 From: Alexander Chalk Date: Sun, 23 Jun 2019 19:45:41 -0400 Subject: [PATCH 03/11] Complete solution with OOM workaround --- .../examples/cnn-text-classification/README.md | 3 +++ .../src/cnn_text_classification/data_helper.clj | 16 +++++++++------- 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/contrib/clojure-package/examples/cnn-text-classification/README.md b/contrib/clojure-package/examples/cnn-text-classification/README.md index ac0b2d579b44..4d97f69fd183 100644 --- a/contrib/clojure-package/examples/cnn-text-classification/README.md +++ b/contrib/clojure-package/examples/cnn-text-classification/README.md @@ -56,6 +56,9 @@ Download the wiki news 300d 1M pre-trained word vectors from the fastText [site] Unzip the word vectors and place them in the `data/fastText` directory. +To prevent OOM errors, we will not use all the trained embeddings; instead we'll take +the first 10%: `head -n 100000 wiki-news-300d-1M.vec > wiki-news-300d-100K.vec`. + Then you can run training on a subset of examples through the repl using: ``` (train-convnet {:devs [(context/cpu 0)] :embedding-size 300 :batch-size 100 :test-size 100 :num-epoch 10 :max-examples 1000 :pretrained-embedding :fastText}) diff --git a/contrib/clojure-package/examples/cnn-text-classification/src/cnn_text_classification/data_helper.clj b/contrib/clojure-package/examples/cnn-text-classification/src/cnn_text_classification/data_helper.clj index 4c5a3d84699d..b0dcf9cdc7c7 100644 --- a/contrib/clojure-package/examples/cnn-text-classification/src/cnn_text_classification/data_helper.clj +++ b/contrib/clojure-package/examples/cnn-text-classification/src/cnn_text_classification/data_helper.clj @@ -33,7 +33,7 @@ [embedding-size] (format "data/glove/glove.6B.%dd.txt" embedding-size)) -(def fasttext-file-path "data/fastText/wiki-news-300d-1M.vec") +(def fasttext-file-path "data/fastText/wiki-news-300d-100K.vec") (defn r-string "Reads a string from the given DataInputStream `dis` until a space or newline is reached." @@ -77,8 +77,8 @@ _ (println "Processing with " {:dim dim :word-size word-size} " loading max vectors " max-vectors) _ (if (not= embedding-size dim) (throw (ex-info "Mismatch in embedding size" - {:input-embedding-size embedding-size - :word2vec-embedding-size dim}))) + {:input-embedding-size embedding-size + :word2vec-embedding-size dim}))) vectors (load-w2v-vectors dis dim max-vectors) word2vec (if vocab (->> vectors @@ -92,19 +92,21 @@ ([path embedding-size] (load-word2vec-model path embedding-size {:max-vectors 100}))) -(defn read-text-embedding-pairs [rdr] - (for [^String line (line-seq rdr) +(defn read-text-embedding-pairs [pairs] + (for [^String line pairs :let [fields (.split line " ")]] [(aget fields 0) (mapv #(Float/parseFloat ^String %) (rest fields))])) (defn load-glove [glove-file-path] (println "Loading the glove pre-trained word embeddings from " glove-file-path) - (into {} (read-text-embedding-pairs (io/reader glove-file-path)))) + (into {} (read-text-embedding-pairs (line-seq (io/reader glove-file-path))))) + +(def remove-fasttext-metadata rest) (defn load-fasttext [fasttext-file-path] (println "Loading the fastText pre-trained word embeddings from " fasttext-file-path) - (into {} (read-text-embedding-pairs (io/reader fasttext-file-path)))) + (into {} (read-text-embedding-pairs (remove-fasttext-metadata (line-seq (io/reader fasttext-file-path)))))) (defn clean-str [s] (-> s From add3ce9d6ec6dbae2e239af0e14793be7370f179 Mon Sep 17 00:00:00 2001 From: Alexander Chalk Date: Mon, 24 Jun 2019 14:53:53 -0400 Subject: [PATCH 04/11] Complete solution with smaller fastText dataset --- .../examples/cnn-text-classification/README.md | 9 +++------ .../src/cnn_text_classification/data_helper.clj | 2 +- 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/contrib/clojure-package/examples/cnn-text-classification/README.md b/contrib/clojure-package/examples/cnn-text-classification/README.md index 4d97f69fd183..8b754eb70faa 100644 --- a/contrib/clojure-package/examples/cnn-text-classification/README.md +++ b/contrib/clojure-package/examples/cnn-text-classification/README.md @@ -52,12 +52,9 @@ and then run Using fastText instead of glove is fairly straightforward, as the pretrained embedding format is very similar. -Download the wiki news 300d 1M pre-trained word vectors from the fastText [site](https://fasttext.cc/docs/en/english-vectors.html). - -Unzip the word vectors and place them in the `data/fastText` directory. - -To prevent OOM errors, we will not use all the trained embeddings; instead we'll take -the first 10%: `head -n 100000 wiki-news-300d-1M.vec > wiki-news-300d-100K.vec`. +Download the 'Simple English' pretrained wiki word vectors (text) from the fastText +[site](https://fasttext.cc/docs/en/pretrained-vectors.html) and place them in the +`data/fastText` directory. Then you can run training on a subset of examples through the repl using: ``` diff --git a/contrib/clojure-package/examples/cnn-text-classification/src/cnn_text_classification/data_helper.clj b/contrib/clojure-package/examples/cnn-text-classification/src/cnn_text_classification/data_helper.clj index b0dcf9cdc7c7..4ef61fcddb40 100644 --- a/contrib/clojure-package/examples/cnn-text-classification/src/cnn_text_classification/data_helper.clj +++ b/contrib/clojure-package/examples/cnn-text-classification/src/cnn_text_classification/data_helper.clj @@ -33,7 +33,7 @@ [embedding-size] (format "data/glove/glove.6B.%dd.txt" embedding-size)) -(def fasttext-file-path "data/fastText/wiki-news-300d-100K.vec") +(def fasttext-file-path "data/fastText/wiki.simple.vec") (defn r-string "Reads a string from the given DataInputStream `dis` until a space or newline is reached." From a8d3bf318514db4c82e293cd919494679b523445 Mon Sep 17 00:00:00 2001 From: Alexander Chalk Date: Mon, 24 Jun 2019 15:04:56 -0400 Subject: [PATCH 05/11] Add approx validation accuracy to readme --- .../clojure-package/examples/cnn-text-classification/README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/contrib/clojure-package/examples/cnn-text-classification/README.md b/contrib/clojure-package/examples/cnn-text-classification/README.md index 8b754eb70faa..9e1d8b89dd03 100644 --- a/contrib/clojure-package/examples/cnn-text-classification/README.md +++ b/contrib/clojure-package/examples/cnn-text-classification/README.md @@ -61,6 +61,8 @@ Then you can run training on a subset of examples through the repl using: (train-convnet {:devs [(context/cpu 0)] :embedding-size 300 :batch-size 100 :test-size 100 :num-epoch 10 :max-examples 1000 :pretrained-embedding :fastText}) ``` +Expect a validation accuracy of `~0.67` with the above parameters. + ## Usage with word2vec You can also use word2vec embeddings in order to train the text classification model. From 6fa3db3b2b2c42b2bc171635528697753e91891d Mon Sep 17 00:00:00 2001 From: Alexander Chalk Date: Fri, 28 Jun 2019 11:29:07 -0400 Subject: [PATCH 06/11] Add threading macro --- .../src/cnn_text_classification/data_helper.clj | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/contrib/clojure-package/examples/cnn-text-classification/src/cnn_text_classification/data_helper.clj b/contrib/clojure-package/examples/cnn-text-classification/src/cnn_text_classification/data_helper.clj index 4ef61fcddb40..034a86697e47 100644 --- a/contrib/clojure-package/examples/cnn-text-classification/src/cnn_text_classification/data_helper.clj +++ b/contrib/clojure-package/examples/cnn-text-classification/src/cnn_text_classification/data_helper.clj @@ -100,13 +100,20 @@ (defn load-glove [glove-file-path] (println "Loading the glove pre-trained word embeddings from " glove-file-path) - (into {} (read-text-embedding-pairs (line-seq (io/reader glove-file-path))))) + (->> (io/reader glove-file-path) + line-seq + read-text-embedding-pairs + (into {}))) (def remove-fasttext-metadata rest) (defn load-fasttext [fasttext-file-path] (println "Loading the fastText pre-trained word embeddings from " fasttext-file-path) - (into {} (read-text-embedding-pairs (remove-fasttext-metadata (line-seq (io/reader fasttext-file-path)))))) + (->> (io/reader fasttext-file-path) + line-seq + remove-fasttext-metadata + read-text-embedding-pairs + (into {}))) (defn clean-str [s] (-> s From 0e94485a2d2d7b4c3b1acf6c4a2d1b93b780a601 Mon Sep 17 00:00:00 2001 From: Alexander Chalk Date: Fri, 28 Jun 2019 11:32:32 -0400 Subject: [PATCH 07/11] Use consistent fasttext casing --- .../src/cnn_text_classification/data_helper.clj | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/contrib/clojure-package/examples/cnn-text-classification/src/cnn_text_classification/data_helper.clj b/contrib/clojure-package/examples/cnn-text-classification/src/cnn_text_classification/data_helper.clj index 034a86697e47..b6824212fb16 100644 --- a/contrib/clojure-package/examples/cnn-text-classification/src/cnn_text_classification/data_helper.clj +++ b/contrib/clojure-package/examples/cnn-text-classification/src/cnn_text_classification/data_helper.clj @@ -33,7 +33,7 @@ [embedding-size] (format "data/glove/glove.6B.%dd.txt" embedding-size)) -(def fasttext-file-path "data/fastText/wiki.simple.vec") +(def fasttext-file-path "data/fasttext/wiki.simple.vec") (defn r-string "Reads a string from the given DataInputStream `dis` until a space or newline is reached." @@ -205,7 +205,7 @@ vocab-embeddings (case pretrained-embedding :glove (->> (load-glove (glove-file-path embedding-size)) (build-vocab-embeddings vocab embedding-size)) - :fastText (->> (load-fasttext fasttext-file-path) + :fasttext (->> (load-fasttext fasttext-file-path) (build-vocab-embeddings vocab embedding-size)) :word2vec (->> (load-word2vec-model w2v-file-path embedding-size {:vocab vocab}) (:word2vec) From 2466e18beabc74f922d26fbf2416d2534731ebd9 Mon Sep 17 00:00:00 2001 From: Alexander Chalk Date: Fri, 28 Jun 2019 11:33:58 -0400 Subject: [PATCH 08/11] Add bangs to io reader functions --- .../src/cnn_text_classification/data_helper.clj | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/contrib/clojure-package/examples/cnn-text-classification/src/cnn_text_classification/data_helper.clj b/contrib/clojure-package/examples/cnn-text-classification/src/cnn_text_classification/data_helper.clj index b6824212fb16..df132c3167cd 100644 --- a/contrib/clojure-package/examples/cnn-text-classification/src/cnn_text_classification/data_helper.clj +++ b/contrib/clojure-package/examples/cnn-text-classification/src/cnn_text_classification/data_helper.clj @@ -64,7 +64,7 @@ vect (mapv (fn [_] (read-float dis)) (range embedding-size))] (cons [word vect] (lazy-seq (load-w2v-vectors dis embedding-size (dec num-vectors))))))) -(defn load-word2vec-model +(defn load-word2vec-model! "Loads the word2vec model stored in a binary format from the given `path`. By default only the first 100 embeddings are loaded." ([path embedding-size opts] @@ -90,7 +90,7 @@ (println "Finished") {:num-embed dim :word2vec word2vec}))) ([path embedding-size] - (load-word2vec-model path embedding-size {:max-vectors 100}))) + (load-word2vec-model! path embedding-size {:max-vectors 100}))) (defn read-text-embedding-pairs [pairs] (for [^String line pairs @@ -98,7 +98,7 @@ [(aget fields 0) (mapv #(Float/parseFloat ^String %) (rest fields))])) -(defn load-glove [glove-file-path] +(defn load-glove! [glove-file-path] (println "Loading the glove pre-trained word embeddings from " glove-file-path) (->> (io/reader glove-file-path) line-seq @@ -107,7 +107,7 @@ (def remove-fasttext-metadata rest) -(defn load-fasttext [fasttext-file-path] +(defn load-fasttext! [fasttext-file-path] (println "Loading the fastText pre-trained word embeddings from " fasttext-file-path) (->> (io/reader fasttext-file-path) line-seq @@ -203,11 +203,11 @@ sentences-padded (pad-sentences sentences) vocab (build-vocab sentences-padded) vocab-embeddings (case pretrained-embedding - :glove (->> (load-glove (glove-file-path embedding-size)) + :glove (->> (load-glove! (glove-file-path embedding-size)) (build-vocab-embeddings vocab embedding-size)) - :fasttext (->> (load-fasttext fasttext-file-path) + :fasttext (->> (load-fasttext! fasttext-file-path) (build-vocab-embeddings vocab embedding-size)) - :word2vec (->> (load-word2vec-model w2v-file-path embedding-size {:vocab vocab}) + :word2vec (->> (load-word2vec-model! w2v-file-path embedding-size {:vocab vocab}) (:word2vec) (build-vocab-embeddings vocab embedding-size)) vocab) From 74715345ebc2235171eb845655d2e76ac2876b91 Mon Sep 17 00:00:00 2001 From: Alexander Chalk Date: Fri, 28 Jun 2019 11:35:06 -0400 Subject: [PATCH 09/11] Reference default context setting in readme --- .../examples/cnn-text-classification/README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/contrib/clojure-package/examples/cnn-text-classification/README.md b/contrib/clojure-package/examples/cnn-text-classification/README.md index 9e1d8b89dd03..aa7640ff3190 100644 --- a/contrib/clojure-package/examples/cnn-text-classification/README.md +++ b/contrib/clojure-package/examples/cnn-text-classification/README.md @@ -29,7 +29,7 @@ You also must download the glove word embeddings. The suggested one to use is th ## Usage You can run through the repl with -`(train-convnet {:devs [(context/cpu 0)] :embedding-size 50 :batch-size 100 :test-size 100 :num-epoch 10 :max-examples 1000 :pretrained-embedding :glove})` +`(train-convnet {:devs [(context/default-context)] :embedding-size 50 :batch-size 100 :test-size 100 :num-epoch 10 :max-examples 1000 :pretrained-embedding :glove})` or `JVM_OPTS="-Xmx1g" lein run` (cpu) @@ -58,7 +58,7 @@ Download the 'Simple English' pretrained wiki word vectors (text) from the fastT Then you can run training on a subset of examples through the repl using: ``` -(train-convnet {:devs [(context/cpu 0)] :embedding-size 300 :batch-size 100 :test-size 100 :num-epoch 10 :max-examples 1000 :pretrained-embedding :fastText}) +(train-convnet {:devs [(context/default-context)] :embedding-size 300 :batch-size 100 :test-size 100 :num-epoch 10 :max-examples 1000 :pretrained-embedding :fastText}) ``` Expect a validation accuracy of `~0.67` with the above parameters. @@ -72,7 +72,7 @@ you'll need to unzip them and place them in the `contrib/clojure-package/data` d Then you can run training on a subset of examples through the repl using: ``` -(train-convnet {:devs [(context/cpu 0)] :embedding-size 300 :batch-size 100 :test-size 100 :num-epoch 10 :max-examples 1000 :pretrained-embedding :word2vec}) +(train-convnet {:devs [(context/default-context)] :embedding-size 300 :batch-size 100 :test-size 100 :num-epoch 10 :max-examples 1000 :pretrained-embedding :word2vec}) ``` Note that loading word2vec embeddings consumes memory and takes some time. @@ -80,7 +80,7 @@ You can also train them using `JVM_OPTS="-Xmx8g" lein run` once you've modified the parameters to `train-convnet` (see above) in `src/cnn_text_classification/classifier.clj`. In order to run training with word2vec on the complete data set, you will need to run: ``` -(train-convnet {:devs [(context/cpu 0)] :embedding-size 300 :batch-size 100 :test-size 1000 :num-epoch 10 :pretrained-embedding :word2vec}) +(train-convnet {:devs [(context/default-context)] :embedding-size 300 :batch-size 100 :test-size 1000 :num-epoch 10 :pretrained-embedding :word2vec}) ``` You should be able to achieve an accuracy of `~0.78` using the parameters above. From 9d89cbca131125aa73110c6987b34a54cece3dc9 Mon Sep 17 00:00:00 2001 From: Alexander Chalk Date: Fri, 28 Jun 2019 11:36:18 -0400 Subject: [PATCH 10/11] Change fasttext references in readme --- .../examples/cnn-text-classification/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/contrib/clojure-package/examples/cnn-text-classification/README.md b/contrib/clojure-package/examples/cnn-text-classification/README.md index aa7640ff3190..cdae5ff0d308 100644 --- a/contrib/clojure-package/examples/cnn-text-classification/README.md +++ b/contrib/clojure-package/examples/cnn-text-classification/README.md @@ -54,11 +54,11 @@ Using fastText instead of glove is fairly straightforward, as the pretrained emb Download the 'Simple English' pretrained wiki word vectors (text) from the fastText [site](https://fasttext.cc/docs/en/pretrained-vectors.html) and place them in the -`data/fastText` directory. +`data/fasttext` directory. Then you can run training on a subset of examples through the repl using: ``` -(train-convnet {:devs [(context/default-context)] :embedding-size 300 :batch-size 100 :test-size 100 :num-epoch 10 :max-examples 1000 :pretrained-embedding :fastText}) +(train-convnet {:devs [(context/default-context)] :embedding-size 300 :batch-size 100 :test-size 100 :num-epoch 10 :max-examples 1000 :pretrained-embedding :fasttext}) ``` Expect a validation accuracy of `~0.67` with the above parameters. From f454f3cae46aa5521dff6bfea739736d93a9db79 Mon Sep 17 00:00:00 2001 From: Alexander Chalk Date: Fri, 28 Jun 2019 11:50:37 -0400 Subject: [PATCH 11/11] Add data fetching shellscript for fasttext --- .../cnn-text-classification/README.md | 2 +- .../get_fasttext_data.sh | 24 +++++++++++++++++++ 2 files changed, 25 insertions(+), 1 deletion(-) create mode 100755 contrib/clojure-package/examples/cnn-text-classification/get_fasttext_data.sh diff --git a/contrib/clojure-package/examples/cnn-text-classification/README.md b/contrib/clojure-package/examples/cnn-text-classification/README.md index cdae5ff0d308..8f8e6200ec7c 100644 --- a/contrib/clojure-package/examples/cnn-text-classification/README.md +++ b/contrib/clojure-package/examples/cnn-text-classification/README.md @@ -54,7 +54,7 @@ Using fastText instead of glove is fairly straightforward, as the pretrained emb Download the 'Simple English' pretrained wiki word vectors (text) from the fastText [site](https://fasttext.cc/docs/en/pretrained-vectors.html) and place them in the -`data/fasttext` directory. +`data/fasttext` directory. Alternatively just run `./get_fasttext_data.sh`. Then you can run training on a subset of examples through the repl using: ``` diff --git a/contrib/clojure-package/examples/cnn-text-classification/get_fasttext_data.sh b/contrib/clojure-package/examples/cnn-text-classification/get_fasttext_data.sh new file mode 100755 index 000000000000..2bfe96659402 --- /dev/null +++ b/contrib/clojure-package/examples/cnn-text-classification/get_fasttext_data.sh @@ -0,0 +1,24 @@ +#!/bin/bash + +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +set -evx + +mkdir -p data/fasttext +cd data/fasttext +wget https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.simple.vec