- Add index to sentence_chunk token_ids

- Add some code to handle small chunk_size edge cases
charlesdeb · Apr 22, 2021 · 1ff44cb · 1ff44cb
1 parent 89b394a
commit 1ff44cb
Show file tree

Hide file tree

Showing 4 changed files with 25 additions and 3 deletions.
diff --git a/app/models/sentence_chunk.rb b/app/models/sentence_chunk.rb
@@ -98,11 +98,18 @@ def self.generate(params = {}) # rubocop:disable Metrics/MethodLength
 
     chunk_size, token_size, text_sample_id = extract_generate_params(params)
 
+    text_sample_text = TextSample.find(text_sample_id).text
+    text_sample_token_length = Token.split_into_tokens(text_sample_text).length
+
     output = []
 
     if chunk_size == 'all'
       CHUNK_SIZE_RANGE.each do |current_chunk_size|
-        output.push(generate_text(current_chunk_size, token_size, text_sample_id))
+        # handle edge case where text sample has less tokens than the chunk size
+        if current_chunk_size <= text_sample_token_length
+          output.push(generate_text(current_chunk_size, token_size,
+                                    text_sample_id))
+        end
       end
     else
       output.push(generate_text(chunk_size, token_size, text_sample_id))

diff --git a/app/models/word_chunk.rb b/app/models/word_chunk.rb
@@ -94,11 +94,17 @@ def self.generate(params = {}) # rubocop:disable Metrics/MethodLength
 
     chunk_size, output_size, text_sample_id = extract_generate_params(params)
 
+    text_sample_length = TextSample.find(text_sample_id).text.length
+
     output = []
 
     if chunk_size == 'all'
       CHUNK_SIZE_RANGE.each do |current_chunk_size|
-        output.push(generate_text(current_chunk_size, output_size, text_sample_id))
+        # handle edge case where text sample is shorter than the chunk size
+        if current_chunk_size <= text_sample_length
+          output.push(generate_text(current_chunk_size, output_size,
+                                    text_sample_id))
+        end
       end
     else
       output.push(generate_text(chunk_size, output_size, text_sample_id))
@@ -132,6 +138,9 @@ def self.generate_text(chunk_size, output_size, text_sample_id)
     output = word_chunk.text
     while output.size < output_size
       word_chunk = word_chunk.choose_next_word_chunk
+      # if we couldn't get a next chunk, then just leave it there
+      break unless word_chunk
+
       next_character = word_chunk.text[-1]
       output += next_character
     end

diff --git a/db/migrate/20210422214651_add_token_ids_index_to_sentence_chunks.rb b/db/migrate/20210422214651_add_token_ids_index_to_sentence_chunks.rb
@@ -0,0 +1,5 @@
+class AddTokenIdsIndexToSentenceChunks < ActiveRecord::Migration[6.0]
+  def change
+    add_index :sentence_chunks, :token_ids, using: 'gin'
+  end
+end
diff --git a/db/schema.rb b/db/schema.rb
@@ -10,7 +10,7 @@
 #
 # It's strongly recommended that you check this file into your version control system.
 
-ActiveRecord::Schema.define(version: 2021_04_17_225204) do
+ActiveRecord::Schema.define(version: 2021_04_22_214651) do
 
   # These are extensions that must be enabled in order to support this database
   enable_extension "plpgsql"
@@ -24,6 +24,7 @@
     t.integer "token_ids", array: true
     t.index ["size"], name: "index_sentence_chunks_on_size"
     t.index ["text_sample_id"], name: "index_sentence_chunks_on_text_sample_id"
+    t.index ["token_ids"], name: "index_sentence_chunks_on_token_ids", using: :gin
   end
 
   create_table "settings", force: :cascade do |t|