Skip to content

Commit

Permalink
- Add index to sentence_chunk token_ids
Browse files Browse the repository at this point in the history
- Add some code to handle small chunk_size edge cases
  • Loading branch information
charlesdeb committed Apr 22, 2021
1 parent 89b394a commit 1ff44cb
Show file tree
Hide file tree
Showing 4 changed files with 25 additions and 3 deletions.
9 changes: 8 additions & 1 deletion app/models/sentence_chunk.rb
Original file line number Diff line number Diff line change
Expand Up @@ -98,11 +98,18 @@ def self.generate(params = {}) # rubocop:disable Metrics/MethodLength

chunk_size, token_size, text_sample_id = extract_generate_params(params)

text_sample_text = TextSample.find(text_sample_id).text
text_sample_token_length = Token.split_into_tokens(text_sample_text).length

output = []

if chunk_size == 'all'
CHUNK_SIZE_RANGE.each do |current_chunk_size|
output.push(generate_text(current_chunk_size, token_size, text_sample_id))
# handle edge case where text sample has fewer tokens than the chunk size
if current_chunk_size <= text_sample_token_length
output.push(generate_text(current_chunk_size, token_size,
text_sample_id))
end
end
else
output.push(generate_text(chunk_size, token_size, text_sample_id))
Expand Down
11 changes: 10 additions & 1 deletion app/models/word_chunk.rb
Original file line number Diff line number Diff line change
Expand Up @@ -94,11 +94,17 @@ def self.generate(params = {}) # rubocop:disable Metrics/MethodLength

chunk_size, output_size, text_sample_id = extract_generate_params(params)

text_sample_length = TextSample.find(text_sample_id).text.length

output = []

if chunk_size == 'all'
CHUNK_SIZE_RANGE.each do |current_chunk_size|
output.push(generate_text(current_chunk_size, output_size, text_sample_id))
# handle edge case where text sample is shorter than the chunk size
if current_chunk_size <= text_sample_length
output.push(generate_text(current_chunk_size, output_size,
text_sample_id))
end
end
else
output.push(generate_text(chunk_size, output_size, text_sample_id))
Expand Down Expand Up @@ -132,6 +138,9 @@ def self.generate_text(chunk_size, output_size, text_sample_id)
output = word_chunk.text
while output.size < output_size
word_chunk = word_chunk.choose_next_word_chunk
# if we couldn't get a next chunk, then just leave it there
break unless word_chunk

next_character = word_chunk.text[-1]
output += next_character
end
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Migration that indexes the +token_ids+ column of the +sentence_chunks+
# table using the 'gin' index method.
class AddTokenIdsIndexToSentenceChunks < ActiveRecord::Migration[6.0]
  # Declares the schema change: a single index on sentence_chunks.token_ids.
  def change
    add_index(:sentence_chunks, :token_ids, using: 'gin')
  end
end
3 changes: 2 additions & 1 deletion db/schema.rb
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
#
# It's strongly recommended that you check this file into your version control system.

ActiveRecord::Schema.define(version: 2021_04_17_225204) do
ActiveRecord::Schema.define(version: 2021_04_22_214651) do

# These are extensions that must be enabled in order to support this database
enable_extension "plpgsql"
Expand All @@ -24,6 +24,7 @@
t.integer "token_ids", array: true
t.index ["size"], name: "index_sentence_chunks_on_size"
t.index ["text_sample_id"], name: "index_sentence_chunks_on_text_sample_id"
t.index ["token_ids"], name: "index_sentence_chunks_on_token_ids", using: :gin
end

create_table "settings", force: :cascade do |t|
Expand Down

0 comments on commit 1ff44cb

Please sign in to comment.