- generate method for SentenceChunk

- and methods that support it - add token_size to settings - convert token_ids column from serialisable to array - all tests
charlesdeb · Apr 20, 2021 · 177c88b · 177c88b
1 parent 94f70e7
commit 177c88b
Show file tree

Hide file tree

Showing 7 changed files with 452 additions and 283 deletions.
diff --git a/app/models/sentence_chunk.rb b/app/models/sentence_chunk.rb
@@ -5,14 +5,14 @@
 
 # for a Sentence Chunk, a 'chunk' is an ordered collection of words, spaces and
 # punctuation (all called tokens)
-class SentenceChunk < ApplicationRecord
+class SentenceChunk < ApplicationRecord # rubocop:disable Metrics/ClassLength
   belongs_to :text_sample
 
-  validates :token_ids, presence: true # may normalise this later, or convert to a serializable column type
+  validates :token_ids, presence: true # may normalise this later
   validates :size, presence: true
   validates :count, presence: true
 
-  serialize(:token_ids, Array)
+  # serialize(:token_ids, Array)
 
   CHUNK_SIZE_RANGE = (2..8).freeze
 
@@ -66,7 +66,7 @@ def self.save_chunks(chunks_hash, text_sample_id, chunk_size, save_strategy = :i
     end
   end
 
-  def self.save_chunks_by_insert_all(  # rubocop:disable Metrics/MethodLength
+  def self.save_chunks_by_insert_all( # rubocop:disable Metrics/MethodLength
     chunks_hash, text_sample_id, chunk_size
   )
     current_time = DateTime.now
@@ -84,4 +84,114 @@ def self.save_chunks_by_insert_all(  # rubocop:disable Metrics/MethodLength
     end
     SentenceChunk.insert_all! import_array
   end
+
+  # Entry point for generating text using the sentence chunk strategy.
+  #
+  # @param params [Hash] parameters to generate with
+  # @option params [Integer, String] :chunk_size chunk size to use for
+  #   generation; when the resolved value is 'all', text is generated once for
+  #   every size in CHUNK_SIZE_RANGE
+  # @option params [Integer] :token_size number of tokens to generate
+  # @option params [Integer] :text_sample_id TextSample to use as the model
+  # @return [Hash] a hash with a :message when no chunks exist for the text
+  #   sample, otherwise a hash whose :output is an array holding one
+  #   generate_text result per chunk size used
+  def self.generate(params = {})
+    unless chunks_built_for? params[:text_sample_id]
+      return { message: 'Sentence chunks have not been built for this text sample' }
+    end
+
+    chunk_size, token_size, text_sample_id = extract_generate_params(params)
+    chunk_sizes = chunk_size == 'all' ? CHUNK_SIZE_RANGE : [chunk_size]
+
+    generated = chunk_sizes.map do |current_chunk_size|
+      generate_text(current_chunk_size, token_size, text_sample_id)
+    end
+
+    { output: generated }
+  end
+
+  # Resolves the individual generation parameters, substituting the configured
+  # defaults for any value that is missing or converts to zero.
+  #
+  # Conversion uses to_i, so nil and non-numeric strings (for example 'all')
+  # count as zero and fall back to the corresponding Setting value.
+  #
+  # @param (see ::generate)
+  # @return [Array] chunk_size, token_size and text_sample_id, in that order
+  def self.extract_generate_params(params = {})
+    requested_chunk_size = params[:chunk_size].to_i
+    chunk_size = requested_chunk_size.zero? ? Setting.chunk_size : requested_chunk_size
+
+    requested_token_size = params[:token_size].to_i
+    token_size = requested_token_size.zero? ? Setting.token_size : requested_token_size
+
+    [chunk_size, token_size, params[:text_sample_id]]
+  end
+
+  # Whether any sentence chunks are stored for the given text sample.
+  #
+  # @param text_sample_id [Integer] id of the TextSample to check
+  # @return [Boolean] true when at least one SentenceChunk row references it
+  def self.chunks_built_for?(text_sample_id)
+    # exists? asks the database for a yes/no answer instead of loading and
+    # instantiating a whole record only to test it for nil.
+    SentenceChunk.exists?(text_sample_id: text_sample_id)
+  end
+
+  # Generates text for a single chunk size by starting from a random chunk and
+  # repeatedly appending the last token of a chunk chosen to follow it.
+  #
+  # @param chunk_size [Integer] size of the chunks to generate from
+  # @param token_size [Integer] number of tokens to aim for
+  # @param text_sample_id [Integer] TextSample to use as the model
+  # @return [Hash] the generated :text and the :chunk_size used; the text is
+  #   empty when no chunk of that size exists, and has fewer than token_size
+  #   tokens when generation reaches a chunk that nothing follows
+  def self.generate_text(chunk_size, token_size, text_sample_id)
+    chunk = choose_starting_chunk(text_sample_id, chunk_size)
+    # Chunks may exist for the text sample without any of this particular size
+    return { text: '', chunk_size: chunk_size } if chunk.nil?
+
+    # Work on a copy so appending does not modify the chunk's own attribute
+    output_token_ids = chunk.token_ids.dup
+    while output_token_ids.size < token_size
+      chunk = chunk.choose_next_chunk
+      # Dead end: no stored chunk begins with the tail of the current one
+      break if chunk.nil?
+
+      output_token_ids << chunk.token_ids.last
+    end
+
+    output = Token.replace_token_ids_with_tokens(output_token_ids).join
+
+    { text: output, chunk_size: chunk_size }
+  end
+
+  # Picks, uniformly at random, one chunk of the given size belonging to the
+  # given text sample.
+  #
+  # @param text_sample_id [Integer] TextSample the chunk must belong to
+  # @param chunk_size [Integer] required chunk size
+  # @return [SentenceChunk, nil] the chosen chunk, or nil when none match
+  def self.choose_starting_chunk(text_sample_id, chunk_size)
+    candidates = SentenceChunk.where(text_sample_id: text_sample_id, size: chunk_size)
+    candidate_count = candidates.count
+    return nil if candidate_count.zero?
+
+    # Fetch only the row at a random offset instead of loading every candidate
+    candidates.offset((rand * candidate_count).to_i).first
+  end
+
+  # Choose the chunk that follows this one in generated text.
+  #
+  # A following chunk belongs to the same text sample, has the same size and
+  # begins with every token of this chunk except the first. The pick among the
+  # matching chunks is delegated to SentenceChunk.choose_chunk_from_candidates.
+  #
+  # @return [SentenceChunk, nil] the chosen chunk, or nil when nothing matches
+  def choose_next_chunk
+    # Build one condition per token after the first so that all of them match.
+    # Note: PostgreSQL arrays are 1-indexed and not 0-indexed, so the token at
+    # Ruby index 1 of this chunk is compared with token_ids[1] of the next.
+    overlap_conditions = token_ids[1..].each_with_index.map do |token_id, index|
+      # Integer() ensures only a numeric literal is interpolated into the SQL
+      "token_ids[#{index + 1}] = #{Integer(token_id)}"
+    end
+
+    candidates = SentenceChunk
+                 .where(text_sample_id: text_sample.id, size: size)
+                 .where(overlap_conditions.join(' AND '))
+
+    SentenceChunk.choose_chunk_from_candidates(candidates)
+  end
+
+  # Picks one chunk at random from the candidates. Each candidate appears in
+  # the pool once per unit of its count, so higher counts are more likely.
+  #
+  # @param candidates [Enumerable<SentenceChunk>] chunks to choose between
+  # @return [SentenceChunk, nil] the chosen chunk, or nil for an empty pool
+  def self.choose_chunk_from_candidates(candidates)
+    weighted_pool = SentenceChunk.build_counts_array(candidates)
+    random_index = (rand * weighted_pool.size).to_i
+
+    weighted_pool[random_index]
+  end
+
+  # Expands the candidates into a flat array in which every chunk is repeated
+  # as many times as its count.
+  #
+  # @param candidates [Enumerable<SentenceChunk>] chunks to expand
+  # @return [Array<SentenceChunk>] the expanded array, in candidate order
+  def self.build_counts_array(candidates)
+    candidates.each_with_object([]) do |chunk, pool|
+      chunk.count.times { pool << chunk }
+    end
+  end
+
+  # Helper method for converting this chunk's token_ids back into readable
+  # tokens by delegating to Token.replace_token_ids_with_tokens.
+  #
+  # @return the result of Token.replace_token_ids_with_tokens - presumably an
+  #   array of token texts in the same order as token_ids (TODO confirm)
+  def to_tokens
+    Token.replace_token_ids_with_tokens(token_ids)
+  end
 end
diff --git a/app/models/setting.rb b/app/models/setting.rb
@@ -7,6 +7,7 @@ class Setting < RailsSettings::Base
   field :generate_strategy, type: :string, default: 'word_chunk'
   field :chunk_size, type: :string, default: 'all'
   field :output_size, type: :integer, default: 250
+  field :token_size, type: :integer, default: 250
   field :prior_word_count, type: :string, default: 'all'
 
   # field :host, type: :string, default: "http://localhost:3000"

diff --git a/app/models/word_chunk.rb b/app/models/word_chunk.rb
@@ -61,7 +61,9 @@ def self.save_word_chunks(
     end
   end
 
-  def self.save_word_chunks_by_insert_all(chunks_hash, text_sample, chunk_size)
+  def self.save_word_chunks_by_insert_all( # rubocop:disable Metrics/MethodLength
+    chunks_hash, text_sample, chunk_size
+  )
     current_time = DateTime.now
     import_array = []
     chunks_hash.each do |chunk_text, count|

diff --git a/db/migrate/20210417225204_convert_token_ids_to_array.rb b/db/migrate/20210417225204_convert_token_ids_to_array.rb
@@ -0,0 +1,12 @@
+# Replaces the token_ids column of the sentence_chunks table with an integer
+# array column.
+#
+# This will destroy any data in the token_ids column of the sentence_chunks
+# table, when migrating up as well as down, because the column is dropped and
+# re-created rather than converted in place.
+class ConvertTokenIdsToArray < ActiveRecord::Migration[6.0]
+  def up
+    # Removing the column also removes index_sentence_chunks_on_token_ids
+    remove_column(:sentence_chunks, :token_ids)
+    add_column(:sentence_chunks, :token_ids, :integer, array: true)
+  end
+
+  # Restores the column as it was before this migration: a string column with
+  # an index on it.
+  def down
+    remove_column(:sentence_chunks, :token_ids)
+    add_column(:sentence_chunks, :token_ids, :string)
+    add_index(:sentence_chunks, :token_ids)
+  end
+end
diff --git a/db/schema.rb b/db/schema.rb
@@ -10,7 +10,7 @@
 #
 # It's strongly recommended that you check this file into your version control system.
 
-ActiveRecord::Schema.define(version: 2021_04_14_221302) do
+ActiveRecord::Schema.define(version: 2021_04_17_225204) do
 
   # These are extensions that must be enabled in order to support this database
   enable_extension "plpgsql"
@@ -19,12 +19,11 @@
     t.integer "size", null: false
     t.integer "count", null: false
     t.bigint "text_sample_id", null: false
-    t.string "token_ids"
     t.datetime "created_at", precision: 6, null: false
     t.datetime "updated_at", precision: 6, null: false
+    t.integer "token_ids", array: true
     t.index ["size"], name: "index_sentence_chunks_on_size"
     t.index ["text_sample_id"], name: "index_sentence_chunks_on_text_sample_id"
-    t.index ["token_ids"], name: "index_sentence_chunks_on_token_ids"
   end
 
   create_table "settings", force: :cascade do |t|

diff --git a/spec/factories/sentence_chunk.rb b/spec/factories/sentence_chunk.rb
@@ -0,0 +1,11 @@
+# frozen_string_literal: true
+
+# Default SentenceChunk for specs: a two-token chunk seen once, attached to a
+# text sample built by the :text_sample_two_chars factory.
+FactoryBot.define do
+  # The class is named with a string so the SentenceChunk constant does not
+  # have to be loaded while factory definitions are being read.
+  factory :sentence_chunk, class: 'SentenceChunk' do
+    size { 2 }
+    token_ids { [1, 2] }
+    count { 1 }
+    association :text_sample, factory: :text_sample_two_chars
+  end
+end