From 177c88b1c6b8930d896bd7ff9372e186e7b9245b Mon Sep 17 00:00:00 2001 From: Charles de Bueger Date: Tue, 20 Apr 2021 22:58:44 +0100 Subject: [PATCH] - generate method for SentenceChunk - and methods that support it - add token_size to settings - convert token_ids column from serialisable to array - all tests --- app/models/sentence_chunk.rb | 118 +++- app/models/setting.rb | 1 + app/models/word_chunk.rb | 4 +- ...210417225204_convert_token_ids_to_array.rb | 12 + db/schema.rb | 5 +- spec/factories/sentence_chunk.rb | 11 + spec/models/sentence_chunk_spec.rb | 584 +++++++++--------- 7 files changed, 452 insertions(+), 283 deletions(-) create mode 100644 db/migrate/20210417225204_convert_token_ids_to_array.rb create mode 100644 spec/factories/sentence_chunk.rb diff --git a/app/models/sentence_chunk.rb b/app/models/sentence_chunk.rb index 2ab2786..b6afb34 100644 --- a/app/models/sentence_chunk.rb +++ b/app/models/sentence_chunk.rb @@ -5,14 +5,14 @@ # for a Sentence Chunk, a 'chunk' is an ordered collection of words, spaces and # punctuation (all called tokens) -class SentenceChunk < ApplicationRecord +class SentenceChunk < ApplicationRecord # rubocop:disable Metrics/ClassLength belongs_to :text_sample - validates :token_ids, presence: true # may normalise this later, or convert to a serializable column type + validates :token_ids, presence: true # may normalise this later validates :size, presence: true validates :count, presence: true - serialize(:token_ids, Array) + # serialize(:token_ids, Array) CHUNK_SIZE_RANGE = (2..8).freeze @@ -66,7 +66,7 @@ def self.save_chunks(chunks_hash, text_sample_id, chunk_size, save_strategy = :i end end - def self.save_chunks_by_insert_all( # rubocop:disable Metrics/MethodLength + def self.save_chunks_by_insert_all( # rubocop:disable Metrics/MethodLength chunks_hash, text_sample_id, chunk_size ) current_time = DateTime.now @@ -84,4 +84,114 @@ def self.save_chunks_by_insert_all( # rubocop:disable Metrics/MethodLength end SentenceChunk.insert_all! import_array end + + # Entry point for generating text using the sentence chunk strategy + # + # @param [Hash] params parameters to generate with + # @option [Integer] chunk_size chunk size to use for generation + # @option [Integer] token_size number of tokens to generate + # @option [Integer] text_sample_id TextSample to use as the model + def self.generate(params = {}) # rubocop:disable Metrics/MethodLength + unless chunks_built_for? params[:text_sample_id] + return { message: 'Sentence chunks have not been built for this text sample' } + end + + chunk_size, token_size, text_sample_id = extract_generate_params(params) + + output = [] + + if chunk_size == 'all' + CHUNK_SIZE_RANGE.each do |current_chunk_size| + output.push(generate_text(current_chunk_size, token_size, text_sample_id)) + end + else + output.push(generate_text(chunk_size, token_size, text_sample_id)) + end + + { output: output } + end + + # Helper method that pulls individual parameters out of params or sets + # reasonable defaults + # @param (see ::generate) + def self.extract_generate_params(params = {}) + chunk_size = + if params[:chunk_size] + .to_i.zero? + Setting.chunk_size + else params[:chunk_size].to_i + end + + token_size = if params[:token_size] + .to_i.zero? + Setting.token_size else params[:token_size].to_i end + + [chunk_size, token_size, params[:text_sample_id]] + end + + def self.chunks_built_for?(text_sample_id) + !SentenceChunk.find_by(text_sample_id: text_sample_id).nil? 
+ end + + def self.generate_text(chunk_size, token_size, text_sample_id) + chunk = choose_starting_chunk(text_sample_id, chunk_size) + + output_token_ids = chunk.token_ids + while output_token_ids.size < token_size + chunk = chunk.choose_next_chunk + next_token_id = chunk.token_ids[-1] + output_token_ids << next_token_id + end + + output = Token.replace_token_ids_with_tokens(output_token_ids).join + + { text: output, chunk_size: chunk_size } + end + + def self.choose_starting_chunk(text_sample_id, chunk_size) + candidates = SentenceChunk + .where({ text_sample_id: text_sample_id, size: chunk_size }) + .limit(nil) + candidates[(rand * candidates.size).to_i] + end + + # Choose the next word chunk after this one + def choose_next_chunk + token_ids_where = [] + + # grab all but the first token in the chunk + token_ids[1..].map.with_index do |token_id, index| + # and build a where clause so that all the tokens in the array match. + # Note: PostgreSQL arrays are 1-indexed and not 0-indexed + token_ids_where << "token_ids[#{index + 1}] = #{token_id}" + end + token_ids_where = token_ids_where.join(' AND ') + + candidates = SentenceChunk + .where("text_sample_id = :text_sample_id AND size = :sentence_chunk_size AND #{token_ids_where}", + text_sample_id: text_sample.id, sentence_chunk_size: size) + .limit(nil) + + SentenceChunk.choose_chunk_from_candidates(candidates) + end + + def self.choose_chunk_from_candidates(candidates) + counts_array = SentenceChunk.build_counts_array(candidates) + + counts_array[(rand * counts_array.size).to_i] + end + + def self.build_counts_array(candidates) + counts_array = [] + candidates.each do |chunk| + chunk.count.times { counts_array.push(chunk) } + end + counts_array + end + + # helper method for converting an array of token_ids back to an array of + # readable text + def to_tokens + Token.replace_token_ids_with_tokens(token_ids) + end end diff --git a/app/models/setting.rb b/app/models/setting.rb index 1b5d56e..5cac966 100644 --- a/app/models/setting.rb +++ b/app/models/setting.rb @@ -7,6 +7,7 @@ class Setting < RailsSettings::Base field :generate_strategy, type: :string, default: 'word_chunk' field :chunk_size, type: :string, default: 'all' field :output_size, type: :integer, default: 250 + field :token_size, type: :integer, default: 250 field :prior_word_count, type: :string, default: 'all' # field :host, type: :string, default: "http://localhost:3000" diff --git a/app/models/word_chunk.rb b/app/models/word_chunk.rb index 1189a92..2709722 100644 --- a/app/models/word_chunk.rb +++ b/app/models/word_chunk.rb @@ -61,7 +61,9 @@ def self.save_word_chunks( end end - def self.save_word_chunks_by_insert_all(chunks_hash, text_sample, chunk_size) + def self.save_word_chunks_by_insert_all( # rubocop:disable Metrics/MethodLength + chunks_hash, text_sample, chunk_size + ) current_time = DateTime.now import_array = [] chunks_hash.each do |chunk_text, count| diff --git a/db/migrate/20210417225204_convert_token_ids_to_array.rb b/db/migrate/20210417225204_convert_token_ids_to_array.rb new file mode 100644 index 0000000..e627adf --- /dev/null +++ b/db/migrate/20210417225204_convert_token_ids_to_array.rb @@ -0,0 +1,12 @@ +# This will destroy any data in the token_ids column of the sentence_chunks table +class ConvertTokenIdsToArray < ActiveRecord::Migration[6.0] + def up + remove_column(:sentence_chunks, :token_ids) + add_column(:sentence_chunks, :token_ids, :integer, array: true) + end + + def down + remove_column(:sentence_chunks, :token_ids) + add_column(:sentence_chunks, 
:token_ids, :text) + end +end diff --git a/db/schema.rb b/db/schema.rb index ebc6d23..b8a5d95 100644 --- a/db/schema.rb +++ b/db/schema.rb @@ -10,7 +10,7 @@ # # It's strongly recommended that you check this file into your version control system. -ActiveRecord::Schema.define(version: 2021_04_14_221302) do +ActiveRecord::Schema.define(version: 2021_04_17_225204) do # These are extensions that must be enabled in order to support this database enable_extension "plpgsql" @@ -19,12 +19,11 @@ t.integer "size", null: false t.integer "count", null: false t.bigint "text_sample_id", null: false - t.string "token_ids" t.datetime "created_at", precision: 6, null: false t.datetime "updated_at", precision: 6, null: false + t.integer "token_ids", array: true t.index ["size"], name: "index_sentence_chunks_on_size" t.index ["text_sample_id"], name: "index_sentence_chunks_on_text_sample_id" - t.index ["token_ids"], name: "index_sentence_chunks_on_token_ids" end create_table "settings", force: :cascade do |t| diff --git a/spec/factories/sentence_chunk.rb b/spec/factories/sentence_chunk.rb new file mode 100644 index 0000000..43f2a64 --- /dev/null +++ b/spec/factories/sentence_chunk.rb @@ -0,0 +1,11 @@ +# frozen_string_literal: true + +FactoryBot.define do + factory :sentence_chunk, class: SentenceChunk do + size { 2 } + token_ids { [1, 2] } + count { 1 } + # text_sample_two_chars + association :text_sample, factory: :text_sample_two_chars + end +end diff --git a/spec/models/sentence_chunk_spec.rb b/spec/models/sentence_chunk_spec.rb index a766be4..6bc04af 100644 --- a/spec/models/sentence_chunk_spec.rb +++ b/spec/models/sentence_chunk_spec.rb @@ -195,6 +195,7 @@ let(:text_sample) do TextSample.create!(description: 'Longer sample', text: long_string) end + let(:text_sample_token_ids) { Token.id_ise(text_sample.text, :sentence) } describe '[behaviour]' do let(:chunk_size) { 2 } @@ -236,10 +237,10 @@ context 'chunk size of 2', chunk_size: 2 do let(:chunk_size) { 2 } - let(:chunks_hash) { SentenceChunk.build_chunks_hash(text_sample.text, chunk_size) } + let(:chunks_hash) { SentenceChunk.build_chunks_hash(text_sample_token_ids, chunk_size) } it 'uses insert_all for individual sentence_chunks' do SentenceChunk - .save_chunks(chunks_hash, text_sample, chunk_size, :insert_all) + .save_chunks(chunks_hash, text_sample.id, chunk_size, :insert_all) end # it 'uses individual create! for each sentence_chunk' do @@ -250,10 +251,10 @@ context 'chunk size of 3', chunk_size: 3 do let(:chunk_size) { 3 } - let(:chunks_hash) { SentenceChunk.build_chunks_hash(text_sample.text, chunk_size) } + let(:chunks_hash) { SentenceChunk.build_chunks_hash(text_sample_token_ids, chunk_size) } it 'uses insert_all for individual sentence_chunks' do SentenceChunk - .save_chunks(chunks_hash, text_sample, chunk_size, :insert_all) + .save_chunks(chunks_hash, text_sample.id, chunk_size, :insert_all) end # it 'uses individual create! for each sentence_chunk' do # SentenceChunk @@ -263,10 +264,10 @@ context 'chunk size of 4', chunk_size: 4 do let(:chunk_size) { 4 } - let(:chunks_hash) { SentenceChunk.build_chunks_hash(text_sample.text, chunk_size) } + let(:chunks_hash) { SentenceChunk.build_chunks_hash(text_sample_token_ids, chunk_size) } it 'uses insert_all for individual sentence_chunks' do SentenceChunk - .save_chunks(chunks_hash, text_sample, chunk_size, :insert_all) + .save_chunks(chunks_hash, text_sample.id, chunk_size, :insert_all) end # it 'uses individual create! 
for each sentence_chunk' do # SentenceChunk @@ -276,10 +277,10 @@ context 'chunk size of 8', chunk_size: 8 do let(:chunk_size) { 8 } - let(:chunks_hash) { SentenceChunk.build_chunks_hash(text_sample.text, chunk_size) } + let(:chunks_hash) { SentenceChunk.build_chunks_hash(text_sample_token_ids, chunk_size) } it 'uses insert_all for individual sentence_chunks' do SentenceChunk - .save_chunks(chunks_hash, text_sample, chunk_size, :insert_all) + .save_chunks(chunks_hash, text_sample.id, chunk_size, :insert_all) end # it 'uses individual create! for each sentence_chunk' do # SentenceChunk @@ -306,271 +307,304 @@ end end - # let(:text_sample) do - # TextSample.create!(description: 'Stuff', text: 'another man') - # end - # let(:chunk_size) { 3 } - # let(:output_size) { 5 } - # let(:generate_params) do - # { chunk_size: chunk_size, - # output_size: output_size, - # text_sample_id: text_sample.id } - # end - - # it 'checks whether SentenceChunks have been generated for given TextSample' do - # allow(SentenceChunk).to receive(:chunks_built_for?) - # SentenceChunk.generate generate_params - # expect(SentenceChunk).to have_received(:chunks_built_for?) - # end - - # context 'SentenceChunks have not been generated' do - # it 'returns a warning' do - # allow(SentenceChunk).to receive(:chunks_built_for?).and_return(false) - # result = SentenceChunk.generate generate_params - # expect(result[:message]) - # .not_to be(nil) - # expect(result[:message]) - # .to match(/Word chunks have not been built for this text sample/) - # end - # end - # let(:generated_text) { 'some text' } - # before(:each) do - # allow(SentenceChunk).to receive(:chunks_built_for?).and_return(true) - # allow(SentenceChunk) - # .to receive(:generate_text) - # .and_return({ text: generated_text, chunk_size: chunk_size }) - # allow(SentenceChunk) - # .to receive(:extract_generate_params) - # .and_return([chunk_size, output_size, text_sample.id]) - # end - - # it 'extracts generate parameters' do - # # allow(SentenceChunk) - # # .to receive(:extract_generate_params) - # # .and_return([output_size, chunk_size, text_sample.id]) - - # SentenceChunk.generate generate_params - - # expect(SentenceChunk) - # .to have_received(:extract_generate_params) - # end - - # context 'for one chunk_size' do - # let(:generation_result) do - # { output: [{ text: generated_text, chunk_size: chunk_size }] } - # end - - # it 'generates the text' do - # SentenceChunk.generate generate_params - # expect(SentenceChunk) - # .to have_received(:generate_text) - # .with(chunk_size, output_size, text_sample.id) - # end - - # it 'returns a hash with the generated text' do - # result = SentenceChunk.generate generate_params - - # expect(result).to eq(generation_result) - # end - # end - - # context 'for all chunk_sizes' do - # let(:chunk_size) { 'all' } - # let(:generate_params) do - # { chunk_size: :chunk_size, - # output_size: output_size, - # text_sample_id: text_sample.id } - # end - - # before(:each) do - # allow(SentenceChunk) - # .to receive(:extract_generate_params) - # .and_return([chunk_size, output_size, text_sample.id]) - # end - - # it 'generates the right number of texts' do - # SentenceChunk.generate generate_params - # expect(SentenceChunk) - # .to have_received(:generate_text) - # .exactly(SentenceChunk::CHUNK_SIZE_RANGE.size).times - # end - - # it 'returns a hash with the generated text' do - # result = SentenceChunk.generate generate_params - - # expect(result[:output].size).to eq(SentenceChunk::CHUNK_SIZE_RANGE.size) - # end - # end - # end - # end - - 
# describe '::extract_generate_params' do - # let(:text_sample) do - # TextSample.create!(description: 'Stuff', text: 'another man') - # end - # let(:chunk_size) { 3 } - # let(:output_size) { 5 } - # let(:generate_params) do - # { chunk_size: chunk_size, - # output_size: output_size, - # text_sample_id: text_sample.id } - # end - - # it 'uses default chunk_size and output size if no params provided' do - # e_chunk_size, e_output_size = SentenceChunk.extract_generate_params - - # expect(e_chunk_size).to eq(Setting.chunk_size) - # expect(e_output_size).to eq(Setting.output_size) - # end - - # it 'extracts params' do - # e_chunk_size, e_output_size, e_text_sample_id = SentenceChunk - # .extract_generate_params generate_params - - # expect(e_chunk_size).to eq(chunk_size) - # expect(e_output_size).to eq(output_size) - # expect(e_text_sample_id).to eq(text_sample.id) - # end - # end - # let(:text_sample) do - # TextSample.create!(description: 'Stuff', text: 'another man') - # end - - # let(:chunk_size) { 3 } - # let(:output_size) { 5 } - # let(:sentence_chunk) { double('SentenceChunk') } - - # before(:each) do - # allow(SentenceChunk) - # .to receive(:choose_starting_chunk).and_return(sentence_chunk) - # allow(sentence_chunk) - # .to receive(:text).and_return('abc') - # allow(sentence_chunk) - # .to receive(:choose_next_chunk).and_return(sentence_chunk) - # end - - # it 'chooses a starting chunk' do - # SentenceChunk.generate_text(chunk_size, output_size, text_sample.id) - - # expect(SentenceChunk) - # .to(have_received(:choose_starting_chunk) - # .with(text_sample.id, chunk_size)) - # end - - # it 'generates the right number of extra tokens' do - # SentenceChunk.generate_text(chunk_size, output_size, text_sample.id) - - # expect(sentence_chunk) - # .to(have_received(:choose_next_chunk).twice) - # end - - # it 'returns the right length of output text' do - # result = SentenceChunk.generate_text(chunk_size, output_size, text_sample.id) - - # expect(result[:text].size).to eq(5) - # end - - # it 'returns a hash with the right keys' do - # result = SentenceChunk.generate_text(chunk_size, output_size, text_sample.id) - # expect(result).to have_key(:chunk_size) - # expect(result).to have_key(:text) - # end - # end - - # describe '::choose_starting_chunk' do - # let(:chunk_size) { 3 } - # let(:text_sample) do - # TextSample.create!(description: 'Stuff', text: 'mice') - # end - - # before(:each) do - # SentenceChunk.count_chunks(text_sample, chunk_size) - # end - - # it 'all SentenceChunks are potential candidates' do - # candidates = %w[mic ice] - - # # if we run this 100 times, it's pretty unlikely we won't get both of - # # these - # 100.times do - # candidate = SentenceChunk.choose_starting_chunk( - # text_sample.id, chunk_size - # ) - # candidates.delete(candidate.text) if candidates.include?(candidate.text) - # break if candidates.empty? 
- # end - # expect(candidates).to eq([]) - # end - - # it 'returns a SentenceChunk' do - # result = SentenceChunk.choose_starting_chunk(text_sample.id, chunk_size) - # expect(result).to be_instance_of(SentenceChunk) - # end - # end - - # describe '#choose_next_chunk' do - # let(:where_chain) { double('WhereChain') } - # let(:sentence_chunk) { create(:sentence_chunk) } - # let(:candidates) { double('candidates') } - - # before(:each) do - # allow(SentenceChunk).to receive(:choose_chunk_from_candidates) - # allow(SentenceChunk) - # .to receive(:where).and_return(where_chain) - - # allow(where_chain).to receive(:limit).and_return(candidates) - - # sentence_chunk.choose_next_chunk - # end - - # it 'finds candidate word chunks' do - # expect(SentenceChunk) - # .to(have_received(:where) - # .with('text_sample_id = :text_sample_id AND size = :sentence_chunk_size AND text LIKE :chunk_head', - # { chunk_head: 't%', text_sample_id: sentence_chunk.text_sample_id, - # sentence_chunk_size: 2 })) - # end - - # it 'chooses word chunk from candidates' do - # expect(SentenceChunk) - # .to( - # have_received(:choose_chunk_from_candidates).with(candidates) - # ) - # end - # end - - # describe '::choose_chunk_from_candidates' do - # let(:counts_array) { [build(:sentence_chunk), build(:sentence_chunk)] } - # let(:candidates) { double('candidates') } - - # before(:each) do - # allow(SentenceChunk) - # .to receive(:build_counts_array).and_return(counts_array) - # end - - # it 'calculates probabilities of each word chunk' do - # SentenceChunk.choose_chunk_from_candidates(candidates) - - # expect(SentenceChunk) - # .to(have_received(:build_counts_array).with(candidates)) - # end - - # it 'selects a word chunk' do - # new_chunk = SentenceChunk.choose_chunk_from_candidates(candidates) - - # expect(new_chunk).to be_instance_of(SentenceChunk) - # end - # end - - # describe '::build_counts_array' do - # let!(:sentence_chunk_at) { create(:sentence_chunk, text: 'at', count: 2) } - # let!(:sentence_chunk_an) { create(:sentence_chunk, text: 'an', count: 1) } - # let(:candidates) { SentenceChunk.all } - - # it 'has the right number of elements' do - # counts_array = SentenceChunk.build_counts_array(candidates) - - # expect(counts_array.size).to eq(3) - # end - # end + describe '::generate' do # rubocop:disable Metrics/BlockLength + let(:text_sample) do + TextSample.create!(description: 'Stuff', text: 'another man') + end + let(:chunk_size) { 3 } + let(:token_size) { 5 } + + let(:generate_params) do + { chunk_size: chunk_size, + token_size: token_size, + text_sample_id: text_sample.id } + end + + it 'checks whether SentenceChunks have been generated for given TextSample' do + allow(SentenceChunk).to receive(:chunks_built_for?) + allow(SentenceChunk).to receive(:generate_text) + .and_return({ text: 'some generated text', + chunk_size: 'some size' }) + SentenceChunk.generate generate_params + expect(SentenceChunk).to have_received(:chunks_built_for?) + end + + context 'SentenceChunks have not been generated' do + it 'returns a warning' do + allow(SentenceChunk).to receive(:chunks_built_for?) 
+          .with(anything)
+          .and_return(false)
+        result = SentenceChunk.generate generate_params
+        expect(result[:message])
+          .not_to be(nil)
+        expect(result[:message])
+          .to match(/Sentence chunks have not been built for this text sample/)
+      end
+    end
+
+    context 'SentenceChunks have been generated' do # rubocop:disable Metrics/BlockLength
+      let(:generated_token_ids) { [1, 2, 3] }
+      let(:generated_tokens) { ['some', ' ', 'text'] }
+      let(:generated_text) { 'some text' }
+      before(:each) do
+        allow(SentenceChunk).to receive(:chunks_built_for?).and_return(true)
+        allow(SentenceChunk)
+          .to receive(:generate_text)
+          .and_return({ text: generated_text, chunk_size: chunk_size })
+        allow(SentenceChunk)
+          .to receive(:extract_generate_params)
+          .and_return([chunk_size, token_size, text_sample.id])
+        # allow(Token)
+        #   .to receive(:replace_token_ids_with_tokens)
+        #   .and_return(generated_tokens)
+      end
+
+      it 'extracts generate parameters' do
+        SentenceChunk.generate generate_params
+
+        expect(SentenceChunk)
+          .to have_received(:extract_generate_params)
+      end
+
+      context 'for one chunk_size' do
+        let(:generation_result) do
+          { output: [{ text: generated_text, chunk_size: chunk_size }] }
+        end
+
+        it 'generates the tokens' do
+          SentenceChunk.generate generate_params
+          expect(SentenceChunk)
+            .to have_received(:generate_text)
+            .with(chunk_size, token_size, text_sample.id)
+        end
+
+        it 'returns a hash with the generated text' do
+          result = SentenceChunk.generate generate_params
+
+          expect(result).to eq(generation_result)
+        end
+      end
+
+      context 'for all chunk_sizes' do
+        let(:chunk_size) { 'all' }
+        let(:generate_params) do
+          { chunk_size: :chunk_size,
+            token_size: token_size,
+            text_sample_id: text_sample.id }
+        end
+
+        before(:each) do
+          allow(SentenceChunk)
+            .to receive(:extract_generate_params)
+            .and_return([chunk_size, token_size, text_sample.id])
+        end
+
+        it 'generates the right number of texts' do
+          SentenceChunk.generate generate_params
+          expect(SentenceChunk)
+            .to have_received(:generate_text)
+            .exactly(SentenceChunk::CHUNK_SIZE_RANGE.size).times
+        end
+
+        it 'returns a hash with the generated text' do
+          result = SentenceChunk.generate generate_params
+
+          expect(result[:output].size).to eq(SentenceChunk::CHUNK_SIZE_RANGE.size)
+        end
+      end
+    end
+  end
+
+  describe '::chunks_built_for?' do
+    it 'returns true if built' do
+      allow(SentenceChunk).to receive(:find_by).and_return 'something'
+
+      expect(SentenceChunk.chunks_built_for?(-100)).to be true
+    end
+
+    it 'returns false if not built' do
+      allow(SentenceChunk).to receive(:find_by).and_return nil
+
+      expect(SentenceChunk.chunks_built_for?(-100)).to be false
+    end
+  end
+
+  describe '::extract_generate_params' do
+    let(:text_sample) { TextSample.create!(description: 'Stuff', text: 'another man') }
+    let(:chunk_size) { 3 }
+    let(:token_size) { 5 }
+    let(:generate_params) do
+      { chunk_size: chunk_size,
+        token_size: token_size,
+        text_sample_id: text_sample.id }
+    end
+
+    it 'uses default chunk_size and token_size if no params provided' do
+      e_chunk_size, e_token_size = SentenceChunk.extract_generate_params
+
+      expect(e_chunk_size).to eq(Setting.chunk_size)
+      expect(e_token_size).to eq(Setting.token_size)
+    end
+
+    it 'extracts params' do
+      e_chunk_size, e_token_size, e_text_sample_id = SentenceChunk
+                                                     .extract_generate_params generate_params
+
+      expect(e_chunk_size).to eq(chunk_size)
+      expect(e_token_size).to eq(token_size)
+      expect(e_text_sample_id).to eq(text_sample.id)
+    end
+  end
+
+  describe '::generate_text' do # rubocop:disable Metrics/BlockLength
+    let(:text_sample) do
+      TextSample.create!(description: 'Stuff', text: 'another man')
+    end
+
+    let(:chunk_size) { 3 }
+    let(:token_size) { 5 }
+    let(:sentence_chunk) { double('SentenceChunk') }
+
+    before(:each) do
+      allow(SentenceChunk)
+        .to receive(:choose_starting_chunk).and_return(sentence_chunk)
+      allow(sentence_chunk)
+        .to receive(:token_ids).and_return([1, 2, 3])
+      allow(sentence_chunk)
+        .to receive(:choose_next_chunk).and_return(sentence_chunk)
+      allow(Token)
+        .to receive(:replace_token_ids_with_tokens)
+        .and_return(['another', ' ', 'man'])
+    end
+
+    it 'chooses a starting chunk' do
+      SentenceChunk.generate_text(chunk_size, token_size, text_sample.id)
+
+      expect(SentenceChunk)
+        .to(have_received(:choose_starting_chunk)
+        .with(text_sample.id, chunk_size))
+    end
+
+    it 'generates the right number of extra tokens' do
+      SentenceChunk.generate_text(chunk_size, token_size, text_sample.id)
+
+      expect(sentence_chunk)
+        .to(have_received(:choose_next_chunk).twice)
+    end
+
+    it 'returns a hash with the right keys' do
+      result = SentenceChunk.generate_text(chunk_size, token_size, text_sample.id)
+      expect(result).to have_key(:chunk_size)
+      expect(result).to have_key(:text)
+    end
+  end
+
+  describe '::choose_starting_chunk' do
+    let(:chunk_size) { 3 }
+    let(:text_sample) do
+      TextSample.create!(description: 'Stuff', text: 'take me to the river')
+    end
+    let(:text_sample_token_ids) { Token.id_ise(text_sample.text, :sentence) }
+
+    before(:each) do
+      SentenceChunk.count_chunks(text_sample_token_ids, text_sample.id, chunk_size)
+    end
+
+    it 'all SentenceChunks are potential candidates' do
+      candidates = ['take me', ' me ', 'me to', ' to ', 'to the', ' the ', 'the river']
+
+      # if we run this 100 times, it's pretty unlikely we won't get all of
+      # these chunks
+      100.times do
+        candidate_token_ids = SentenceChunk.choose_starting_chunk(
+          text_sample.id, chunk_size
+        ).token_ids
+        candidate_text = Token.replace_token_ids_with_tokens(candidate_token_ids).join
+        candidates.delete(candidate_text) if candidates.include?(candidate_text)
+        break if candidates.empty?
+      end
+      expect(candidates).to eq([])
+    end
+
+    it 'returns a SentenceChunk' do
+      result = SentenceChunk.choose_starting_chunk(text_sample.id, chunk_size)
+      expect(result).to be_instance_of(SentenceChunk)
+    end
+  end
+
+  describe '#choose_next_chunk' do
+    let(:where_chain) { double('WhereChain') }
+    let(:sentence_chunk) { create(:sentence_chunk) }
+    let(:candidates) { double('candidates') }
+
+    before(:each) do
+      allow(SentenceChunk).to receive(:choose_chunk_from_candidates)
+      allow(SentenceChunk)
+        .to receive(:where).and_return(where_chain)
+
+      allow(where_chain).to receive(:limit).and_return(candidates)
+
+      sentence_chunk.choose_next_chunk
+    end
+
+    it 'finds candidate sentence chunks' do
+      expect(SentenceChunk)
+        .to(have_received(:where)
+        .with('text_sample_id = :text_sample_id AND size = :sentence_chunk_size AND token_ids[1] = 2',
+              { text_sample_id: sentence_chunk.text_sample_id,
+                sentence_chunk_size: 2 }))
+    end
+
+    it 'chooses sentence chunk from candidates' do
+      expect(SentenceChunk)
+        .to(
+          have_received(:choose_chunk_from_candidates).with(candidates)
+        )
+    end
+  end
+
+  describe '::choose_chunk_from_candidates' do
+    let(:counts_array) { [build(:sentence_chunk), build(:sentence_chunk)] }
+    let(:candidates) { double('candidates') }
+
+    before(:each) do
+      allow(SentenceChunk)
+        .to receive(:build_counts_array).and_return(counts_array)
+    end
+
+    it 'calculates probabilities of each sentence chunk' do
+      SentenceChunk.choose_chunk_from_candidates(candidates)
+
+      expect(SentenceChunk)
+        .to(have_received(:build_counts_array).with(candidates))
+    end
+
+    it 'selects a sentence chunk' do
+      new_chunk = SentenceChunk.choose_chunk_from_candidates(candidates)
+
+      expect(new_chunk).to be_instance_of(SentenceChunk)
+    end
+  end
+
+  describe '::build_counts_array' do
+    let!(:sentence_chunk_1_2) { create(:sentence_chunk, token_ids: [1, 2], count: 2) } # rubocop:disable Naming/VariableNumber
+    let!(:sentence_chunk_1_3) { create(:sentence_chunk, token_ids: [1, 3], count: 1) } # rubocop:disable Naming/VariableNumber
+    let(:candidates) { SentenceChunk.all }
+
+    it 'has the right number of elements' do
+      counts_array = SentenceChunk.build_counts_array(candidates)
+
+      expect(counts_array.size).to eq(3)
+    end
+  end
+
+  describe '#to_tokens' do
+    it 'converts an array of token ids to the text of the tokens' do
+      text_sample = TextSample.create!(description: 'Stuff', text: 'take me to the river')
+      text_sample.analyse
+
+      sentence_chunk = SentenceChunk.where("text_sample_id = #{text_sample.id}").first
+      expect(sentence_chunk.to_tokens).to eq(['take', ' '])
+    end
+  end
 end
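
Usage sketch: a minimal example of driving the new SentenceChunk::generate entry
point from a console. It assumes TextSample#analyse builds and counts the
sentence chunks for the sample (the #to_tokens spec above relies on this);
chunk_size and token_size fall back to Setting.chunk_size and Setting.token_size
when omitted, and the description/text values are illustrative only.

  text_sample = TextSample.create!(description: 'River', text: 'take me to the river')
  text_sample.analyse

  # generate 25 tokens (words, spaces and punctuation) from 3-token chunks
  result = SentenceChunk.generate(chunk_size: 3, token_size: 25,
                                  text_sample_id: text_sample.id)
  result[:output].each { |generated| puts generated[:text] }

  # with the default Setting.chunk_size of 'all', omitting chunk_size generates
  # one text per size in CHUNK_SIZE_RANGE (2..8)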