From 177c88b1c6b8930d896bd7ff9372e186e7b9245b Mon Sep 17 00:00:00 2001 From: Charles de Bueger Date: Tue, 20 Apr 2021 22:58:44 +0100 Subject: [PATCH] - generate method for SentenceChunk - and methods that support it - add token_size to settings - convert token_ids column from serialisable to array - all tests --- app/models/sentence_chunk.rb | 118 +++- app/models/setting.rb | 1 + app/models/word_chunk.rb | 4 +- ...210417225204_convert_token_ids_to_array.rb | 12 + db/schema.rb | 5 +- spec/factories/sentence_chunk.rb | 11 + spec/models/sentence_chunk_spec.rb | 584 +++++++++--------- 7 files changed, 452 insertions(+), 283 deletions(-) create mode 100644 db/migrate/20210417225204_convert_token_ids_to_array.rb create mode 100644 spec/factories/sentence_chunk.rb diff --git a/app/models/sentence_chunk.rb b/app/models/sentence_chunk.rb index 2ab2786..b6afb34 100644 --- a/app/models/sentence_chunk.rb +++ b/app/models/sentence_chunk.rb @@ -5,14 +5,14 @@ # for a Sentence Chunk, a 'chunk' is an ordered collection of words, spaces and # punctuation (all called tokens) -class SentenceChunk < ApplicationRecord +class SentenceChunk < ApplicationRecord # rubocop:disable Metrics/ClassLength belongs_to :text_sample - validates :token_ids, presence: true # may normalise this later, or convert to a serializable column type + validates :token_ids, presence: true # may normalise this later validates :size, presence: true validates :count, presence: true - serialize(:token_ids, Array) + # serialize(:token_ids, Array) CHUNK_SIZE_RANGE = (2..8).freeze @@ -66,7 +66,7 @@ def self.save_chunks(chunks_hash, text_sample_id, chunk_size, save_strategy = :i end end - def self.save_chunks_by_insert_all( # rubocop:disable Metrics/MethodLength + def self.save_chunks_by_insert_all( # rubocop:disable Metrics/MethodLength chunks_hash, text_sample_id, chunk_size ) current_time = DateTime.now @@ -84,4 +84,114 @@ def self.save_chunks_by_insert_all( # rubocop:disable Metrics/MethodLength end SentenceChunk.insert_all! import_array end + + # Entry point for generating text using the sentence chunk strategy + # + # @param [Hash] params parameters to generate with + # @option [Integer] chunk_size chunk size to use for generation + # @option [Integer] token_size number of tokens to generate + # @option [Integer] text_sample_id TextSample to use as the model + def self.generate(params = {}) # rubocop:disable Metrics/MethodLength + unless chunks_built_for? params[:text_sample_id] + return { message: 'Sentence chunks have not been built for this text sample' } + end + + chunk_size, token_size, text_sample_id = extract_generate_params(params) + + output = [] + + if chunk_size == 'all' + CHUNK_SIZE_RANGE.each do |current_chunk_size| + output.push(generate_text(current_chunk_size, token_size, text_sample_id)) + end + else + output.push(generate_text(chunk_size, token_size, text_sample_id)) + end + + { output: output } + end + + # Helper method that pulls individual parameters out of params or sets + # reasonable defaults + # @param (see ::generate) + def self.extract_generate_params(params = {}) + chunk_size = + if params[:chunk_size] + .to_i.zero? + Setting.chunk_size + else params[:chunk_size].to_i + end + + token_size = if params[:token_size] + .to_i.zero? + Setting.token_size else params[:token_size].to_i end + + [chunk_size, token_size, params[:text_sample_id]] + end + + def self.chunks_built_for?(text_sample_id) + !SentenceChunk.find_by(text_sample_id: text_sample_id).nil? 
+ end + + def self.generate_text(chunk_size, token_size, text_sample_id) + chunk = choose_starting_chunk(text_sample_id, chunk_size) + + output_token_ids = chunk.token_ids + while output_token_ids.size < token_size + chunk = chunk.choose_next_chunk + next_token_id = chunk.token_ids[-1] + output_token_ids << next_token_id + end + + output = Token.replace_token_ids_with_tokens(output_token_ids).join + + { text: output, chunk_size: chunk_size } + end + + def self.choose_starting_chunk(text_sample_id, chunk_size) + candidates = SentenceChunk + .where({ text_sample_id: text_sample_id, size: chunk_size }) + .limit(nil) + candidates[(rand * candidates.size).to_i] + end + + # Choose the next word chunk after this one + def choose_next_chunk + token_ids_where = [] + + # grab all but the first token in the chunk + token_ids[1..].map.with_index do |token_id, index| + # and build a where clause so that all the tokens in the array match. + # Note: PostgreSQL arrays are 1-indexed and not 0-indexed + token_ids_where << "token_ids[#{index + 1}] = #{token_id}" + end + token_ids_where = token_ids_where.join(' AND ') + + candidates = SentenceChunk + .where("text_sample_id = :text_sample_id AND size = :sentence_chunk_size AND #{token_ids_where}", + text_sample_id: text_sample.id, sentence_chunk_size: size) + .limit(nil) + + SentenceChunk.choose_chunk_from_candidates(candidates) + end + + def self.choose_chunk_from_candidates(candidates) + counts_array = SentenceChunk.build_counts_array(candidates) + + counts_array[(rand * counts_array.size).to_i] + end + + def self.build_counts_array(candidates) + counts_array = [] + candidates.each do |chunk| + chunk.count.times { counts_array.push(chunk) } + end + counts_array + end + + # helper method for converting an array of token_ids back to an array of + # readable text + def to_tokens + Token.replace_token_ids_with_tokens(token_ids) + end end diff --git a/app/models/setting.rb b/app/models/setting.rb index 1b5d56e..5cac966 100644 --- a/app/models/setting.rb +++ b/app/models/setting.rb @@ -7,6 +7,7 @@ class Setting < RailsSettings::Base field :generate_strategy, type: :string, default: 'word_chunk' field :chunk_size, type: :string, default: 'all' field :output_size, type: :integer, default: 250 + field :token_size, type: :integer, default: 250 field :prior_word_count, type: :string, default: 'all' # field :host, type: :string, default: "http://localhost:3000" diff --git a/app/models/word_chunk.rb b/app/models/word_chunk.rb index 1189a92..2709722 100644 --- a/app/models/word_chunk.rb +++ b/app/models/word_chunk.rb @@ -61,7 +61,9 @@ def self.save_word_chunks( end end - def self.save_word_chunks_by_insert_all(chunks_hash, text_sample, chunk_size) + def self.save_word_chunks_by_insert_all( # rubocop:disable Metrics/MethodLength + chunks_hash, text_sample, chunk_size + ) current_time = DateTime.now import_array = [] chunks_hash.each do |chunk_text, count| diff --git a/db/migrate/20210417225204_convert_token_ids_to_array.rb b/db/migrate/20210417225204_convert_token_ids_to_array.rb new file mode 100644 index 0000000..e627adf --- /dev/null +++ b/db/migrate/20210417225204_convert_token_ids_to_array.rb @@ -0,0 +1,12 @@ +# This will destroy any data in the token_ids column of the sentence_chunks table +class ConvertTokenIdsToArray < ActiveRecord::Migration[6.0] + def up + remove_column(:sentence_chunks, :token_ids) + add_column(:sentence_chunks, :token_ids, :integer, array: true) + end + + def down + remove_column(:sentence_chunks, :token_ids) + add_column(:sentence_chunks, 
:token_ids, :text) + end +end diff --git a/db/schema.rb b/db/schema.rb index ebc6d23..b8a5d95 100644 --- a/db/schema.rb +++ b/db/schema.rb @@ -10,7 +10,7 @@ # # It's strongly recommended that you check this file into your version control system. -ActiveRecord::Schema.define(version: 2021_04_14_221302) do +ActiveRecord::Schema.define(version: 2021_04_17_225204) do # These are extensions that must be enabled in order to support this database enable_extension "plpgsql" @@ -19,12 +19,11 @@ t.integer "size", null: false t.integer "count", null: false t.bigint "text_sample_id", null: false - t.string "token_ids" t.datetime "created_at", precision: 6, null: false t.datetime "updated_at", precision: 6, null: false + t.integer "token_ids", array: true t.index ["size"], name: "index_sentence_chunks_on_size" t.index ["text_sample_id"], name: "index_sentence_chunks_on_text_sample_id" - t.index ["token_ids"], name: "index_sentence_chunks_on_token_ids" end create_table "settings", force: :cascade do |t| diff --git a/spec/factories/sentence_chunk.rb b/spec/factories/sentence_chunk.rb new file mode 100644 index 0000000..43f2a64 --- /dev/null +++ b/spec/factories/sentence_chunk.rb @@ -0,0 +1,11 @@ +# frozen_string_literal: true + +FactoryBot.define do + factory :sentence_chunk, class: SentenceChunk do + size { 2 } + token_ids { [1, 2] } + count { 1 } + # text_sample_two_chars + association :text_sample, factory: :text_sample_two_chars + end +end diff --git a/spec/models/sentence_chunk_spec.rb b/spec/models/sentence_chunk_spec.rb index a766be4..6bc04af 100644 --- a/spec/models/sentence_chunk_spec.rb +++ b/spec/models/sentence_chunk_spec.rb @@ -195,6 +195,7 @@ let(:text_sample) do TextSample.create!(description: 'Longer sample', text: long_string) end + let(:text_sample_token_ids) { Token.id_ise(text_sample.text, :sentence) } describe '[behaviour]' do let(:chunk_size) { 2 } @@ -236,10 +237,10 @@ context 'chunk size of 2', chunk_size: 2 do let(:chunk_size) { 2 } - let(:chunks_hash) { SentenceChunk.build_chunks_hash(text_sample.text, chunk_size) } + let(:chunks_hash) { SentenceChunk.build_chunks_hash(text_sample_token_ids, chunk_size) } it 'uses insert_all for individual sentence_chunks' do SentenceChunk - .save_chunks(chunks_hash, text_sample, chunk_size, :insert_all) + .save_chunks(chunks_hash, text_sample.id, chunk_size, :insert_all) end # it 'uses individual create! for each sentence_chunk' do @@ -250,10 +251,10 @@ context 'chunk size of 3', chunk_size: 3 do let(:chunk_size) { 3 } - let(:chunks_hash) { SentenceChunk.build_chunks_hash(text_sample.text, chunk_size) } + let(:chunks_hash) { SentenceChunk.build_chunks_hash(text_sample_token_ids, chunk_size) } it 'uses insert_all for individual sentence_chunks' do SentenceChunk - .save_chunks(chunks_hash, text_sample, chunk_size, :insert_all) + .save_chunks(chunks_hash, text_sample.id, chunk_size, :insert_all) end # it 'uses individual create! for each sentence_chunk' do # SentenceChunk @@ -263,10 +264,10 @@ context 'chunk size of 4', chunk_size: 4 do let(:chunk_size) { 4 } - let(:chunks_hash) { SentenceChunk.build_chunks_hash(text_sample.text, chunk_size) } + let(:chunks_hash) { SentenceChunk.build_chunks_hash(text_sample_token_ids, chunk_size) } it 'uses insert_all for individual sentence_chunks' do SentenceChunk - .save_chunks(chunks_hash, text_sample, chunk_size, :insert_all) + .save_chunks(chunks_hash, text_sample.id, chunk_size, :insert_all) end # it 'uses individual create! 
for each sentence_chunk' do # SentenceChunk @@ -276,10 +277,10 @@ context 'chunk size of 8', chunk_size: 8 do let(:chunk_size) { 8 } - let(:chunks_hash) { SentenceChunk.build_chunks_hash(text_sample.text, chunk_size) } + let(:chunks_hash) { SentenceChunk.build_chunks_hash(text_sample_token_ids, chunk_size) } it 'uses insert_all for individual sentence_chunks' do SentenceChunk - .save_chunks(chunks_hash, text_sample, chunk_size, :insert_all) + .save_chunks(chunks_hash, text_sample.id, chunk_size, :insert_all) end # it 'uses individual create! for each sentence_chunk' do # SentenceChunk @@ -306,271 +307,304 @@ end end - # let(:text_sample) do - # TextSample.create!(description: 'Stuff', text: 'another man') - # end - # let(:chunk_size) { 3 } - # let(:output_size) { 5 } - # let(:generate_params) do - # { chunk_size: chunk_size, - # output_size: output_size, - # text_sample_id: text_sample.id } - # end - - # it 'checks whether SentenceChunks have been generated for given TextSample' do - # allow(SentenceChunk).to receive(:chunks_built_for?) - # SentenceChunk.generate generate_params - # expect(SentenceChunk).to have_received(:chunks_built_for?) - # end - - # context 'SentenceChunks have not been generated' do - # it 'returns a warning' do - # allow(SentenceChunk).to receive(:chunks_built_for?).and_return(false) - # result = SentenceChunk.generate generate_params - # expect(result[:message]) - # .not_to be(nil) - # expect(result[:message]) - # .to match(/Word chunks have not been built for this text sample/) - # end - # end - # let(:generated_text) { 'some text' } - # before(:each) do - # allow(SentenceChunk).to receive(:chunks_built_for?).and_return(true) - # allow(SentenceChunk) - # .to receive(:generate_text) - # .and_return({ text: generated_text, chunk_size: chunk_size }) - # allow(SentenceChunk) - # .to receive(:extract_generate_params) - # .and_return([chunk_size, output_size, text_sample.id]) - # end - - # it 'extracts generate parameters' do - # # allow(SentenceChunk) - # # .to receive(:extract_generate_params) - # # .and_return([output_size, chunk_size, text_sample.id]) - - # SentenceChunk.generate generate_params - - # expect(SentenceChunk) - # .to have_received(:extract_generate_params) - # end - - # context 'for one chunk_size' do - # let(:generation_result) do - # { output: [{ text: generated_text, chunk_size: chunk_size }] } - # end - - # it 'generates the text' do - # SentenceChunk.generate generate_params - # expect(SentenceChunk) - # .to have_received(:generate_text) - # .with(chunk_size, output_size, text_sample.id) - # end - - # it 'returns a hash with the generated text' do - # result = SentenceChunk.generate generate_params - - # expect(result).to eq(generation_result) - # end - # end - - # context 'for all chunk_sizes' do - # let(:chunk_size) { 'all' } - # let(:generate_params) do - # { chunk_size: :chunk_size, - # output_size: output_size, - # text_sample_id: text_sample.id } - # end - - # before(:each) do - # allow(SentenceChunk) - # .to receive(:extract_generate_params) - # .and_return([chunk_size, output_size, text_sample.id]) - # end - - # it 'generates the right number of texts' do - # SentenceChunk.generate generate_params - # expect(SentenceChunk) - # .to have_received(:generate_text) - # .exactly(SentenceChunk::CHUNK_SIZE_RANGE.size).times - # end - - # it 'returns a hash with the generated text' do - # result = SentenceChunk.generate generate_params - - # expect(result[:output].size).to eq(SentenceChunk::CHUNK_SIZE_RANGE.size) - # end - # end - # end - # end - - 
# describe '::extract_generate_params' do - # let(:text_sample) do - # TextSample.create!(description: 'Stuff', text: 'another man') - # end - # let(:chunk_size) { 3 } - # let(:output_size) { 5 } - # let(:generate_params) do - # { chunk_size: chunk_size, - # output_size: output_size, - # text_sample_id: text_sample.id } - # end - - # it 'uses default chunk_size and output size if no params provided' do - # e_chunk_size, e_output_size = SentenceChunk.extract_generate_params - - # expect(e_chunk_size).to eq(Setting.chunk_size) - # expect(e_output_size).to eq(Setting.output_size) - # end - - # it 'extracts params' do - # e_chunk_size, e_output_size, e_text_sample_id = SentenceChunk - # .extract_generate_params generate_params - - # expect(e_chunk_size).to eq(chunk_size) - # expect(e_output_size).to eq(output_size) - # expect(e_text_sample_id).to eq(text_sample.id) - # end - # end - # let(:text_sample) do - # TextSample.create!(description: 'Stuff', text: 'another man') - # end - - # let(:chunk_size) { 3 } - # let(:output_size) { 5 } - # let(:sentence_chunk) { double('SentenceChunk') } - - # before(:each) do - # allow(SentenceChunk) - # .to receive(:choose_starting_chunk).and_return(sentence_chunk) - # allow(sentence_chunk) - # .to receive(:text).and_return('abc') - # allow(sentence_chunk) - # .to receive(:choose_next_chunk).and_return(sentence_chunk) - # end - - # it 'chooses a starting chunk' do - # SentenceChunk.generate_text(chunk_size, output_size, text_sample.id) - - # expect(SentenceChunk) - # .to(have_received(:choose_starting_chunk) - # .with(text_sample.id, chunk_size)) - # end - - # it 'generates the right number of extra tokens' do - # SentenceChunk.generate_text(chunk_size, output_size, text_sample.id) - - # expect(sentence_chunk) - # .to(have_received(:choose_next_chunk).twice) - # end - - # it 'returns the right length of output text' do - # result = SentenceChunk.generate_text(chunk_size, output_size, text_sample.id) - - # expect(result[:text].size).to eq(5) - # end - - # it 'returns a hash with the right keys' do - # result = SentenceChunk.generate_text(chunk_size, output_size, text_sample.id) - # expect(result).to have_key(:chunk_size) - # expect(result).to have_key(:text) - # end - # end - - # describe '::choose_starting_chunk' do - # let(:chunk_size) { 3 } - # let(:text_sample) do - # TextSample.create!(description: 'Stuff', text: 'mice') - # end - - # before(:each) do - # SentenceChunk.count_chunks(text_sample, chunk_size) - # end - - # it 'all SentenceChunks are potential candidates' do - # candidates = %w[mic ice] - - # # if we run this 100 times, it's pretty unlikely we won't get both of - # # these - # 100.times do - # candidate = SentenceChunk.choose_starting_chunk( - # text_sample.id, chunk_size - # ) - # candidates.delete(candidate.text) if candidates.include?(candidate.text) - # break if candidates.empty? 
- # end - # expect(candidates).to eq([]) - # end - - # it 'returns a SentenceChunk' do - # result = SentenceChunk.choose_starting_chunk(text_sample.id, chunk_size) - # expect(result).to be_instance_of(SentenceChunk) - # end - # end - - # describe '#choose_next_chunk' do - # let(:where_chain) { double('WhereChain') } - # let(:sentence_chunk) { create(:sentence_chunk) } - # let(:candidates) { double('candidates') } - - # before(:each) do - # allow(SentenceChunk).to receive(:choose_chunk_from_candidates) - # allow(SentenceChunk) - # .to receive(:where).and_return(where_chain) - - # allow(where_chain).to receive(:limit).and_return(candidates) - - # sentence_chunk.choose_next_chunk - # end - - # it 'finds candidate word chunks' do - # expect(SentenceChunk) - # .to(have_received(:where) - # .with('text_sample_id = :text_sample_id AND size = :sentence_chunk_size AND text LIKE :chunk_head', - # { chunk_head: 't%', text_sample_id: sentence_chunk.text_sample_id, - # sentence_chunk_size: 2 })) - # end - - # it 'chooses word chunk from candidates' do - # expect(SentenceChunk) - # .to( - # have_received(:choose_chunk_from_candidates).with(candidates) - # ) - # end - # end - - # describe '::choose_chunk_from_candidates' do - # let(:counts_array) { [build(:sentence_chunk), build(:sentence_chunk)] } - # let(:candidates) { double('candidates') } - - # before(:each) do - # allow(SentenceChunk) - # .to receive(:build_counts_array).and_return(counts_array) - # end - - # it 'calculates probabilities of each word chunk' do - # SentenceChunk.choose_chunk_from_candidates(candidates) - - # expect(SentenceChunk) - # .to(have_received(:build_counts_array).with(candidates)) - # end - - # it 'selects a word chunk' do - # new_chunk = SentenceChunk.choose_chunk_from_candidates(candidates) - - # expect(new_chunk).to be_instance_of(SentenceChunk) - # end - # end - - # describe '::build_counts_array' do - # let!(:sentence_chunk_at) { create(:sentence_chunk, text: 'at', count: 2) } - # let!(:sentence_chunk_an) { create(:sentence_chunk, text: 'an', count: 1) } - # let(:candidates) { SentenceChunk.all } - - # it 'has the right number of elements' do - # counts_array = SentenceChunk.build_counts_array(candidates) - - # expect(counts_array.size).to eq(3) - # end - # end + describe '::generate' do # rubocop:disable Metrics/BlockLength + let(:text_sample) do + TextSample.create!(description: 'Stuff', text: 'another man') + end + let(:chunk_size) { 3 } + let(:token_size) { 5 } + + let(:generate_params) do + { chunk_size: chunk_size, + token_size: token_size, + text_sample_id: text_sample.id } + end + + it 'checks whether SentenceChunks have been generated for given TextSample' do + allow(SentenceChunk).to receive(:chunks_built_for?) + allow(SentenceChunk).to receive(:generate_text) + .and_return({ text: 'some generated text', + chunk_size: 'some size' }) + SentenceChunk.generate generate_params + expect(SentenceChunk).to have_received(:chunks_built_for?) + end + + context 'SentenceChunks have not been generated' do + it 'returns a warning' do + allow(SentenceChunk).to receive(:chunks_built_for?) 
+          .with(anything)
+          .and_return(false)
+        result = SentenceChunk.generate generate_params
+        expect(result[:message])
+          .not_to be(nil)
+        expect(result[:message])
+          .to match(/Sentence chunks have not been built for this text sample/)
+      end
+    end
+
+    context 'SentenceChunks have been generated' do # rubocop:disable Metrics/BlockLength
+      let(:generated_token_ids) { [1, 2, 3] }
+      let(:generated_tokens) { ['some', ' ', 'text'] }
+      let(:generated_text) { 'some text' }
+      before(:each) do
+        allow(SentenceChunk).to receive(:chunks_built_for?).and_return(true)
+        allow(SentenceChunk)
+          .to receive(:generate_text)
+          .and_return({ text: generated_text, chunk_size: chunk_size })
+        allow(SentenceChunk)
+          .to receive(:extract_generate_params)
+          .and_return([chunk_size, token_size, text_sample.id])
+        # allow(Token)
+        #   .to receive(:replace_token_ids_with_tokens)
+        #   .and_return(generated_tokens)
+      end
+
+      it 'extracts generate parameters' do
+        SentenceChunk.generate generate_params
+
+        expect(SentenceChunk)
+          .to have_received(:extract_generate_params)
+      end
+
+      context 'for one chunk_size' do
+        let(:generation_result) do
+          { output: [{ text: generated_text, chunk_size: chunk_size }] }
+        end
+
+        it 'generates the tokens' do
+          SentenceChunk.generate generate_params
+          expect(SentenceChunk)
+            .to have_received(:generate_text)
+            .with(chunk_size, token_size, text_sample.id)
+        end
+
+        it 'returns a hash with the generated text' do
+          result = SentenceChunk.generate generate_params
+
+          expect(result).to eq(generation_result)
+        end
+      end
+
+      context 'for all chunk_sizes' do
+        let(:chunk_size) { 'all' }
+        let(:generate_params) do
+          { chunk_size: :chunk_size,
+            token_size: token_size,
+            text_sample_id: text_sample.id }
+        end
+
+        before(:each) do
+          allow(SentenceChunk)
+            .to receive(:extract_generate_params)
+            .and_return([chunk_size, token_size, text_sample.id])
+        end
+
+        it 'generates the right number of texts' do
+          SentenceChunk.generate generate_params
+          expect(SentenceChunk)
+            .to have_received(:generate_text)
+            .exactly(SentenceChunk::CHUNK_SIZE_RANGE.size).times
+        end
+
+        it 'returns a hash with the generated text' do
+          result = SentenceChunk.generate generate_params
+
+          expect(result[:output].size).to eq(SentenceChunk::CHUNK_SIZE_RANGE.size)
+        end
+      end
+    end
+  end
+
+  describe '::chunks_built_for?' do
+    it 'returns true if built' do
+      allow(SentenceChunk).to receive(:find_by).and_return 'something'
+
+      expect(SentenceChunk.chunks_built_for?(-100)).to be true
+    end
+
+    it 'returns false if not built' do
+      allow(SentenceChunk).to receive(:find_by).and_return nil
+
+      expect(SentenceChunk.chunks_built_for?(-100)).to be false
+    end
+  end
+
+  describe '::extract_generate_params' do
+    let(:text_sample) { TextSample.create!(description: 'Stuff', text: 'another man') }
+    let(:chunk_size) { 3 }
+    let(:token_size) { 5 }
+    let(:generate_params) do
+      { chunk_size: chunk_size,
+        token_size: token_size,
+        text_sample_id: text_sample.id }
+    end
+
+    it 'uses default chunk_size and token_size if no params provided' do
+      e_chunk_size, e_token_size = SentenceChunk.extract_generate_params
+
+      expect(e_chunk_size).to eq(Setting.chunk_size)
+      expect(e_token_size).to eq(Setting.token_size)
+    end
+
+    it 'extracts params' do
+      e_chunk_size, e_token_size, e_text_sample_id = SentenceChunk
+                                                     .extract_generate_params generate_params
+
+      expect(e_chunk_size).to eq(chunk_size)
+      expect(e_token_size).to eq(token_size)
+      expect(e_text_sample_id).to eq(text_sample.id)
+    end
+  end
+
+  describe '::generate_text' do # rubocop:disable Metrics/BlockLength
+    let(:text_sample) do
+      TextSample.create!(description: 'Stuff', text: 'another man')
+    end
+
+    let(:chunk_size) { 3 }
+    let(:token_size) { 5 }
+    let(:sentence_chunk) { double('SentenceChunk') }
+
+    before(:each) do
+      allow(SentenceChunk)
+        .to receive(:choose_starting_chunk).and_return(sentence_chunk)
+      allow(sentence_chunk)
+        .to receive(:token_ids).and_return([1, 2, 3])
+      allow(sentence_chunk)
+        .to receive(:choose_next_chunk).and_return(sentence_chunk)
+      allow(Token)
+        .to receive(:replace_token_ids_with_tokens)
+        .and_return(['another', ' ', 'man'])
+    end
+
+    it 'chooses a starting chunk' do
+      SentenceChunk.generate_text(chunk_size, token_size, text_sample.id)
+
+      expect(SentenceChunk)
+        .to(have_received(:choose_starting_chunk)
+        .with(text_sample.id, chunk_size))
+    end
+
+    it 'generates the right number of extra tokens' do
+      SentenceChunk.generate_text(chunk_size, token_size, text_sample.id)
+
+      expect(sentence_chunk)
+        .to(have_received(:choose_next_chunk).twice)
+    end
+
+    it 'returns a hash with the right keys' do
+      result = SentenceChunk.generate_text(chunk_size, token_size, text_sample.id)
+      expect(result).to have_key(:chunk_size)
+      expect(result).to have_key(:text)
+    end
+  end
+
+  describe '::choose_starting_chunk' do
+    let(:chunk_size) { 3 }
+    let(:text_sample) do
+      TextSample.create!(description: 'Stuff', text: 'take me to the river')
+    end
+    let(:text_sample_token_ids) { Token.id_ise(text_sample.text, :sentence) }
+
+    before(:each) do
+      SentenceChunk.count_chunks(text_sample_token_ids, text_sample.id, chunk_size)
+    end
+
+    it 'all SentenceChunks are potential candidates' do
+      candidates = ['take me', ' me ', 'me to', ' to ', 'to the', ' the ', 'the river']
+
+      # if we run this 100 times, it's pretty unlikely we won't get all of
+      # these chunks
+      100.times do
+        candidate_token_ids = SentenceChunk.choose_starting_chunk(
+          text_sample.id, chunk_size
+        ).token_ids
+        candidate_text = Token.replace_token_ids_with_tokens(candidate_token_ids).join
+        candidates.delete(candidate_text) if candidates.include?(candidate_text)
+        break if candidates.empty?
+      end
+      expect(candidates).to eq([])
+    end
+
+    it 'returns a SentenceChunk' do
+      result = SentenceChunk.choose_starting_chunk(text_sample.id, chunk_size)
+      expect(result).to be_instance_of(SentenceChunk)
+    end
+  end
+
+  describe '#choose_next_chunk' do
+    let(:where_chain) { double('WhereChain') }
+    let(:sentence_chunk) { create(:sentence_chunk) }
+    let(:candidates) { double('candidates') }
+
+    before(:each) do
+      allow(SentenceChunk).to receive(:choose_chunk_from_candidates)
+      allow(SentenceChunk)
+        .to receive(:where).and_return(where_chain)
+
+      allow(where_chain).to receive(:limit).and_return(candidates)
+
+      sentence_chunk.choose_next_chunk
+    end
+
+    it 'finds candidate sentence chunks' do
+      expect(SentenceChunk)
+        .to(have_received(:where)
+        .with('text_sample_id = :text_sample_id AND size = :sentence_chunk_size AND token_ids[1] = 2',
+              { text_sample_id: sentence_chunk.text_sample_id,
+                sentence_chunk_size: 2 }))
+    end
+
+    it 'chooses sentence chunk from candidates' do
+      expect(SentenceChunk)
+        .to(
+          have_received(:choose_chunk_from_candidates).with(candidates)
+        )
+    end
+  end
+
+  describe '::choose_chunk_from_candidates' do
+    let(:counts_array) { [build(:sentence_chunk), build(:sentence_chunk)] }
+    let(:candidates) { double('candidates') }
+
+    before(:each) do
+      allow(SentenceChunk)
+        .to receive(:build_counts_array).and_return(counts_array)
+    end
+
+    it 'calculates probabilities of each sentence chunk' do
+      SentenceChunk.choose_chunk_from_candidates(candidates)
+
+      expect(SentenceChunk)
+        .to(have_received(:build_counts_array).with(candidates))
+    end
+
+    it 'selects a sentence chunk' do
+      new_chunk = SentenceChunk.choose_chunk_from_candidates(candidates)
+
+      expect(new_chunk).to be_instance_of(SentenceChunk)
+    end
+  end
+
+  describe '::build_counts_array' do
+    let!(:sentence_chunk_1_2) { create(:sentence_chunk, token_ids: [1, 2], count: 2) } # rubocop:disable Naming/VariableNumber
+    let!(:sentence_chunk_1_3) { create(:sentence_chunk, token_ids: [1, 3], count: 1) } # rubocop:disable Naming/VariableNumber
+    let(:candidates) { SentenceChunk.all }
+
+    it 'has the right number of elements' do
+      counts_array = SentenceChunk.build_counts_array(candidates)
+
+      expect(counts_array.size).to eq(3)
+    end
+  end
+
+  describe '#to_tokens' do
+    it 'converts an array of token ids to the text of the tokens' do
+      text_sample = TextSample.create!(description: 'Stuff', text: 'take me to the river')
+      text_sample.analyse
+
+      sentence_chunk = SentenceChunk.where("text_sample_id = #{text_sample.id}").first
+      expect(sentence_chunk.to_tokens).to eq(['take', ' '])
+    end
+  end
 end
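
Usage sketch: a minimal example of driving the new SentenceChunk::generate entry
point from a console. It assumes TextSample#analyse builds and counts the
sentence chunks for the sample (the #to_tokens spec above relies on this);
chunk_size and token_size fall back to Setting.chunk_size and Setting.token_size
when omitted, and the description/text values are illustrative only.

  text_sample = TextSample.create!(description: 'River', text: 'take me to the river')
  text_sample.analyse

  # generate 25 tokens (words, spaces and punctuation) from 3-token chunks
  result = SentenceChunk.generate(chunk_size: 3, token_size: 25,
                                  text_sample_id: text_sample.id)
  result[:output].each { |generated| puts generated[:text] }

  # with the default Setting.chunk_size of 'all', omitting chunk_size generates
  # one text per size in CHUNK_SIZE_RANGE (2..8)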