Skip to content

Commit

Permalink
- generate method for SentenceChunk
Browse files Browse the repository at this point in the history
- and methods that support it
- add token_size to settings
- convert token_ids column from serialisable to array
- all tests
  • Loading branch information
charlesdeb committed Apr 20, 2021
1 parent 94f70e7 commit 177c88b
Show file tree
Hide file tree
Showing 7 changed files with 452 additions and 283 deletions.
118 changes: 114 additions & 4 deletions app/models/sentence_chunk.rb
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,14 @@

# for a Sentence Chunk, a 'chunk' is an ordered collection of words, spaces and
# punctuation (all called tokens)
class SentenceChunk < ApplicationRecord
class SentenceChunk < ApplicationRecord # rubocop:disable Metrics/ClassLength
belongs_to :text_sample

validates :token_ids, presence: true # may normalise this later, or convert to a serializable column type
validates :token_ids, presence: true # may normalise this later
validates :size, presence: true
validates :count, presence: true

serialize(:token_ids, Array)
# serialize(:token_ids, Array)

CHUNK_SIZE_RANGE = (2..8).freeze

Expand Down Expand Up @@ -66,7 +66,7 @@ def self.save_chunks(chunks_hash, text_sample_id, chunk_size, save_strategy = :i
end
end

def self.save_chunks_by_insert_all( # rubocop:disable Metrics/MethodLength
def self.save_chunks_by_insert_all( # rubocop:disable Metrics/MethodLength
chunks_hash, text_sample_id, chunk_size
)
current_time = DateTime.now
Expand All @@ -84,4 +84,114 @@ def self.save_chunks_by_insert_all( # rubocop:disable Metrics/MethodLength
end
SentenceChunk.insert_all! import_array
end

# Entry point for generating text using the sentence chunk strategy
#
# @param [Hash] params parameters to generate with
# @option params [Integer, String] :chunk_size chunk size to use for generation, or 'all' for every size in CHUNK_SIZE_RANGE
# @option params [Integer] :token_size number of tokens to generate
# @option params [Integer] :text_sample_id TextSample to use as the model
# @return [Hash] { output: [...] } with one generate_text result per chunk
#   size used, or { message: ... } when no chunks exist for the text sample
def self.generate(params = {})
  if chunks_built_for?(params[:text_sample_id])
    chunk_size, token_size, text_sample_id = extract_generate_params(params)

    # 'all' asks for one piece of text per supported chunk size
    chunk_sizes = chunk_size == 'all' ? CHUNK_SIZE_RANGE : [chunk_size]
    output = chunk_sizes.map do |current_chunk_size|
      generate_text(current_chunk_size, token_size, text_sample_id)
    end

    { output: output }
  else
    { message: 'Sentence chunks have not been built for this text sample' }
  end
end

# Helper method that pulls individual parameters out of params or sets
# reasonable defaults
# @param (see ::generate)
# @return [Array] [chunk_size, token_size, text_sample_id]
def self.extract_generate_params(params = {})
  # to_i turns nil and non-numeric strings (such as 'all') into 0, which is
  # treated as "not supplied" and replaced by the configured Setting value
  requested_chunk_size = params[:chunk_size].to_i
  requested_token_size = params[:token_size].to_i

  [
    requested_chunk_size.zero? ? Setting.chunk_size : requested_chunk_size,
    requested_token_size.zero? ? Setting.token_size : requested_token_size,
    params[:text_sample_id]
  ]
end

# Reports whether any sentence chunks have been stored for a text sample.
#
# @param text_sample_id [Integer] id of the TextSample to check
# @return [Boolean] true when at least one SentenceChunk row references it
def self.chunks_built_for?(text_sample_id)
  # exists? only asks the database whether a matching row is there, instead
  # of loading and instantiating a record just to test it for nil
  SentenceChunk.exists?(text_sample_id: text_sample_id)
end

# Builds one piece of generated text for a single chunk size.
#
# Starts from a randomly chosen chunk, then repeatedly picks a follow-on
# chunk and appends its last token until token_size tokens are collected.
#
# @param chunk_size [Integer] size of the chunks to generate from
# @param token_size [Integer] number of tokens to aim for; the result is
#   shorter if generation reaches a chunk with no follow-on chunk
# @param text_sample_id [Integer] id of the TextSample used as the model
# @return [Hash] { text: <joined tokens>, chunk_size: chunk_size }; text is
#   '' when no chunk of this size exists for the text sample
def self.generate_text(chunk_size, token_size, text_sample_id)
  chunk = choose_starting_chunk(text_sample_id, chunk_size)
  # No chunks of this size were stored for the sample: nothing to generate
  return { text: '', chunk_size: chunk_size } if chunk.nil?

  # Copy, so that appending below does not mutate the chunk's own token_ids
  output_token_ids = chunk.token_ids.dup
  while output_token_ids.size < token_size
    chunk = chunk.choose_next_chunk
    # Dead end: no stored chunk continues the current one, so stop early
    break if chunk.nil?

    output_token_ids << chunk.token_ids.last
  end

  output = Token.replace_token_ids_with_tokens(output_token_ids).join

  { text: output, chunk_size: chunk_size }
end

# Picks, uniformly at random, one chunk of the given size belonging to the
# text sample. Returns nil when there are no such chunks.
def self.choose_starting_chunk(text_sample_id, chunk_size)
  pool = SentenceChunk.where({ text_sample_id: text_sample_id, size: chunk_size }).limit(nil)

  # rand is in [0, 1), so the scaled value floors to a valid index
  random_index = (rand * pool.size).floor
  pool[random_index]
end

# Choose the next word chunk after this one
def choose_next_chunk
  # Every token of this chunk except the first must line up with the start
  # of the next chunk. PostgreSQL arrays are 1-indexed, hence with_index(1).
  overlap_conditions = token_ids[1..].each.with_index(1).map do |token_id, position|
    "token_ids[#{position}] = #{token_id}"
  end
  overlap_sql = overlap_conditions.join(' AND ')

  candidates = SentenceChunk
               .where("text_sample_id = :text_sample_id AND size = :sentence_chunk_size AND #{overlap_sql}",
                      text_sample_id: text_sample.id, sentence_chunk_size: size)
               .limit(nil)

  # The pick among the matching chunks is delegated to the class-level helper
  SentenceChunk.choose_chunk_from_candidates(candidates)
end

# Picks one chunk at random from the candidates, using the array built by
# build_counts_array as the pool. Returns nil when that pool is empty.
def self.choose_chunk_from_candidates(candidates)
  weighted_pool = SentenceChunk.build_counts_array(candidates)

  # rand is in [0, 1), so the scaled value floors to a valid index
  random_index = (rand * weighted_pool.size).floor
  weighted_pool[random_index]
end

# Expands the candidates into a flat array in which each chunk appears
# `count` times, preserving candidate order. A uniform pick from the result
# is therefore weighted by each chunk's count.
def self.build_counts_array(candidates)
  candidates.flat_map do |chunk|
    chunk.count.times.map { chunk }
  end
end

# helper method for converting an array of token_ids back to an array of
# readable text
def to_tokens
  # Token owns the id-to-token lookup; hand it this chunk's ids unchanged
  ids = token_ids
  Token.replace_token_ids_with_tokens(ids)
end
end
1 change: 1 addition & 0 deletions app/models/setting.rb
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ class Setting < RailsSettings::Base
field :generate_strategy, type: :string, default: 'word_chunk'
field :chunk_size, type: :string, default: 'all'
field :output_size, type: :integer, default: 250
field :token_size, type: :integer, default: 250
field :prior_word_count, type: :string, default: 'all'

# field :host, type: :string, default: "http://localhost:3000"
Expand Down
4 changes: 3 additions & 1 deletion app/models/word_chunk.rb
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,9 @@ def self.save_word_chunks(
end
end

def self.save_word_chunks_by_insert_all(chunks_hash, text_sample, chunk_size)
def self.save_word_chunks_by_insert_all( # rubocop:disable Metrics/MethodLength
chunks_hash, text_sample, chunk_size
)
current_time = DateTime.now
import_array = []
chunks_hash.each do |chunk_text, count|
Expand Down
12 changes: 12 additions & 0 deletions db/migrate/20210417225204_convert_token_ids_to_array.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# This will destroy any data in the token_ids column of the sentence_chunks table
class ConvertTokenIdsToArray < ActiveRecord::Migration[6.0]
  # Replaces sentence_chunks.token_ids with an integer array column.
  # The column is dropped and recreated rather than converted, so existing
  # values are lost, and the index on the old column goes with it.
  def up
    remove_column(:sentence_chunks, :token_ids)
    add_column(:sentence_chunks, :token_ids, :integer, array: true)
  end

  # Restores the pre-migration shape of the column: a string column with
  # its index. Values are not restored.
  def down
    remove_column(:sentence_chunks, :token_ids)
    add_column(:sentence_chunks, :token_ids, :string)
    add_index(:sentence_chunks, :token_ids)
  end
end
5 changes: 2 additions & 3 deletions db/schema.rb
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
#
# It's strongly recommended that you check this file into your version control system.

ActiveRecord::Schema.define(version: 2021_04_14_221302) do
ActiveRecord::Schema.define(version: 2021_04_17_225204) do

# These are extensions that must be enabled in order to support this database
enable_extension "plpgsql"
Expand All @@ -19,12 +19,11 @@
t.integer "size", null: false
t.integer "count", null: false
t.bigint "text_sample_id", null: false
t.string "token_ids"
t.datetime "created_at", precision: 6, null: false
t.datetime "updated_at", precision: 6, null: false
t.integer "token_ids", array: true
t.index ["size"], name: "index_sentence_chunks_on_size"
t.index ["text_sample_id"], name: "index_sentence_chunks_on_text_sample_id"
t.index ["token_ids"], name: "index_sentence_chunks_on_token_ids"
end

create_table "settings", force: :cascade do |t|
Expand Down
11 changes: 11 additions & 0 deletions spec/factories/sentence_chunk.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# frozen_string_literal: true

FactoryBot.define do
  # Builds a SentenceChunk of size 2 with token ids [1, 2] and a count of 1,
  # attached to a text sample from the :text_sample_two_chars factory.
  # The model class is inferred from the factory name, so no explicit class
  # constant is referenced while the factories are being defined.
  factory :sentence_chunk do
    size { 2 }
    token_ids { [1, 2] }
    count { 1 }
    association :text_sample, factory: :text_sample_two_chars
  end
end
Loading

0 comments on commit 177c88b

Please sign in to comment.