1 change: 1 addition & 0 deletions .gitignore
@@ -1,3 +1,4 @@
.DS_Store
docs/build
Manifest.toml
.vscode/
4 changes: 2 additions & 2 deletions src/TextAnalysis.jl
@@ -23,7 +23,7 @@ module TextAnalysis
export Corpus, DirectoryCorpus
export stemmer_types, Stemmer
export DocumentTermMatrix
export text, tokens, ngrams
export text, tokens, ngrams, ordered_vocab
export text!, tokens!, ngrams!
export documents
export language, title, author, timestamp
@@ -112,4 +112,4 @@ module TextAnalysis
function __init__()

end
end
end
27 changes: 16 additions & 11 deletions src/coom.jl
@@ -22,26 +22,31 @@ or not the counts by the distance between word positions. The `mode` keyword can
julia> using TextAnalysis, DataStructures
doc = StringDocument("This is a text about an apple. There are many texts about apples.")
docv = TextAnalysis.tokenize(language(doc), text(doc))
vocab = OrderedDict("This"=>1, "is"=>2, "apple."=>3)
vocab = ordered_vocab(doc)
TextAnalysis.coo_matrix(Float16, docv, vocab, 5, true)

3×3 SparseArrays.SparseMatrixCSC{Float16,Int64} with 4 stored entries:
[2, 1] = 2.0
[1, 2] = 2.0
[3, 2] = 0.3999
[2, 3] = 0.3999
13×13 SparseArrays.SparseMatrixCSC{Float16, Int64} with 106 stored entries:
⋅ 2.0 1.0 0.6665 0.5 0.4 ⋅ ⋅ ⋅ ⋅ ⋅ ⋅ ⋅
2.0 ⋅ 2.0 1.0 0.6665 0.5 0.4 ⋅ ⋅ ⋅ ⋅ ⋅ ⋅
1.0 2.0 ⋅ 2.0 1.0 0.6665 0.5 0.4 ⋅ ⋅ ⋅ ⋅ ⋅
⋮ ⋮ ⋮
⋅ ⋅ ⋅ ⋅ 2.0 ⋅ 0.4 1.166 0.6665 1.0 2.0 ⋅ 1.0
⋅ ⋅ ⋅ ⋅ 2.0 ⋅ ⋅ 2.0 0.4 0.5 0.6665 1.0 ⋅

julia> using TextAnalysis, DataStructures
doc = StringDocument("This is a text about an apple. There are many texts about apples.")
docv = TextAnalysis.tokenize(language(doc), text(doc))
vocab = OrderedDict("This"=>1, "is"=>2, "apple."=>3)
vocab = ordered_vocab(doc)
TextAnalysis.coo_matrix(Float16, docv, vocab, 5, true, :directional)

3×3 SparseArrays.SparseMatrixCSC{Float16,Int64} with 4 stored entries:
[2, 1] = 1.0
[1, 2] = 1.0
[3, 2] = 0.1999
[2, 3] = 0.1999
13×13 SparseArrays.SparseMatrixCSC{Float16, Int64} with 106 stored entries:
⋅ 1.0 0.5 0.3333 0.25 0.2 ⋅ ⋅ ⋅ ⋅ ⋅ ⋅ ⋅
1.0 ⋅ 1.0 0.5 0.3333 0.25 0.2 ⋅ ⋅ ⋅ ⋅ ⋅ ⋅
0.5 1.0 ⋅ 1.0 0.5 0.3333 0.25 0.2 ⋅ ⋅ ⋅ ⋅ ⋅
⋮ ⋮ ⋮
⋅ ⋅ ⋅ ⋅ 1.0 ⋅ 0.2 0.583 0.3333 0.5 1.0 ⋅ 0.5
⋅ ⋅ ⋅ ⋅ 1.0 ⋅ ⋅ 1.0 0.2 0.25 0.3333 0.5 ⋅
```
"""
function coo_matrix(::Type{T},
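For reviewers who want to verify the updated docstring locally, here is a minimal, self-contained sketch of the same calls. It only uses names that appear in the docstring above (`ordered_vocab` is the helper added by this PR), so it should reproduce the 13×13 outputs shown there:

```julia
using TextAnalysis

doc   = StringDocument("This is a text about an apple. There are many texts about apples.")
docv  = TextAnalysis.tokenize(language(doc), text(doc))
vocab = ordered_vocab(doc)  # 13 unique tokens, hence the 13×13 matrices above

# Window of 5, counts normalized by the distance between word positions.
C = TextAnalysis.coo_matrix(Float16, docv, vocab, 5, true)

# Same call with mode = :directional; note the symmetric weights are halved
# relative to C (2.0 -> 1.0, 0.4 -> 0.2) in the docstring output.
Cd = TextAnalysis.coo_matrix(Float16, docv, vocab, 5, true, :directional)
```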
87 changes: 76 additions & 11 deletions src/document.jl
Expand Up @@ -46,7 +46,7 @@ end
#
##############################################################################

abstract type AbstractDocument; end
abstract type AbstractDocument end


mutable struct FileDocument <: AbstractDocument
@@ -142,7 +142,7 @@ A TokenDocument{String}
function TokenDocument(txt::AbstractString, dm::DocumentMetadata)
TokenDocument(tokenize(dm.language, String(txt)), dm)
end
function TokenDocument(tkns::Vector{T}) where T <: AbstractString
function TokenDocument(tkns::Vector{T}) where {T<:AbstractString}
TokenDocument(tkns, DocumentMetadata())
end
TokenDocument(txt::AbstractString) = TokenDocument(String(txt), DocumentMetadata())
@@ -189,7 +189,7 @@ end
function NGramDocument(txt::AbstractString, n::Integer...=1)
NGramDocument(txt, DocumentMetadata(), n...)
end
function NGramDocument(ng::Dict{T, Int}, n::Integer...=1) where T <: AbstractString
function NGramDocument(ng::Dict{T,Int}, n::Integer...=1) where {T<:AbstractString}
NGramDocument(merge(Dict{AbstractString,Int}(), ng), (length(n) == 1) ? Int(first(n)) : Int[n...], DocumentMetadata())
end

@@ -270,17 +270,82 @@ julia> tokens(sd)
"."
```
"""
tokens(d::(Union{FileDocument, StringDocument})) = tokenize(language(d), text(d))
tokens(d::(Union{FileDocument,StringDocument})) = tokenize(language(d), text(d))
tokens(d::TokenDocument) = d.tokens
function tokens(d::NGramDocument)
error("The tokens of an NGramDocument cannot be reconstructed")
end

tokens!(d::TokenDocument, new_tokens::Vector{T}) where {T <: AbstractString} = (d.tokens = new_tokens)
function tokens!(d::AbstractDocument, new_tokens::Vector{T}) where T <: AbstractString
tokens!(d::TokenDocument, new_tokens::Vector{T}) where {T<:AbstractString} = (d.tokens = new_tokens)
function tokens!(d::AbstractDocument, new_tokens::Vector{T}) where {T<:AbstractString}
error("The tokens of a $(typeof(d)) cannot be directly edited")
end


##############################################################################
#
# ordered_vocab(): Access to document text as an ordered vocabulary
#
# to_string_vector(): Helper function for creating a vocabulary from a StringDocument or a Vector{String}
#
##############################################################################
# Converts a StringDocument to Vector{String}
to_string_vector(doc::StringDocument) = tokens(doc)
# Identity function for Vector{String}
to_string_vector(vec::Vector{String}) = vec

"""
ordered_vocab(input::Union{StringDocument, Vector{String}}) -> OrderedDict{String, Int}

Create an ordered dictionary from a `StringDocument` or a `Vector` of strings, useful for building co-occurrence matrices with `coo_matrix()` (cf. the example below). The dictionary maps each unique string to its corresponding index.

# Arguments
- `input::Union{StringDocument, Vector{String}}`: Input can be either a `StringDocument` or a `Vector{String}`.
For `StringDocument`, the tokens are extracted and used. For `Vector{String}`, the vector itself is used.

# Returns
- `OrderedDict{String, Int}`: An ordered dictionary where each key is a unique string from the input,
   and the value is its index in order of first appearance.

# Examples
```julia-repl
julia> doc = StringDocument("This is a sample sentence of a sample document.");

julia> ordered_vocab(doc)
OrderedDict{String, Int64} with 8 entries:
  "This"     => 1
  "is"       => 2
  "a"        => 3
  "sample"   => 4
  "sentence" => 5
  ⋮          => ⋮

julia> str_vec = ["This", "is", "a", "sample", "sentence", "of", "a", "sample", "document"];

julia> ordered_vocab(str_vec)
OrderedDict{String, Int64} with 7 entries:
  "This"     => 1
  "is"       => 2
  "a"        => 3
  "sample"   => 4
  "sentence" => 5
  ⋮          => ⋮
```
"""
function ordered_vocab(input::Union{StringDocument,Vector{String}})
string_vector = to_string_vector(input) |> unique

# preallocating the ordered dictionary with the size of the string_vector
ordered_dict = OrderedDict{String,Int}()
sizehint!(ordered_dict, length(string_vector))

# populating the ordered dictionary
for (index, key) in enumerate(string_vector)
ordered_dict[key] = index
end
return ordered_dict
end
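As a quick review aid, a behavioral sketch of the new helper. This follows directly from the `unique` + `enumerate` combination above: duplicates are dropped and each token keeps the index of its first appearance.

```julia
using TextAnalysis

vocab = ordered_vocab(["to", "be", "or", "not", "to", "be"])
# OrderedDict("to" => 1, "be" => 2, "or" => 3, "not" => 4)
# The repeated "to" and "be" do not get new indices.
```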


##############################################################################
#
# ngrams() / ngrams!(): Access to document text as n-gram counts
@@ -322,7 +387,7 @@ ngrams(d::AbstractDocument, n::Integer...) = ngramize(language(d), tokens(d), n.
ngrams(d::NGramDocument) = d.ngrams
ngrams(d::AbstractDocument) = ngrams(d, 1)

ngrams!(d::NGramDocument, new_ngrams::Dict{AbstractString, Int}) = (d.ngrams = new_ngrams)
ngrams!(d::NGramDocument, new_ngrams::Dict{AbstractString,Int}) = (d.ngrams = new_ngrams)
function ngrams!(d::AbstractDocument, new_ngrams::Dict)
error("The n-grams of $(typeof(d)) cannot be directly edited")
end
@@ -371,8 +436,8 @@ const GenericDocument = Union{
##############################################################################

Document(str::AbstractString) = isfile(str) ? FileDocument(str) : StringDocument(str)
Document(tkns::Vector{T}) where {T <: AbstractString} = TokenDocument(tkns)
Document(ng::Dict{String, Int}) = NGramDocument(ng)
Document(tkns::Vector{T}) where {T<:AbstractString} = TokenDocument(tkns)
Document(ng::Dict{String,Int}) = NGramDocument(ng)
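The two definitions above are only reformatted, but for context here is a minimal sketch of the dispatch they implement (the string method falls back to `StringDocument` whenever no file with that name exists):

```julia
using TextAnalysis

Document("some plain text")              # -> StringDocument (no such file on disk)
Document(["some", "plain", "text"])      # -> TokenDocument
Document(Dict{String,Int}("text" => 1))  # -> NGramDocument
```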

##############################################################################
#
@@ -383,11 +448,11 @@ Document(ng::Dict{String, Int}) = NGramDocument(ng)
function Base.convert(::Type{StringDocument}, d::FileDocument)
StringDocument(text(d), d.metadata)
end
function Base.convert(::Type{TokenDocument}, d::(Union{FileDocument, StringDocument}))
function Base.convert(::Type{TokenDocument}, d::(Union{FileDocument,StringDocument}))
TokenDocument(tokens(d), d.metadata)
end
function Base.convert(::Type{NGramDocument},
d::(Union{FileDocument, StringDocument, TokenDocument}))
d::(Union{FileDocument,StringDocument,TokenDocument}))
NGramDocument(ngrams(d), 1, d.metadata)
end
Base.convert(::Type{TokenDocument}, d::TokenDocument) = d
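The `convert` methods in this hunk are whitespace-only changes; as a reminder of the chain they define, a minimal usage sketch:

```julia
using TextAnalysis

sd = StringDocument("To be or not to be")
td = convert(TokenDocument, sd)  # tokenizes the text, keeps the metadata
nd = convert(NGramDocument, td)  # unigram counts, keeps the metadata
ngrams(nd)["be"]                 # == 2
```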
22 changes: 13 additions & 9 deletions test/document.jl
@@ -1,13 +1,14 @@
using DataStructures: OrderedDict

@testset "Document" begin

dmeta = TextAnalysis.DocumentMetadata(Languages.English(), "test title", "test author", "test time", Dict(:k1=>"v1", :k2=>"v2"))
@test (dmeta.language == Languages.English()) &&
(dmeta.title == "test title") &&
(dmeta.author == "test author") &&
(dmeta.timestamp == "test time") &&
(get(dmeta.custom, :k1, "") == "v1") &&
(get(dmeta.custom, :k2, "") == "v2")
dmeta = TextAnalysis.DocumentMetadata(Languages.English(), "test title", "test author", "test time", Dict(:k1 => "v1", :k2 => "v2"))
@test (dmeta.language == Languages.English()) &&
(dmeta.title == "test title") &&
(dmeta.author == "test author") &&
(dmeta.timestamp == "test time") &&
(get(dmeta.custom, :k1, "") == "v1") &&
(get(dmeta.custom, :k2, "") == "v2")

# mutability
dmeta.custom = nothing
@@ -34,6 +35,9 @@
@test "a" in keys(ngrams(sd, 1))
@test "string" in keys(ngrams(sd, 1))

@test ordered_vocab(sd) == OrderedDict("This" => 1, "is" => 2, "a" => 3, "string" => 4)
@test ordered_vocab(["This", "is", "a", "string"]) == OrderedDict("This" => 1, "is" => 2, "a" => 3, "string" => 4)

@test length(sd) == 16

hamlet_text = "To be or not to be..."
@@ -79,8 +83,8 @@
@test isequal(length(Document("this is text")), 12)

# NGramDocument creation with multiple ngram complexity
let N=((), (2,), (Int32(2),), (1,2), (Int32(1), Int16(2))), C=(1, 2, 2, [1,2], [1,2]), L=(4, 3, 3, 7, 7)
for (n,c,l) in zip(N,C,L)
let N = ((), (2,), (Int32(2),), (1, 2), (Int32(1), Int16(2))), C = (1, 2, 2, [1, 2], [1, 2]), L = (4, 3, 3, 7, 7)
for (n, c, l) in zip(N, C, L)
ngd = NGramDocument(sample_text1, n...)
@test ngram_complexity(ngd) == c
@test length(ngd.ngrams) == l