Skip to content

Commit

Permalink
Publish kanpyo
Browse files Browse the repository at this point in the history
  • Loading branch information
togatoga committed Jun 23, 2024
1 parent c4846a2 commit 709f5c6
Show file tree
Hide file tree
Showing 6 changed files with 93 additions and 30 deletions.
13 changes: 8 additions & 5 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,16 @@
name = "kanpyo"
version = "0.1.0"
edition = "2021"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
exclude = ["kanpyo-dict/"]
description = "Japanese Morphological Analyzer"
keywords = ["japanese", "morphological", "analyzer"]
categories = ["text-processing", "natural-language-processing"]
license = "MIT"

[dependencies]
clap = { version = "4.5.7", features = ["derive"] }
kanpyo-dict = { path = "kanpyo-dict" }

dirs = "5.0.1"
kanpyo-dict = { version = "0.1.0", path = "kanpyo-dict" }

[workspace]
members = [
Expand All @@ -17,4 +20,4 @@ members = [

[[bin]]
name = "kanpyo"
path = "src/bin/kanpyo.rs"
path = "src/bin/kanpyo.rs"
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ Kanpyo is Japanese morphological analyzer written in Rust inspired by [Kagome](h

## Caution

This is a work in progress and not yet ready for use.
This is a work in progress. The API may change without notice.

### TODO

Expand Down
4 changes: 4 additions & 0 deletions kanpyo-dict/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,9 @@
name = "kanpyo-dict"
version = "0.1.0"
edition = "2021"
exclude = ["resource/"]
description = "Dictionary Library for Kanpyo"
license = "MIT"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[[bin]]
Expand All @@ -13,6 +16,7 @@ anyhow = "1.0.83"
bincode = "1.3.3"
clap = { version = "4.4.6", features = ["derive"] }
csv = "1.3.0"
dirs = "5.0.1"
encoding_rs = "0.8.33"
itertools = "0.13.0"
regex = "1.10.5"
Expand Down
16 changes: 14 additions & 2 deletions kanpyo-dict/src/bin/ipa_dict_builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,15 +11,23 @@ enum Encoding {
Utf8,
}

/// Returns the default location of the built dictionary as a `String`:
/// the file `ipa.dict` inside a `kanpyo` directory under the directory
/// reported by `dirs::config_dir()`.
///
/// # Panics
///
/// Panics if `dirs::config_dir()` returns `None`, or if the resulting path
/// cannot be converted into a `String`.
fn get_default_output_path() -> String {
    let config_root = dirs::config_dir().expect("failed to get config dir");
    let dict_file = config_root.join("kanpyo").join("ipa.dict");
    dict_file
        .into_os_string()
        .into_string()
        .expect("failed to convert path to string")
}

#[derive(Parser)]
#[command(name = "IPAdic builder", about = "Builds an ipa.dict", version = "0.1", long_about=None)]
struct IPADictBuilderCommand {
/// Path of input dict, e.g. mecab-ipadic-2.7.0-20070801
#[arg(short, long, default_value = "resource/mecab-ipadic-2.7.0-20070801")]
dict: PathBuf,
/// Path of output dict, e.g. ipa.dict
#[arg(short, long, default_value = "ipa.dict")]
out: PathBuf,
#[arg(short, long, default_value_t = get_default_output_path())]
out: String,
// Encoding of input dict
#[arg(short, long, default_value = "euc-jp")]
encoding: Encoding,
Expand All @@ -34,8 +42,12 @@ impl IPADictBuilderCommand {
let config = Config::new(&self.dict, encoding);
let dict = builder::build(&config);

let path = PathBuf::from(&self.out);
std::fs::create_dir_all(path.parent().expect("failed to get parent dir"))
.expect("failed to create dir");
let mut output = std::fs::File::create(&self.out).expect("failed to create file");
dict.build(&mut output).expect("failed to build dict");
println!("Built ipa.dict to {}", self.out)
}
}

Expand Down
Empty file removed kanpyo-dict/src/content.rs
Empty file.
88 changes: 66 additions & 22 deletions src/bin/kanpyo.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
use clap::{Parser, Subcommand};
use clap::{Parser, Subcommand, ValueEnum};
use kanpyo::{lattice::node::BOS_EOS_ID, tokenizer::Tokenzier};
use kanpyo_dict::dict;
use std::path::PathBuf;
Expand All @@ -18,18 +18,24 @@ enum SubCommand {
/// Input text to analyze [default: stdin]
#[arg(index = 1)]
input: Option<String>,
/// Dictionary file
#[arg(short, long, default_value = "kanpyo-dict/ipa.dict")]
dict: PathBuf,
/// Dictionary
#[arg(short, long, value_enum, default_value = "ipa")]
dict: Dict,
/// Custom dictionary
#[arg(short, long)]
custom_dict: Option<PathBuf>,
},
/// Output lattice in Graphviz format
Graphviz {
/// Input text to analyze
#[arg(index = 1)]
input: String,
/// Dictionary file
#[arg(short, long, default_value = "kanpyo-dict/ipa.dict")]
dict: PathBuf,
/// Dictionary
#[arg(short, long, value_enum, default_value = "ipa")]
dict: Dict,
/// Custom dictionary
#[arg(short, long)]
custom_dict: Option<PathBuf>,
/// Output full state of lattice
#[arg(short, long, default_value = "false")]
full_state: bool,
Expand All @@ -39,49 +45,87 @@ enum SubCommand {
},
}

// Predefined dictionaries selectable by name through the `--dict` option
// (declared with `value_enum`, default value "ipa").
// NOTE(review): plain `//` comments are used here because clap's derive may
// surface `///` text in `--help` output -- confirm before converting them.
#[derive(Debug, Clone, ValueEnum)]
enum Dict {
// IPA dictionary; `get_dict_path` resolves it to the file `ipa.dict`.
Ipa,
// Unidic,
}

/// Resolves the on-disk location of a predefined dictionary.
///
/// The result is `<config dir>/kanpyo/<file name>`, where `<config dir>` is
/// whatever `dirs::config_dir()` reports and the file name is chosen by
/// `dict`.
///
/// # Panics
///
/// Panics if `dirs::config_dir()` returns `None`.
fn get_dict_path(dict: Dict) -> PathBuf {
    let config_root = dirs::config_dir().expect("failed to get config dir");
    let file_name = match dict {
        Dict::Ipa => "ipa.dict",
        // A future `Dict::Unidic` variant would map to "unidic.dict".
    };
    config_root.join("kanpyo").join(file_name)
}

impl KanpyoCommand {
fn tokenize(input: Option<String>, dict: PathBuf) {
let mut reader =
std::io::BufReader::new(std::fs::File::open(dict).expect("failed to open dict"));
let tokenzier = Tokenzier::new(dict::Dict::load(&mut reader).expect("failed to load dict"));
/// Builds a tokenizer backed by a dictionary file.
///
/// When `custom_dict` is `Some`, that file is loaded; otherwise the path of
/// the predefined dictionary `dict` is obtained from `get_dict_path`.
///
/// # Panics
///
/// Panics if the dictionary file cannot be opened or if
/// `dict::Dict::load` returns an error.
fn tokenizer(dict: Dict, custom_dict: Option<PathBuf>) -> Tokenzier {
    let dict_file = custom_dict.unwrap_or_else(|| get_dict_path(dict));
    // The file may be either the custom or the predefined dictionary, so the
    // message names the actual path and the OS error instead of always
    // claiming a "custom dict" failed to open.
    let file = std::fs::File::open(&dict_file).unwrap_or_else(|err| {
        panic!("failed to open dict {}: {}", dict_file.display(), err)
    });
    let mut reader = std::io::BufReader::new(file);
    Tokenzier::new(dict::Dict::load(&mut reader).expect("failed to load dict"))
}

fn tokenize(input: Option<String>, dict: Dict, custom_dict: Option<PathBuf>) {
let tokenizer = KanpyoCommand::tokenizer(dict, custom_dict);
loop {
match &input {
Some(text) => {
print_tokens(tokenzier.tokenize(text), &tokenzier.dict);
print_tokens(tokenizer.tokenize(text), &tokenizer.dict);
break;
}
None => {
let mut buf = String::new();
std::io::stdin()
.read_line(&mut buf)
.expect("failed to read from stdin");
print_tokens(tokenzier.tokenize(buf.trim_end()), &tokenzier.dict);
if buf.is_empty() {
break;
}
print_tokens(tokenizer.tokenize(buf.trim_end()), &tokenizer.dict);
}
};
}
}
fn graphviz(input: String, dict: PathBuf, dpi: usize, full_state: bool) {
let mut reader =
std::io::BufReader::new(std::fs::File::open(dict).expect("failed to open dict"));
let tokenzier = Tokenzier::new(dict::Dict::load(&mut reader).expect("failed to load dict"));
fn graphviz(
input: String,
dict: Dict,
custom_dict: Option<PathBuf>,
dpi: usize,
full_state: bool,
) {
let tokenzier = KanpyoCommand::tokenizer(dict, custom_dict);
let lattice = kanpyo::lattice::Lattice::build(&tokenzier.dict, &input);
kanpyo::graphviz::Graphviz { lattice }.graphviz(dpi, full_state);
}
fn run(self) {
match self.subcommand {
Some(SubCommand::Tokenize { input, dict }) => {
KanpyoCommand::tokenize(input, dict);
Some(SubCommand::Tokenize {
input,
dict,
custom_dict,
}) => {
KanpyoCommand::tokenize(input, dict, custom_dict);
}
Some(SubCommand::Graphviz {
input,
dict,
custom_dict,
dpi,
full_state
full_state,
}) => {
KanpyoCommand::graphviz(input, dict, dpi, full_state);
KanpyoCommand::graphviz(input, dict, custom_dict, dpi, full_state);
}
None => {
KanpyoCommand::tokenize(None, PathBuf::from("dict/ipa.dict"));
KanpyoCommand::tokenize(None, Dict::Ipa, None);
}
}
}
Expand Down

0 comments on commit 709f5c6

Please sign in to comment.