about summary refs log tree commit diff
diff options
context:
space:
mode:
author Alex Auvolat <alex@adnab.me> 2024-04-03 20:08:35 +0200
committer Alex Auvolat <alex@adnab.me> 2024-04-03 20:08:35 +0200
commit 62e6641203f6af2c7ad90cace7fff045f867218e (patch)
tree 0df15f38a2e4be2ef55a1765c0413b23a4840b73
parent 0fde35d584a4ff19db60e632ed0896848934659d (diff)
download datagengo-62e6641203f6af2c7ad90cace7fff045f867218e.tar.gz
datagengo-62e6641203f6af2c7ad90cace7fff045f867218e.zip
preprocess jmdict & add logging
-rw-r--r-- .gitignore 1
-rw-r--r-- Cargo.lock 60
-rw-r--r-- Cargo.toml 3
-rw-r--r-- src/datafiles.rs 12
-rw-r--r-- src/example.rs 21
-rw-r--r-- src/main.rs 33
-rw-r--r-- src/server.rs 91
7 files changed, 149 insertions, 72 deletions
diff --git a/.gitignore b/.gitignore
index d77b1f4..f1d74b4 100644
--- a/.gitignore
+++ b/.gitignore
@@ -3,5 +3,6 @@
kanjidic*.xml
JMdict*.xml
examples.utf
+jmdict_idx.json
public
diff --git a/Cargo.lock b/Cargo.lock
index 613b503..624cc3e 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -684,7 +684,9 @@ dependencies = [
"fasthash",
"futures",
"http-types",
+ "log",
"markdown",
+ "pretty_env_logger",
"rand 0.8.5",
"rayon",
"regex",
@@ -717,6 +719,19 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7fcaabb2fef8c910e7f4c7ce9f67a1283a1715879a7c230ca9d6d1ae31f16d91"
[[package]]
+name = "env_logger"
+version = "0.10.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4cd405aab171cb85d6735e5c8d9db038c17d3ca007a4d2c25f337935c3d90580"
+dependencies = [
+ "humantime",
+ "is-terminal",
+ "log",
+ "regex",
+ "termcolor",
+]
+
+[[package]]
name = "erased-serde"
version = "0.4.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -1134,6 +1149,12 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d897f394bad6a705d5f4104762e116a75639e470d80901eed05a860a95cb1904"
[[package]]
+name = "humantime"
+version = "2.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4"
+
+[[package]]
name = "iana-time-zone"
version = "0.1.60"
source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -1193,6 +1214,17 @@ dependencies = [
]
[[package]]
+name = "is-terminal"
+version = "0.4.12"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f23ff5ef2b80d608d61efee834934d862cd92461afc0560dedf493e4c033738b"
+dependencies = [
+ "hermit-abi 0.3.2",
+ "libc",
+ "windows-sys 0.52.0",
+]
+
+[[package]]
name = "itoa"
version = "1.0.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -1422,6 +1454,16 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de"
[[package]]
+name = "pretty_env_logger"
+version = "0.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "865724d4dbe39d9f3dd3b52b88d859d66bcb2d6a0acfd5ea68a65fb66d4bdc1c"
+dependencies = [
+ "env_logger",
+ "log",
+]
+
+[[package]]
name = "proc-macro-error"
version = "1.0.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -2026,6 +2068,15 @@ dependencies = [
]
[[package]]
+name = "termcolor"
+version = "1.4.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "06794f8f6c5c898b3275aebefa6b8a1cb24cd2c6c79397ab15774837a0bc5755"
+dependencies = [
+ "winapi-util",
+]
+
+[[package]]
name = "textwrap"
version = "0.11.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -2368,6 +2419,15 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6"
[[package]]
+name = "winapi-util"
+version = "0.1.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f29e6f9198ba0d26b4c9f07dbe6f9ed633e1f3d5b8b414090084349e46a52596"
+dependencies = [
+ "winapi",
+]
+
+[[package]]
name = "winapi-x86_64-pc-windows-gnu"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
diff --git a/Cargo.toml b/Cargo.toml
index db5359f..326054f 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -17,6 +17,9 @@ rayon = "1.7"
regex = "1.0"
roxmltree = "0.18"
+log = "0.4"
+pretty_env_logger = "0.5.0"
+
tide = "0.16.0"
futures = "0.3"
async-std = { version = "1.6.0", features = ["attributes"] }
diff --git a/src/datafiles.rs b/src/datafiles.rs
index fc6194f..d4f948d 100644
--- a/src/datafiles.rs
+++ b/src/datafiles.rs
@@ -23,9 +23,10 @@ pub struct Example {
// PARSING DATA FILES
// =====================================================================
+#[derive(Serialize, Deserialize)]
pub struct DictEntry {
pub reb: String,
- pub ent_seq: String,
+ pub ent_seq: u64,
pub sense: Box<[String]>,
}
@@ -44,7 +45,7 @@ pub fn index_jmdict(dict: &roxmltree::Document) -> DictIndex {
let reb = reb.text().unwrap().trim().to_string();
let ent_seq = ent.children().find(|x| x.has_tag_name("ent_seq")).unwrap();
- let ent_seq = ent_seq.text().unwrap().trim().to_string();
+ let ent_seq = ent_seq.text().unwrap().trim().parse().unwrap();
let sense = ent
.children()
@@ -70,6 +71,11 @@ pub fn index_jmdict(dict: &roxmltree::Document) -> DictIndex {
ret
}
+pub fn read_jmdict_idx() -> Result<DictIndex> {
+ let file = fs::read("data/jmdict_idx.json")?;
+ Ok(serde_json::from_slice::<DictIndex>(&file)?)
+}
+
pub fn parse_kanjidic() -> Result<Vec<(String, Charset)>> {
let n3_kanji = Charset::new(&fs::read_to_string("data/n3_kanji.txt")?.trim());
@@ -196,7 +202,7 @@ pub fn read_examples(all_kanji: &Charset) -> Result<Vec<Example>> {
}
}
if i % 10000 == 0 {
- eprintln!("read examples: {}/300 (x1000)", i / 1000);
+ info!("read examples: {}/300 (x1000)", i / 1000);
}
}
diff --git a/src/example.rs b/src/example.rs
index c52cc8f..494ab73 100644
--- a/src/example.rs
+++ b/src/example.rs
@@ -4,16 +4,17 @@ use crate::charset::Charset;
use crate::*;
impl Example {
- pub fn gen_furigana(&mut self, dict_idx: &DictIndex, overrides: &HashMap<String, String>) {
+ pub fn gen_furigana(&mut self, dict_idx: &DictIndex, overrides: &HashMap<String, String>) -> bool {
use std::fmt::Write;
if let Some(v) = overrides.get(&self.ja) {
self.furigana = Some(v.to_string());
- return;
+ return true;
}
let mut remainder = self.ja.as_str();
let mut ret = String::new();
+ let mut is_good = true;
for word in self.expl.split(|c| c == ' ' || c == '~') {
let (keb, reb) = expl_clean_word(word);
@@ -34,7 +35,8 @@ impl Example {
remainder = remainder.strip_prefix(c).unwrap();
new_word.push(c);
} else {
- eprintln!("!!!! Char {} is not in remainder !!!!", c);
+ is_good = false;
+ warn!("!!!! Char {} is not in remainder !!!!", c);
}
}
let word = &new_word;
@@ -49,11 +51,12 @@ impl Example {
let ents = dict_idx.get(keb).map(|x| &x[..]).unwrap_or_default();
if let Some(ent) = ents
.iter()
- .find(|ent| ent.ent_seq == reb.strip_prefix('#').unwrap())
+ .find(|ent| ent.ent_seq == reb.strip_prefix('#').unwrap().parse::<u64>().unwrap())
{
ent.reb.as_str()
} else {
- println!("- entry id not found: {}", reb);
+ is_good = false;
+ warn!("- entry id not found: {}", reb);
ret += &word;
continue;
}
@@ -68,7 +71,8 @@ impl Example {
if matches.len() == 1 {
*matches.iter().next().unwrap()
} else {
- println!("- word without reb: {}", word);
+ is_good = false;
+ warn!("- word with {} reb: {}", matches.len(), word);
ret += &word;
continue;
}
@@ -170,10 +174,13 @@ impl Example {
let re = regex::Regex::new(r#"\|\|\w+\]\]"#).unwrap();
let back_to_ja = re.replace_all(&ret, "").replace("[[", "");
if self.ja != back_to_ja {
- eprintln!("!!!! {} != {}", self.ja, back_to_ja);
+ is_good = false;
+ error!("!!!! {} != {}", self.ja, back_to_ja);
}
self.furigana = Some(ret);
+
+ is_good
}
pub fn furigana_markup(&self) -> String {
diff --git a/src/main.rs b/src/main.rs
index b59669d..c09d045 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -1,3 +1,6 @@
+#[macro_use]
+extern crate log;
+
use std::collections::HashMap;
use std::fs;
use std::io;
@@ -32,6 +35,7 @@ struct Opt {
enum Cmd {
ParseKanjidic,
ParseJlptVocab,
+ IndexJmdict,
New {
#[structopt(default_value = "10")]
count: usize,
@@ -47,6 +51,11 @@ enum Cmd {
#[async_std::main]
async fn main() {
+ if std::env::var("RUST_LOG").is_err() {
+ std::env::set_var("RUST_LOG", "datagengo=info")
+ }
+ pretty_env_logger::init();
+
let opt = Opt::from_args();
match opt.cmd {
@@ -62,6 +71,19 @@ async fn main() {
Charset::from_iter(kanji_levels.iter().map(|(_, c)| c.chars()).flatten());
parse_jlpt_vocab(&all_kanji).expect("error");
}
+ Cmd::IndexJmdict => {
+ let jmdict = fs::read_to_string("data/JMdict_e.xml").expect("read_jmdict");
+ let jmdict = roxmltree::Document::parse_with_options(
+ &jmdict,
+ roxmltree::ParsingOptions {
+ allow_dtd: true,
+ ..Default::default()
+ },
+ )
+ .expect("parse_jmdict");
+ let jmdict_idx = index_jmdict(&jmdict);
+ fs::write("data/jmdict_idx.json", serde_json::to_string_pretty(&jmdict_idx).expect("to_json").as_bytes()).expect("write");
+ }
Cmd::New { truncate, count } => {
let kanji_levels = read_kanji_levels().expect("read_kanji_levels");
let all_kanji = Charset::new(
@@ -119,16 +141,7 @@ async fn main() {
save_batches(batches).expect("save_batches");
}
Cmd::Format => {
- let jmdict = fs::read_to_string("data/JMdict_e.xml").expect("read_jmdict");
- let jmdict = roxmltree::Document::parse_with_options(
- &jmdict,
- roxmltree::ParsingOptions {
- allow_dtd: true,
- ..Default::default()
- },
- )
- .expect("parse_jmdict");
- let jmdict_idx = index_jmdict(&jmdict);
+ let jmdict_idx = read_jmdict_idx().expect("read jmdict_idx.json");
let batches = read_batches().expect("read/parse");
diff --git a/src/server.rs b/src/server.rs
index 76911f6..51191f1 100644
--- a/src/server.rs
+++ b/src/server.rs
@@ -1,5 +1,3 @@
-use std::fs;
-
use anyhow::{anyhow, Result};
use futures::stream::TryStreamExt;
use rand::prelude::*;
@@ -16,7 +14,7 @@ use crate::*;
pub async fn server_main() -> tide::Result<()> {
// ---- load data files ----
- eprintln!("Loading kanji levels...");
+ info!("Loading kanji levels...");
let kanji_levels = read_kanji_levels().expect("read_kanji_levels");
let all_kanji = Charset::new(
kanji_levels
@@ -26,41 +24,23 @@ pub async fn server_main() -> tide::Result<()> {
.join(""),
);
- eprintln!("Loading examples...");
+ info!("Loading examples...");
let mut examples = read_examples(&all_kanji).expect("read_examples");
examples.retain(|e| (5..=25).contains(&e.ja.chars().count()));
let examples = Box::leak(examples.into_boxed_slice());
- eprintln!("Counting chars in examples...");
+ info!("Counting chars in examples...");
let example_freq = calc_example_freq(&examples);
- eprintln!("Loading furigana overrides...");
+ info!("Loading furigana overrides...");
let furigana_overrides = read_furigana_overrides().expect("read_furigana_overrides");
- eprintln!("RAM: {}", ALLOCATOR.allocated() / 1024);
-
- eprintln!("Loading JMdict_e.xml...");
- let jmdict_raw = fs::read_to_string("data/JMdict_e.xml").expect("read_jmdict");
- eprintln!("RAM: {}", ALLOCATOR.allocated() / 1024);
-
- eprintln!("Parsing JMdict_e.xml...");
- let jmdict_xml = roxmltree::Document::parse_with_options(
- &jmdict_raw,
- roxmltree::ParsingOptions {
- allow_dtd: true,
- ..Default::default()
- },
- )
- .expect("parse_jmdict");
- eprintln!("RAM: {}", ALLOCATOR.allocated() / 1024);
-
- eprintln!("Indexing JMdict_e.xml...");
- let jmdict_idx = index_jmdict(&jmdict_xml);
- eprintln!("RAM: {}", ALLOCATOR.allocated() / 1024);
- drop(jmdict_xml);
- drop(jmdict_raw);
- eprintln!("RAM: {}", ALLOCATOR.allocated() / 1024);
-
- eprintln!("Loading batches.json...");
+ debug!("RAM: {}", ALLOCATOR.allocated() / 1024);
+
+ info!("Loading jmdict_idx.json...");
+ let jmdict_idx = read_jmdict_idx().expect("read jmdict_idx.json");
+ debug!("RAM: {}", ALLOCATOR.allocated() / 1024);
+
+ info!("Loading batches.json...");
let batches = read_batches().expect("read/parse");
let batches = Box::leak(batches.into_boxed_slice());
@@ -93,7 +73,8 @@ pub async fn server_main() -> tide::Result<()> {
// ---- serve actual http ----
- eprintln!("Server listening on 127.0.0.1:8080");
+ info!("Server listening on 127.0.0.1:8080");
+ debug!("RAM: {}", ALLOCATOR.allocated() / 1024);
app.listen("127.0.0.1:8080").await?;
Ok(())
@@ -222,9 +203,7 @@ async fn gen_examples_page(mut req: Request<State>) -> tide::Result {
)
.into_bytes()))?;
- gen_examples(state, &allowed_chars, &needed_chars, 50, |mut ex| {
- ex.gen_furigana(&req.state().jmdict_idx, &req.state().furigana_overrides);
-
+ gen_examples(state, &allowed_chars, &needed_chars, 50, |ex| {
let mut expl = "<table>".to_string();
for word in ex.expl.split(|c| c == ' ' || c == '~') {
let (keb, reb) = expl_clean_word(word);
@@ -370,7 +349,7 @@ where
let mut remaining_needed = needed_chars.clone();
let mut have_chars = Charset::new("");
- println!("Ex\tMinCnt\tChars\tNeeded\tAllowed\tCandidates\tChars");
+ trace!("Ex\tMinCnt\tChars\tNeeded\tAllowed\tCandidates\tChars");
while generated < count {
let mut selection = None;
let mut total_weight = 0f64;
@@ -393,22 +372,30 @@ where
if let Some((i, f)) = selection {
let (ex, _) = candidates.remove(i);
- remaining_needed = remaining_needed.diff(&ex.chars);
- have_chars = have_chars.union(&ex.chars);
-
- generated += 1;
- println!(
- "{}\t{}\t{}\t{}\t{}\t{}\t{}",
- generated,
- f,
- have_chars.len(),
- remaining_needed.len(),
- allowed_chars.len(),
- counted,
- ex.chars.to_string()
- );
-
- callback(ex.clone())?;
+
+ let mut ex = ex.clone();
+ if ex.gen_furigana(&data.jmdict_idx, &data.furigana_overrides) {
+ remaining_needed = remaining_needed.diff(&ex.chars);
+ have_chars = have_chars.union(&ex.chars);
+ generated += 1;
+
+ trace!(
+ "{}\t{}\t{}\t{}\t{}\t{}\t{}",
+ generated,
+ f,
+ have_chars.len(),
+ remaining_needed.len(),
+ allowed_chars.len(),
+ counted,
+ ex.chars.to_string()
+ );
+
+ callback(ex)?;
+ } else {
+ warn!("Warning: failed to generate furigana");
+ warn!(" sentence: {}", ex.ja);
+ warn!(" bad furi: {}", ex.furigana.as_deref().unwrap_or("-"));
+ }
} else {
break;
}