From 62e6641203f6af2c7ad90cace7fff045f867218e Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Wed, 3 Apr 2024 20:08:35 +0200 Subject: preprocess jmdict & add logging --- .gitignore | 1 + Cargo.lock | 60 +++++++++++++++++++++++++++++++++++++ Cargo.toml | 3 ++ src/datafiles.rs | 12 ++++++-- src/example.rs | 21 ++++++++----- src/main.rs | 33 +++++++++++++------- src/server.rs | 91 ++++++++++++++++++++++++-------------------------------- 7 files changed, 149 insertions(+), 72 deletions(-) diff --git a/.gitignore b/.gitignore index d77b1f4..f1d74b4 100644 --- a/.gitignore +++ b/.gitignore @@ -3,5 +3,6 @@ kanjidic*.xml JMdict*.xml examples.utf +jmdict_idx.json public diff --git a/Cargo.lock b/Cargo.lock index 613b503..624cc3e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -684,7 +684,9 @@ dependencies = [ "fasthash", "futures", "http-types", + "log", "markdown", + "pretty_env_logger", "rand 0.8.5", "rayon", "regex", @@ -716,6 +718,19 @@ version = "1.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7fcaabb2fef8c910e7f4c7ce9f67a1283a1715879a7c230ca9d6d1ae31f16d91" +[[package]] +name = "env_logger" +version = "0.10.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4cd405aab171cb85d6735e5c8d9db038c17d3ca007a4d2c25f337935c3d90580" +dependencies = [ + "humantime", + "is-terminal", + "log", + "regex", + "termcolor", +] + [[package]] name = "erased-serde" version = "0.4.4" @@ -1133,6 +1148,12 @@ version = "1.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d897f394bad6a705d5f4104762e116a75639e470d80901eed05a860a95cb1904" +[[package]] +name = "humantime" +version = "2.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4" + [[package]] name = "iana-time-zone" version = "0.1.60" @@ -1192,6 +1213,17 @@ dependencies = [ "windows-sys 0.48.0", ] +[[package]] +name = "is-terminal" +version = "0.4.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f23ff5ef2b80d608d61efee834934d862cd92461afc0560dedf493e4c033738b" +dependencies = [ + "hermit-abi 0.3.2", + "libc", + "windows-sys 0.52.0", +] + [[package]] name = "itoa" version = "1.0.9" @@ -1421,6 +1453,16 @@ version = "0.2.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de" +[[package]] +name = "pretty_env_logger" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "865724d4dbe39d9f3dd3b52b88d859d66bcb2d6a0acfd5ea68a65fb66d4bdc1c" +dependencies = [ + "env_logger", + "log", +] + [[package]] name = "proc-macro-error" version = "1.0.4" @@ -2025,6 +2067,15 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "termcolor" +version = "1.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "06794f8f6c5c898b3275aebefa6b8a1cb24cd2c6c79397ab15774837a0bc5755" +dependencies = [ + "winapi-util", +] + [[package]] name = "textwrap" version = "0.11.0" @@ -2367,6 +2418,15 @@ version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" +[[package]] +name = "winapi-util" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f29e6f9198ba0d26b4c9f07dbe6f9ed633e1f3d5b8b414090084349e46a52596" +dependencies = [ + "winapi", +] + [[package]] name = "winapi-x86_64-pc-windows-gnu" version = "0.4.0" diff --git a/Cargo.toml b/Cargo.toml index db5359f..326054f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -17,6 +17,9 @@ rayon = "1.7" regex = "1.0" roxmltree = "0.18" +log = "0.4" +pretty_env_logger = "0.5.0" + tide = "0.16.0" futures = "0.3" async-std = { version = "1.6.0", features = ["attributes"] } diff --git a/src/datafiles.rs b/src/datafiles.rs index fc6194f..d4f948d 100644 --- a/src/datafiles.rs +++ b/src/datafiles.rs @@ -23,9 +23,10 @@ pub struct Example { // PARSING DATA FILES // ===================================================================== +#[derive(Serialize, Deserialize)] pub struct DictEntry { pub reb: String, - pub ent_seq: String, + pub ent_seq: u64, pub sense: Box<[String]>, } @@ -44,7 +45,7 @@ pub fn index_jmdict(dict: &roxmltree::Document) -> DictIndex { let reb = reb.text().unwrap().trim().to_string(); let ent_seq = ent.children().find(|x| x.has_tag_name("ent_seq")).unwrap(); - let ent_seq = ent_seq.text().unwrap().trim().to_string(); + let ent_seq = ent_seq.text().unwrap().trim().parse().unwrap(); let sense = ent .children() @@ -70,6 +71,11 @@ pub fn index_jmdict(dict: &roxmltree::Document) -> DictIndex { ret } +pub fn read_jmdict_idx() -> Result { + let file = fs::read("data/jmdict_idx.json")?; + Ok(serde_json::from_slice::(&file)?) +} + pub fn parse_kanjidic() -> Result> { let n3_kanji = Charset::new(&fs::read_to_string("data/n3_kanji.txt")?.trim()); @@ -196,7 +202,7 @@ pub fn read_examples(all_kanji: &Charset) -> Result> { } } if i % 10000 == 0 { - eprintln!("read examples: {}/300 (x1000)", i / 1000); + info!("read examples: {}/300 (x1000)", i / 1000); } } diff --git a/src/example.rs b/src/example.rs index c52cc8f..494ab73 100644 --- a/src/example.rs +++ b/src/example.rs @@ -4,16 +4,17 @@ use crate::charset::Charset; use crate::*; impl Example { - pub fn gen_furigana(&mut self, dict_idx: &DictIndex, overrides: &HashMap) { + pub fn gen_furigana(&mut self, dict_idx: &DictIndex, overrides: &HashMap) -> bool { use std::fmt::Write; if let Some(v) = overrides.get(&self.ja) { self.furigana = Some(v.to_string()); - return; + return true; } let mut remainder = self.ja.as_str(); let mut ret = String::new(); + let mut is_good = true; for word in self.expl.split(|c| c == ' ' || c == '~') { let (keb, reb) = expl_clean_word(word); @@ -34,7 +35,8 @@ impl Example { remainder = remainder.strip_prefix(c).unwrap(); new_word.push(c); } else { - eprintln!("!!!! Char {} is not in remainder !!!!", c); + is_good = false; + warn!("!!!! Char {} is not in remainder !!!!", c); } } let word = &new_word; @@ -49,11 +51,12 @@ impl Example { let ents = dict_idx.get(keb).map(|x| &x[..]).unwrap_or_default(); if let Some(ent) = ents .iter() - .find(|ent| ent.ent_seq == reb.strip_prefix('#').unwrap()) + .find(|ent| ent.ent_seq == reb.strip_prefix('#').unwrap().parse::().unwrap()) { ent.reb.as_str() } else { - println!("- entry id not found: {}", reb); + is_good = false; + warn!("- entry id not found: {}", reb); ret += &word; continue; } @@ -68,7 +71,8 @@ impl Example { if matches.len() == 1 { *matches.iter().next().unwrap() } else { - println!("- word without reb: {}", word); + is_good = false; + warn!("- word with {} reb: {}", matches.len(), word); ret += &word; continue; } @@ -170,10 +174,13 @@ impl Example { let re = regex::Regex::new(r#"\|\|\w+\]\]"#).unwrap(); let back_to_ja = re.replace_all(&ret, "").replace("[[", ""); if self.ja != back_to_ja { - eprintln!("!!!! {} != {}", self.ja, back_to_ja); + is_good = false; + error!("!!!! {} != {}", self.ja, back_to_ja); } self.furigana = Some(ret); + + is_good } pub fn furigana_markup(&self) -> String { diff --git a/src/main.rs b/src/main.rs index b59669d..c09d045 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,3 +1,6 @@ +#[macro_use] +extern crate log; + use std::collections::HashMap; use std::fs; use std::io; @@ -32,6 +35,7 @@ struct Opt { enum Cmd { ParseKanjidic, ParseJlptVocab, + IndexJmdict, New { #[structopt(default_value = "10")] count: usize, @@ -47,6 +51,11 @@ enum Cmd { #[async_std::main] async fn main() { + if std::env::var("RUST_LOG").is_err() { + std::env::set_var("RUST_LOG", "datagengo=info") + } + pretty_env_logger::init(); + let opt = Opt::from_args(); match opt.cmd { @@ -62,6 +71,19 @@ async fn main() { Charset::from_iter(kanji_levels.iter().map(|(_, c)| c.chars()).flatten()); parse_jlpt_vocab(&all_kanji).expect("error"); } + Cmd::IndexJmdict => { + let jmdict = fs::read_to_string("data/JMdict_e.xml").expect("read_jmdict"); + let jmdict = roxmltree::Document::parse_with_options( + &jmdict, + roxmltree::ParsingOptions { + allow_dtd: true, + ..Default::default() + }, + ) + .expect("parse_jmdict"); + let jmdict_idx = index_jmdict(&jmdict); + fs::write("data/jmdict_idx.json", serde_json::to_string_pretty(&jmdict_idx).expect("to_json").as_bytes()).expect("write"); + } Cmd::New { truncate, count } => { let kanji_levels = read_kanji_levels().expect("read_kanji_levels"); let all_kanji = Charset::new( @@ -119,16 +141,7 @@ async fn main() { save_batches(batches).expect("save_batches"); } Cmd::Format => { - let jmdict = fs::read_to_string("data/JMdict_e.xml").expect("read_jmdict"); - let jmdict = roxmltree::Document::parse_with_options( - &jmdict, - roxmltree::ParsingOptions { - allow_dtd: true, - ..Default::default() - }, - ) - .expect("parse_jmdict"); - let jmdict_idx = index_jmdict(&jmdict); + let jmdict_idx = read_jmdict_idx().expect("read jmdict_idx.json"); let batches = read_batches().expect("read/parse"); diff --git a/src/server.rs b/src/server.rs index 76911f6..51191f1 100644 --- a/src/server.rs +++ b/src/server.rs @@ -1,5 +1,3 @@ -use std::fs; - use anyhow::{anyhow, Result}; use futures::stream::TryStreamExt; use rand::prelude::*; @@ -16,7 +14,7 @@ use crate::*; pub async fn server_main() -> tide::Result<()> { // ---- load data files ---- - eprintln!("Loading kanji levels..."); + info!("Loading kanji levels..."); let kanji_levels = read_kanji_levels().expect("read_kanji_levels"); let all_kanji = Charset::new( kanji_levels @@ -26,41 +24,23 @@ pub async fn server_main() -> tide::Result<()> { .join(""), ); - eprintln!("Loading examples..."); + info!("Loading examples..."); let mut examples = read_examples(&all_kanji).expect("read_examples"); examples.retain(|e| (5..=25).contains(&e.ja.chars().count())); let examples = Box::leak(examples.into_boxed_slice()); - eprintln!("Counting chars in examples..."); + info!("Counting chars in examples..."); let example_freq = calc_example_freq(&examples); - eprintln!("Loading furigana overrides..."); + info!("Loading furigana overrides..."); let furigana_overrides = read_furigana_overrides().expect("read_furigana_overrides"); - eprintln!("RAM: {}", ALLOCATOR.allocated() / 1024); - - eprintln!("Loading JMdict_e.xml..."); - let jmdict_raw = fs::read_to_string("data/JMdict_e.xml").expect("read_jmdict"); - eprintln!("RAM: {}", ALLOCATOR.allocated() / 1024); - - eprintln!("Parsing JMdict_e.xml..."); - let jmdict_xml = roxmltree::Document::parse_with_options( - &jmdict_raw, - roxmltree::ParsingOptions { - allow_dtd: true, - ..Default::default() - }, - ) - .expect("parse_jmdict"); - eprintln!("RAM: {}", ALLOCATOR.allocated() / 1024); - - eprintln!("Indexing JMdict_e.xml..."); - let jmdict_idx = index_jmdict(&jmdict_xml); - eprintln!("RAM: {}", ALLOCATOR.allocated() / 1024); - drop(jmdict_xml); - drop(jmdict_raw); - eprintln!("RAM: {}", ALLOCATOR.allocated() / 1024); - - eprintln!("Loading batches.json..."); + debug!("RAM: {}", ALLOCATOR.allocated() / 1024); + + info!("Loading jmdict_idx.json..."); + let jmdict_idx = read_jmdict_idx().expect("read jmdict_idx.json"); + debug!("RAM: {}", ALLOCATOR.allocated() / 1024); + + info!("Loading batches.json..."); let batches = read_batches().expect("read/parse"); let batches = Box::leak(batches.into_boxed_slice()); @@ -93,7 +73,8 @@ pub async fn server_main() -> tide::Result<()> { // ---- serve actual http ---- - eprintln!("Server listening on 127.0.0.1:8080"); + info!("Server listening on 127.0.0.1:8080"); + debug!("RAM: {}", ALLOCATOR.allocated() / 1024); app.listen("127.0.0.1:8080").await?; Ok(()) @@ -222,9 +203,7 @@ async fn gen_examples_page(mut req: Request) -> tide::Result { ) .into_bytes()))?; - gen_examples(state, &allowed_chars, &needed_chars, 50, |mut ex| { - ex.gen_furigana(&req.state().jmdict_idx, &req.state().furigana_overrides); - + gen_examples(state, &allowed_chars, &needed_chars, 50, |ex| { let mut expl = "".to_string(); for word in ex.expl.split(|c| c == ' ' || c == '~') { let (keb, reb) = expl_clean_word(word); @@ -370,7 +349,7 @@ where let mut remaining_needed = needed_chars.clone(); let mut have_chars = Charset::new(""); - println!("Ex\tMinCnt\tChars\tNeeded\tAllowed\tCandidates\tChars"); + trace!("Ex\tMinCnt\tChars\tNeeded\tAllowed\tCandidates\tChars"); while generated < count { let mut selection = None; let mut total_weight = 0f64; @@ -393,22 +372,30 @@ where if let Some((i, f)) = selection { let (ex, _) = candidates.remove(i); - remaining_needed = remaining_needed.diff(&ex.chars); - have_chars = have_chars.union(&ex.chars); - - generated += 1; - println!( - "{}\t{}\t{}\t{}\t{}\t{}\t{}", - generated, - f, - have_chars.len(), - remaining_needed.len(), - allowed_chars.len(), - counted, - ex.chars.to_string() - ); - - callback(ex.clone())?; + + let mut ex = ex.clone(); + if ex.gen_furigana(&data.jmdict_idx, &data.furigana_overrides) { + remaining_needed = remaining_needed.diff(&ex.chars); + have_chars = have_chars.union(&ex.chars); + generated += 1; + + trace!( + "{}\t{}\t{}\t{}\t{}\t{}\t{}", + generated, + f, + have_chars.len(), + remaining_needed.len(), + allowed_chars.len(), + counted, + ex.chars.to_string() + ); + + callback(ex)?; + } else { + warn!("Warning: failed to generate furigana"); + warn!(" sentence: {}", ex.ja); + warn!(" bad furi: {}", ex.furigana.as_deref().unwrap_or("-")); + } } else { break; } -- cgit v1.2.3