diff options
author | Alex Auvolat <alex@adnab.me> | 2024-04-03 20:08:35 +0200 |
---|---|---|
committer | Alex Auvolat <alex@adnab.me> | 2024-04-03 20:08:35 +0200 |
commit | 62e6641203f6af2c7ad90cace7fff045f867218e (patch) | |
tree | 0df15f38a2e4be2ef55a1765c0413b23a4840b73 /src | |
parent | 0fde35d584a4ff19db60e632ed0896848934659d (diff) | |
download | datagengo-62e6641203f6af2c7ad90cace7fff045f867218e.tar.gz datagengo-62e6641203f6af2c7ad90cace7fff045f867218e.zip |
preprocess jmdict & add logging
Diffstat (limited to 'src')
-rw-r--r-- | src/datafiles.rs | 12 | ||||
-rw-r--r-- | src/example.rs | 21 | ||||
-rw-r--r-- | src/main.rs | 33 | ||||
-rw-r--r-- | src/server.rs | 91 |
4 files changed, 85 insertions, 72 deletions
diff --git a/src/datafiles.rs b/src/datafiles.rs index fc6194f..d4f948d 100644 --- a/src/datafiles.rs +++ b/src/datafiles.rs @@ -23,9 +23,10 @@ pub struct Example { // PARSING DATA FILES // ===================================================================== +#[derive(Serialize, Deserialize)] pub struct DictEntry { pub reb: String, - pub ent_seq: String, + pub ent_seq: u64, pub sense: Box<[String]>, } @@ -44,7 +45,7 @@ pub fn index_jmdict(dict: &roxmltree::Document) -> DictIndex { let reb = reb.text().unwrap().trim().to_string(); let ent_seq = ent.children().find(|x| x.has_tag_name("ent_seq")).unwrap(); - let ent_seq = ent_seq.text().unwrap().trim().to_string(); + let ent_seq = ent_seq.text().unwrap().trim().parse().unwrap(); let sense = ent .children() @@ -70,6 +71,11 @@ pub fn index_jmdict(dict: &roxmltree::Document) -> DictIndex { ret } +pub fn read_jmdict_idx() -> Result<DictIndex> { + let file = fs::read("data/jmdict_idx.json")?; + Ok(serde_json::from_slice::<DictIndex>(&file)?) +} + pub fn parse_kanjidic() -> Result<Vec<(String, Charset)>> { let n3_kanji = Charset::new(&fs::read_to_string("data/n3_kanji.txt")?.trim()); @@ -196,7 +202,7 @@ pub fn read_examples(all_kanji: &Charset) -> Result<Vec<Example>> { } } if i % 10000 == 0 { - eprintln!("read examples: {}/300 (x1000)", i / 1000); + info!("read examples: {}/300 (x1000)", i / 1000); } } diff --git a/src/example.rs b/src/example.rs index c52cc8f..494ab73 100644 --- a/src/example.rs +++ b/src/example.rs @@ -4,16 +4,17 @@ use crate::charset::Charset; use crate::*; impl Example { - pub fn gen_furigana(&mut self, dict_idx: &DictIndex, overrides: &HashMap<String, String>) { + pub fn gen_furigana(&mut self, dict_idx: &DictIndex, overrides: &HashMap<String, String>) -> bool { use std::fmt::Write; if let Some(v) = overrides.get(&self.ja) { self.furigana = Some(v.to_string()); - return; + return true; } let mut remainder = self.ja.as_str(); let mut ret = String::new(); + let mut is_good = true; for word in self.expl.split(|c| c == ' ' || c == '~') { let (keb, reb) = expl_clean_word(word); @@ -34,7 +35,8 @@ impl Example { remainder = remainder.strip_prefix(c).unwrap(); new_word.push(c); } else { - eprintln!("!!!! Char {} is not in remainder !!!!", c); + is_good = false; + warn!("!!!! Char {} is not in remainder !!!!", c); } } let word = &new_word; @@ -49,11 +51,12 @@ impl Example { let ents = dict_idx.get(keb).map(|x| &x[..]).unwrap_or_default(); if let Some(ent) = ents .iter() - .find(|ent| ent.ent_seq == reb.strip_prefix('#').unwrap()) + .find(|ent| ent.ent_seq == reb.strip_prefix('#').unwrap().parse::<u64>().unwrap()) { ent.reb.as_str() } else { - println!("- entry id not found: {}", reb); + is_good = false; + warn!("- entry id not found: {}", reb); ret += &word; continue; } @@ -68,7 +71,8 @@ impl Example { if matches.len() == 1 { *matches.iter().next().unwrap() } else { - println!("- word without reb: {}", word); + is_good = false; + warn!("- word with {} reb: {}", matches.len(), word); ret += &word; continue; } @@ -170,10 +174,13 @@ impl Example { let re = regex::Regex::new(r#"\|\|\w+\]\]"#).unwrap(); let back_to_ja = re.replace_all(&ret, "").replace("[[", ""); if self.ja != back_to_ja { - eprintln!("!!!! {} != {}", self.ja, back_to_ja); + is_good = false; + error!("!!!! {} != {}", self.ja, back_to_ja); } self.furigana = Some(ret); + + is_good } pub fn furigana_markup(&self) -> String { diff --git a/src/main.rs b/src/main.rs index b59669d..c09d045 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,3 +1,6 @@ +#[macro_use] +extern crate log; + use std::collections::HashMap; use std::fs; use std::io; @@ -32,6 +35,7 @@ struct Opt { enum Cmd { ParseKanjidic, ParseJlptVocab, + IndexJmdict, New { #[structopt(default_value = "10")] count: usize, @@ -47,6 +51,11 @@ enum Cmd { #[async_std::main] async fn main() { + if std::env::var("RUST_LOG").is_err() { + std::env::set_var("RUST_LOG", "datagengo=info") + } + pretty_env_logger::init(); + let opt = Opt::from_args(); match opt.cmd { @@ -62,6 +71,19 @@ async fn main() { Charset::from_iter(kanji_levels.iter().map(|(_, c)| c.chars()).flatten()); parse_jlpt_vocab(&all_kanji).expect("error"); } + Cmd::IndexJmdict => { + let jmdict = fs::read_to_string("data/JMdict_e.xml").expect("read_jmdict"); + let jmdict = roxmltree::Document::parse_with_options( + &jmdict, + roxmltree::ParsingOptions { + allow_dtd: true, + ..Default::default() + }, + ) + .expect("parse_jmdict"); + let jmdict_idx = index_jmdict(&jmdict); + fs::write("data/jmdict_idx.json", serde_json::to_string_pretty(&jmdict_idx).expect("to_json").as_bytes()).expect("write"); + } Cmd::New { truncate, count } => { let kanji_levels = read_kanji_levels().expect("read_kanji_levels"); let all_kanji = Charset::new( @@ -119,16 +141,7 @@ async fn main() { save_batches(batches).expect("save_batches"); } Cmd::Format => { - let jmdict = fs::read_to_string("data/JMdict_e.xml").expect("read_jmdict"); - let jmdict = roxmltree::Document::parse_with_options( - &jmdict, - roxmltree::ParsingOptions { - allow_dtd: true, - ..Default::default() - }, - ) - .expect("parse_jmdict"); - let jmdict_idx = index_jmdict(&jmdict); + let jmdict_idx = read_jmdict_idx().expect("read jmdict_idx.json"); let batches = read_batches().expect("read/parse"); diff --git a/src/server.rs b/src/server.rs index 76911f6..51191f1 100644 --- a/src/server.rs +++ b/src/server.rs @@ -1,5 +1,3 @@ -use std::fs; - use anyhow::{anyhow, Result}; use futures::stream::TryStreamExt; use rand::prelude::*; @@ -16,7 +14,7 @@ use crate::*; pub async fn server_main() -> tide::Result<()> { // ---- load data files ---- - eprintln!("Loading kanji levels..."); + info!("Loading kanji levels..."); let kanji_levels = read_kanji_levels().expect("read_kanji_levels"); let all_kanji = Charset::new( kanji_levels @@ -26,41 +24,23 @@ pub async fn server_main() -> tide::Result<()> { .join(""), ); - eprintln!("Loading examples..."); + info!("Loading examples..."); let mut examples = read_examples(&all_kanji).expect("read_examples"); examples.retain(|e| (5..=25).contains(&e.ja.chars().count())); let examples = Box::leak(examples.into_boxed_slice()); - eprintln!("Counting chars in examples..."); + info!("Counting chars in examples..."); let example_freq = calc_example_freq(&examples); - eprintln!("Loading furigana overrides..."); + info!("Loading furigana overrides..."); let furigana_overrides = read_furigana_overrides().expect("read_furigana_overrides"); - eprintln!("RAM: {}", ALLOCATOR.allocated() / 1024); - - eprintln!("Loading JMdict_e.xml..."); - let jmdict_raw = fs::read_to_string("data/JMdict_e.xml").expect("read_jmdict"); - eprintln!("RAM: {}", ALLOCATOR.allocated() / 1024); - - eprintln!("Parsing JMdict_e.xml..."); - let jmdict_xml = roxmltree::Document::parse_with_options( - &jmdict_raw, - roxmltree::ParsingOptions { - allow_dtd: true, - ..Default::default() - }, - ) - .expect("parse_jmdict"); - eprintln!("RAM: {}", ALLOCATOR.allocated() / 1024); - - eprintln!("Indexing JMdict_e.xml..."); - let jmdict_idx = index_jmdict(&jmdict_xml); - eprintln!("RAM: {}", ALLOCATOR.allocated() / 1024); - drop(jmdict_xml); - drop(jmdict_raw); - eprintln!("RAM: {}", ALLOCATOR.allocated() / 1024); - - eprintln!("Loading batches.json..."); + debug!("RAM: {}", ALLOCATOR.allocated() / 1024); + + info!("Loading jmdict_idx.json..."); + let jmdict_idx = read_jmdict_idx().expect("read jmdict_idx.json"); + debug!("RAM: {}", ALLOCATOR.allocated() / 1024); + + info!("Loading batches.json..."); let batches = read_batches().expect("read/parse"); let batches = Box::leak(batches.into_boxed_slice()); @@ -93,7 +73,8 @@ pub async fn server_main() -> tide::Result<()> { // ---- serve actual http ---- - eprintln!("Server listening on 127.0.0.1:8080"); + info!("Server listening on 127.0.0.1:8080"); + debug!("RAM: {}", ALLOCATOR.allocated() / 1024); app.listen("127.0.0.1:8080").await?; Ok(()) @@ -222,9 +203,7 @@ async fn gen_examples_page(mut req: Request<State>) -> tide::Result { ) .into_bytes()))?; - gen_examples(state, &allowed_chars, &needed_chars, 50, |mut ex| { - ex.gen_furigana(&req.state().jmdict_idx, &req.state().furigana_overrides); - + gen_examples(state, &allowed_chars, &needed_chars, 50, |ex| { let mut expl = "<table>".to_string(); for word in ex.expl.split(|c| c == ' ' || c == '~') { let (keb, reb) = expl_clean_word(word); @@ -370,7 +349,7 @@ where let mut remaining_needed = needed_chars.clone(); let mut have_chars = Charset::new(""); - println!("Ex\tMinCnt\tChars\tNeeded\tAllowed\tCandidates\tChars"); + trace!("Ex\tMinCnt\tChars\tNeeded\tAllowed\tCandidates\tChars"); while generated < count { let mut selection = None; let mut total_weight = 0f64; @@ -393,22 +372,30 @@ where if let Some((i, f)) = selection { let (ex, _) = candidates.remove(i); - remaining_needed = remaining_needed.diff(&ex.chars); - have_chars = have_chars.union(&ex.chars); - - generated += 1; - println!( - "{}\t{}\t{}\t{}\t{}\t{}\t{}", - generated, - f, - have_chars.len(), - remaining_needed.len(), - allowed_chars.len(), - counted, - ex.chars.to_string() - ); - - callback(ex.clone())?; + + let mut ex = ex.clone(); + if ex.gen_furigana(&data.jmdict_idx, &data.furigana_overrides) { + remaining_needed = remaining_needed.diff(&ex.chars); + have_chars = have_chars.union(&ex.chars); + generated += 1; + + trace!( + "{}\t{}\t{}\t{}\t{}\t{}\t{}", + generated, + f, + have_chars.len(), + remaining_needed.len(), + allowed_chars.len(), + counted, + ex.chars.to_string() + ); + + callback(ex)?; + } else { + warn!("Warning: failed to generate furigana"); + warn!(" sentence: {}", ex.ja); + warn!(" bad furi: {}", ex.furigana.as_deref().unwrap_or("-")); + } } else { break; } |