From 903cc6a3711d7b501371ee3ef55ae0f50d6cd63d Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Fri, 21 Jul 2023 21:50:02 +0200 Subject: Add dictionnary entries --- html/style.css | 12 +++---- src/main.rs | 99 ++++++++++++++++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 97 insertions(+), 14 deletions(-) diff --git a/html/style.css b/html/style.css index e312f06..7fdbc5e 100644 --- a/html/style.css +++ b/html/style.css @@ -12,7 +12,7 @@ td { .ja { text-align: center; - font-size: 2em; + font-size: 2rem; } .ja:hover .char_cur { @@ -33,13 +33,9 @@ td { .en { text-align: center; - font-size: 1.2em; + font-size: 1.2rem; } -.en .expl { - color: transparent; -} - -.en:hover .expl { - color: black; +details .chars { + font-size: 3rem; } diff --git a/src/main.rs b/src/main.rs index 6f5ca4e..1572e2e 100644 --- a/src/main.rs +++ b/src/main.rs @@ -61,13 +61,24 @@ fn main() { fs::write("data/batches.json", serde_json::to_string_pretty(&batches).expect("serialize").as_bytes()).expect("save"); } Cmd::Format => { + let jmdict = fs::read_to_string("data/JMdict_e.xml") + .expect("read_jmdict"); + let jmdict = roxmltree::Document::parse_with_options( + &jmdict, + roxmltree::ParsingOptions { + allow_dtd: true, + ..Default::default() + }) + .expect("parse_jmdict"); + let jmdict_idx = index_jmdict(&jmdict); + let batches = fs::read("data/batches.json") .map_err(anyhow::Error::from) .and_then(|x| Ok(serde_json::from_slice::>(&x)?)) .expect("read/parse"); - batches.par_iter() + batches.iter() .enumerate() - .for_each(|x| format_batch(batches.len(), x)); + .for_each(|x| format_batch(&jmdict_idx, batches.len(), x)); let kanji_levels = read_kanji_levels().expect("read_kanji_levels"); format_index(&batches, &kanji_levels).expect("format_index"); @@ -75,6 +86,23 @@ fn main() { } } +type DictIndex<'a> = HashMap<&'a str, Vec>>; +fn index_jmdict<'a>(dict: &'a roxmltree::Document) -> DictIndex<'a> { + let dict = dict.root().children().find(|x| x.has_tag_name("JMdict")).unwrap(); + + let mut ret: DictIndex<'a> = HashMap::new(); + for x in dict.children().filter(|x| x.has_tag_name("entry")) { + for r in x.children().filter(|x| x.has_tag_name("k_ele")) { + if let Some(keb) = r.children().find(|x| x.has_tag_name("keb")) { + let txt = keb.text().unwrap().trim(); + ret.entry(txt).or_default().push(x); + } + } + } + + ret +} + fn parse_kanjidic() -> Result> { let file = fs::read_to_string("data/kanjidic2.xml")?; let xml = roxmltree::Document::parse(&file)?; @@ -279,11 +307,11 @@ fn gen_batch(previous: &[Batch], kanji_levels: &[(String, Charset)], examples: & Ok(batch) } -fn format_batch(count: usize, (i, batch): (usize, &Batch)) { - format_batch_aux(count, i, batch).expect("format batch"); +fn format_batch<'a>(dict_idx: &DictIndex<'a>, count: usize, (i, batch): (usize, &Batch)) { + format_batch_aux(dict_idx, count, i, batch).expect("format batch"); } -fn format_batch_aux(count: usize, i: usize, batch: &Batch) -> Result<()> { +fn format_batch_aux<'a>(dict_idx: &DictIndex<'a>, count: usize, i: usize, batch: &Batch) -> Result<()> { let mut f = io::BufWriter::new(fs::File::create(format!("html/{:03}.html", i))?); write!(f, r#" @@ -323,7 +351,38 @@ fn format_batch_aux(count: usize, i: usize, batch: &Batch) -> Result<()> { } } writeln!(f, "

")?; - writeln!(f, r#"

{}
{}

"#, ex.expl, ex.en)?; + writeln!(f, r#"

{}

"#, ex.en)?; + + writeln!(f, r#"
Explanation"#)?; + let mut expl_batch = Vec::new(); + let mut expl_all = Vec::new(); + for w in ex.expl.split(|c| c == ' ' || c == '~') { + let w = expl_clean_word(w); + let wchars = Charset::new(w); + if !wchars.intersects(&ex.chars) { + continue; + } + println!("{}", w); + if let Some(ents) = dict_idx.get(w) { + for ent in ents.iter() { + let s = dict_str(w, ent); + println!("{}: {}", w, s); + if wchars.intersects(&batch.chars) { + expl_batch.push(s); + } else { + expl_all.push(s); + } + } + } + } + for be in expl_batch { + writeln!(f, r#"

{}

"#, be)?; + } + writeln!(f, r#"

{}

"#, ex.chars.inter(&batch.chars).to_string())?; + for be in expl_all { + writeln!(f, r#"

{}

"#, be)?; + } + writeln!(f, r#"
"#)?; } write!(f, "")?; @@ -331,6 +390,34 @@ fn format_batch_aux(count: usize, i: usize, batch: &Batch) -> Result<()> { Ok(()) } +fn expl_clean_word(w: &str) -> &str { + let mut ret = w; + for delim in ['(', '{', '['] { + if let Some((s, _)) = ret.split_once(delim) { + ret = s; + } + } + ret +} + +fn dict_str<'a>(w: &str, ent: &roxmltree::Node<'a, 'a>) -> String { + let r_ele = ent.children().find(|x| x.has_tag_name("r_ele")).unwrap(); + let reb = r_ele.children().find(|x| x.has_tag_name("reb")).unwrap(); + + let mut ret = format!("{} [{}]", w, reb.text().unwrap().trim()); + + for sense in ent.children().filter(|x| x.has_tag_name("sense")) { + if let Some(s) = sense.children().find(|x| x.has_tag_name("gloss")) { + ret.extend(format!(" {};", s.text().unwrap().trim()).chars()); + } + } + + if ret.chars().rev().next() == Some(';') { + ret.pop(); + } + ret +} + fn format_index(batches: &[Batch], kanji_levels: &[(String, String)]) -> Result<()> { let mut f = io::BufWriter::new(fs::File::create("html/index.html")?); write!(f, r#" -- cgit v1.2.3