diff options
author | Alex Auvolat <alex@adnab.me> | 2023-07-21 21:50:02 +0200 |
---|---|---|
committer | Alex Auvolat <alex@adnab.me> | 2023-07-21 21:50:02 +0200 |
commit | 903cc6a3711d7b501371ee3ef55ae0f50d6cd63d (patch) | |
tree | c8c6660b60c56e80fee3a71aac71d149ef622510 | |
parent | e3822f6f9d45e62bd0198f6f2887408235d057a1 (diff) | |
download | datagengo-903cc6a3711d7b501371ee3ef55ae0f50d6cd63d.tar.gz datagengo-903cc6a3711d7b501371ee3ef55ae0f50d6cd63d.zip |
Add dictionary entries
-rw-r--r-- | html/style.css | 12 | ||||
-rw-r--r-- | src/main.rs | 99 |
2 files changed, 97 insertions, 14 deletions
diff --git a/html/style.css b/html/style.css index e312f06..7fdbc5e 100644 --- a/html/style.css +++ b/html/style.css @@ -12,7 +12,7 @@ td { .ja { text-align: center; - font-size: 2em; + font-size: 2rem; } .ja:hover .char_cur { @@ -33,13 +33,9 @@ td { .en { text-align: center; - font-size: 1.2em; + font-size: 1.2rem; } -.en .expl { - color: transparent; -} - -.en:hover .expl { - color: black; +details .chars { + font-size: 3rem; } diff --git a/src/main.rs b/src/main.rs index 6f5ca4e..1572e2e 100644 --- a/src/main.rs +++ b/src/main.rs @@ -61,13 +61,24 @@ fn main() { fs::write("data/batches.json", serde_json::to_string_pretty(&batches).expect("serialize").as_bytes()).expect("save"); } Cmd::Format => { + let jmdict = fs::read_to_string("data/JMdict_e.xml") + .expect("read_jmdict"); + let jmdict = roxmltree::Document::parse_with_options( + &jmdict, + roxmltree::ParsingOptions { + allow_dtd: true, + ..Default::default() + }) + .expect("parse_jmdict"); + let jmdict_idx = index_jmdict(&jmdict); + let batches = fs::read("data/batches.json") .map_err(anyhow::Error::from) .and_then(|x| Ok(serde_json::from_slice::<Vec<Batch>>(&x)?)) .expect("read/parse"); - batches.par_iter() + batches.iter() .enumerate() - .for_each(|x| format_batch(batches.len(), x)); + .for_each(|x| format_batch(&jmdict_idx, batches.len(), x)); let kanji_levels = read_kanji_levels().expect("read_kanji_levels"); format_index(&batches, &kanji_levels).expect("format_index"); @@ -75,6 +86,23 @@ fn main() { } } +type DictIndex<'a> = HashMap<&'a str, Vec<roxmltree::Node<'a, 'a>>>; +fn index_jmdict<'a>(dict: &'a roxmltree::Document) -> DictIndex<'a> { + let dict = dict.root().children().find(|x| x.has_tag_name("JMdict")).unwrap(); + + let mut ret: DictIndex<'a> = HashMap::new(); + for x in dict.children().filter(|x| x.has_tag_name("entry")) { + for r in x.children().filter(|x| x.has_tag_name("k_ele")) { + if let Some(keb) = r.children().find(|x| x.has_tag_name("keb")) { + let txt = keb.text().unwrap().trim(); + 
ret.entry(txt).or_default().push(x); + } + } + } + + ret +} + fn parse_kanjidic() -> Result<Vec<(i32, i32, String)>> { let file = fs::read_to_string("data/kanjidic2.xml")?; let xml = roxmltree::Document::parse(&file)?; @@ -279,11 +307,11 @@ fn gen_batch(previous: &[Batch], kanji_levels: &[(String, Charset)], examples: & Ok(batch) } -fn format_batch(count: usize, (i, batch): (usize, &Batch)) { - format_batch_aux(count, i, batch).expect("format batch"); +fn format_batch<'a>(dict_idx: &DictIndex<'a>, count: usize, (i, batch): (usize, &Batch)) { + format_batch_aux(dict_idx, count, i, batch).expect("format batch"); } -fn format_batch_aux(count: usize, i: usize, batch: &Batch) -> Result<()> { +fn format_batch_aux<'a>(dict_idx: &DictIndex<'a>, count: usize, i: usize, batch: &Batch) -> Result<()> { let mut f = io::BufWriter::new(fs::File::create(format!("html/{:03}.html", i))?); write!(f, r#"<!DOCTYPE html> <html> @@ -323,7 +351,38 @@ fn format_batch_aux(count: usize, i: usize, batch: &Batch) -> Result<()> { } } writeln!(f, "</p>")?; - writeln!(f, r#"<p class="en"><span class="expl">{}</span><br />{} </p>"#, ex.expl, ex.en)?; + writeln!(f, r#"<p class="en">{}</p>"#, ex.en)?; + + writeln!(f, r#"<details><summary>Explanation</summary>"#)?; + let mut expl_batch = Vec::new(); + let mut expl_all = Vec::new(); + for w in ex.expl.split(|c| c == ' ' || c == '~') { + let w = expl_clean_word(w); + let wchars = Charset::new(w); + if !wchars.intersects(&ex.chars) { + continue; + } + println!("{}", w); + if let Some(ents) = dict_idx.get(w) { + for ent in ents.iter() { + let s = dict_str(w, ent); + println!("{}: {}", w, s); + if wchars.intersects(&batch.chars) { + expl_batch.push(s); + } else { + expl_all.push(s); + } + } + } + } + for be in expl_batch { + writeln!(f, r#"<p>{}</p>"#, be)?; + } + writeln!(f, r#"<p class="chars">{}</p>"#, ex.chars.inter(&batch.chars).to_string())?; + for be in expl_all { + writeln!(f, r#"<p>{}</p>"#, be)?; + } + writeln!(f, r#"</details>"#)?; } write!(f, 
"</body></html>")?; @@ -331,6 +390,34 @@ fn format_batch_aux(count: usize, i: usize, batch: &Batch) -> Result<()> { Ok(()) } +fn expl_clean_word(w: &str) -> &str { + let mut ret = w; + for delim in ['(', '{', '['] { + if let Some((s, _)) = ret.split_once(delim) { + ret = s; + } + } + ret +} + +fn dict_str<'a>(w: &str, ent: &roxmltree::Node<'a, 'a>) -> String { + let r_ele = ent.children().find(|x| x.has_tag_name("r_ele")).unwrap(); + let reb = r_ele.children().find(|x| x.has_tag_name("reb")).unwrap(); + + let mut ret = format!("{} [{}]", w, reb.text().unwrap().trim()); + + for sense in ent.children().filter(|x| x.has_tag_name("sense")) { + if let Some(s) = sense.children().find(|x| x.has_tag_name("gloss")) { + ret.extend(format!(" {};", s.text().unwrap().trim()).chars()); + } + } + + if ret.chars().rev().next() == Some(';') { + ret.pop(); + } + ret +} + fn format_index(batches: &[Batch], kanji_levels: &[(String, String)]) -> Result<()> { let mut f = io::BufWriter::new(fs::File::create("html/index.html")?); write!(f, r#"<!DOCTYPE html> |