Add dictionary entries

author: Alex Auvolat <alex@adnab.me> 2023-07-21 21:50:02 +0200
committer: Alex Auvolat <alex@adnab.me> 2023-07-21 21:50:02 +0200
commit: 903cc6a3711d7b501371ee3ef55ae0f50d6cd63d (patch)
tree: c8c6660b60c56e80fee3a71aac71d149ef622510
parent: e3822f6f9d45e62bd0198f6f2887408235d057a1 (diff)
download: datagengo-903cc6a3711d7b501371ee3ef55ae0f50d6cd63d.tar.gz
datagengo-903cc6a3711d7b501371ee3ef55ae0f50d6cd63d.zip
2 files changed, 97 insertions, 14 deletions
diff --git a/html/style.css b/html/style.css
index e312f06..7fdbc5e 100644
--- a/html/style.css
+++ b/html/style.css
@@ -12,7 +12,7 @@ td {
 
 .ja {
     text-align: center;
-    font-size: 2em;
+    font-size: 2rem;
 }
 
 .ja:hover .char_cur {
@@ -33,13 +33,9 @@ td {
 
 .en {
     text-align: center;
-    font-size: 1.2em;
+    font-size: 1.2rem;
 }
 
-.en .expl {
-    color: transparent;
-}
-
-.en:hover .expl {
-    color: black;
+details .chars {
+    font-size: 3rem;
 }
diff --git a/src/main.rs b/src/main.rs
index 6f5ca4e..1572e2e 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -61,13 +61,24 @@ fn main() {
             fs::write("data/batches.json", serde_json::to_string_pretty(&batches).expect("serialize").as_bytes()).expect("save");
         }
         Cmd::Format => {
+            let jmdict = fs::read_to_string("data/JMdict_e.xml")
+                .expect("read_jmdict");
+            let jmdict = roxmltree::Document::parse_with_options(
+                &jmdict,
+                roxmltree::ParsingOptions {
+                    allow_dtd: true,
+                    ..Default::default()
+                })
+                .expect("parse_jmdict");
+            let jmdict_idx = index_jmdict(&jmdict);
+
             let batches = fs::read("data/batches.json")
                 .map_err(anyhow::Error::from)
                 .and_then(|x| Ok(serde_json::from_slice::<Vec<Batch>>(&x)?))
                 .expect("read/parse");
-            batches.par_iter()
+            batches.iter()
                 .enumerate()
-                .for_each(|x| format_batch(batches.len(), x));
+                .for_each(|x| format_batch(&jmdict_idx, batches.len(), x));
 
             let kanji_levels = read_kanji_levels().expect("read_kanji_levels");
             format_index(&batches, &kanji_levels).expect("format_index");
@@ -75,6 +86,23 @@ fn main() {
     }
 }
 
+type DictIndex<'a> = HashMap<&'a str, Vec<roxmltree::Node<'a, 'a>>>;
+fn index_jmdict<'a>(dict: &'a roxmltree::Document) -> DictIndex<'a> {
+    let dict = dict.root().children().find(|x| x.has_tag_name("JMdict")).unwrap();
+
+    let mut ret: DictIndex<'a> = HashMap::new();
+    for x in dict.children().filter(|x| x.has_tag_name("entry")) {
+        for r in x.children().filter(|x| x.has_tag_name("k_ele")) {
+            if let Some(keb) = r.children().find(|x| x.has_tag_name("keb")) {
+                let txt = keb.text().unwrap().trim();
+                ret.entry(txt).or_default().push(x);
+            }
+        }
+    }
+
+    ret
+}
+
 fn parse_kanjidic() -> Result<Vec<(i32, i32, String)>> {
     let file = fs::read_to_string("data/kanjidic2.xml")?;
     let xml = roxmltree::Document::parse(&file)?;
@@ -279,11 +307,11 @@ fn gen_batch(previous: &[Batch], kanji_levels: &[(String, Charset)], examples: &
     Ok(batch)
 }
 
-fn format_batch(count: usize, (i, batch): (usize, &Batch)) {
-    format_batch_aux(count, i, batch).expect("format batch");
+fn format_batch<'a>(dict_idx: &DictIndex<'a>, count: usize, (i, batch): (usize, &Batch)) {
+    format_batch_aux(dict_idx, count, i, batch).expect("format batch");
 }
 
-fn format_batch_aux(count: usize, i: usize, batch: &Batch) -> Result<()> {
+fn format_batch_aux<'a>(dict_idx: &DictIndex<'a>, count: usize, i: usize, batch: &Batch) -> Result<()> {
     let mut f = io::BufWriter::new(fs::File::create(format!("html/{:03}.html", i))?);
     write!(f, r#"<!DOCTYPE html>
         <html>
@@ -323,7 +351,38 @@ fn format_batch_aux(count: usize, i: usize, batch: &Batch) -> Result<()> {
             }
         }
         writeln!(f, "</p>")?;
-        writeln!(f, r#"<p class="en"><span class="expl">{}</span><br />{} </p>"#, ex.expl, ex.en)?;
+        writeln!(f, r#"<p class="en">{}</p>"#, ex.en)?;
+
+        writeln!(f, r#"<details><summary>Explanation</summary>"#)?;
+        let mut expl_batch = Vec::new();
+        let mut expl_all = Vec::new();
+        for w in ex.expl.split(|c| c == ' ' || c == '~') {
+            let w = expl_clean_word(w);
+            let wchars = Charset::new(w);
+            if !wchars.intersects(&ex.chars) {
+                continue;
+            }
+            println!("{}", w);
+            if let Some(ents) = dict_idx.get(w) {
+                for ent in ents.iter() {
+                    let s = dict_str(w, ent);
+                    println!("{}: {}", w, s);
+                    if wchars.intersects(&batch.chars) {
+                        expl_batch.push(s);
+                    } else {
+                        expl_all.push(s);
+                    }
+                }
+            }
+        }
+        for be in expl_batch {
+            writeln!(f, r#"<p>{}</p>"#, be)?;
+        }
+        writeln!(f, r#"<p class="chars">{}</p>"#, ex.chars.inter(&batch.chars).to_string())?;
+        for be in expl_all {
+            writeln!(f, r#"<p>{}</p>"#, be)?;
+        }
+        writeln!(f, r#"</details>"#)?;
     }
 
     write!(f, "</body></html>")?;
@@ -331,6 +390,34 @@ fn format_batch_aux(count: usize, i: usize, batch: &Batch) -> Result<()> {
     Ok(())
 }
 
+fn expl_clean_word(w: &str) -> &str {
+    let mut ret = w;
+    for delim in ['(', '{', '['] {
+        if let Some((s, _)) = ret.split_once(delim) {
+            ret = s;
+        }
+    }
+    ret
+}
+
+fn dict_str<'a>(w: &str, ent: &roxmltree::Node<'a, 'a>) -> String {
+    let r_ele = ent.children().find(|x| x.has_tag_name("r_ele")).unwrap();
+    let reb = r_ele.children().find(|x| x.has_tag_name("reb")).unwrap();
+
+    let mut ret = format!("{} [{}]", w, reb.text().unwrap().trim());
+
+    for sense in ent.children().filter(|x| x.has_tag_name("sense")) {
+        if let Some(s) = sense.children().find(|x| x.has_tag_name("gloss")) {
+            ret.extend(format!(" {};", s.text().unwrap().trim()).chars());
+        }
+    }
+
+    if ret.chars().rev().next() == Some(';') {
+        ret.pop();
+    }
+    ret
+}
+
 fn format_index(batches: &[Batch], kanji_levels: &[(String, String)]) -> Result<()> {
     let mut f = io::BufWriter::new(fs::File::create("html/index.html")?);
     write!(f, r#"<!DOCTYPE html>
author	Alex Auvolat <alex@adnab.me>	2023-07-21 21:50:02 +0200
committer	Alex Auvolat <alex@adnab.me>	2023-07-21 21:50:02 +0200
commit	903cc6a3711d7b501371ee3ef55ae0f50d6cd63d (patch)
tree	c8c6660b60c56e80fee3a71aac71d149ef622510
parent	e3822f6f9d45e62bd0198f6f2887408235d057a1 (diff)
download	datagengo-903cc6a3711d7b501371ee3ef55ae0f50d6cd63d.tar.gz datagengo-903cc6a3711d7b501371ee3ef55ae0f50d6cd63d.zip