about | summary | refs | log | tree | commit | diff
path: root/src
diff options
context:
space:
mode:
author Alex Auvolat <alex@adnab.me> 2023-07-21 21:50:02 +0200
committer Alex Auvolat <alex@adnab.me> 2023-07-21 21:50:02 +0200
commit 903cc6a3711d7b501371ee3ef55ae0f50d6cd63d (patch)
tree c8c6660b60c56e80fee3a71aac71d149ef622510 /src
parent e3822f6f9d45e62bd0198f6f2887408235d057a1 (diff)
download datagengo-903cc6a3711d7b501371ee3ef55ae0f50d6cd63d.tar.gz
datagengo-903cc6a3711d7b501371ee3ef55ae0f50d6cd63d.zip
Add dictionary entries
Diffstat (limited to 'src')
-rw-r--r-- src/main.rs 99
1 files changed, 93 insertions, 6 deletions
diff --git a/src/main.rs b/src/main.rs
index 6f5ca4e..1572e2e 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -61,13 +61,24 @@ fn main() {
fs::write("data/batches.json", serde_json::to_string_pretty(&batches).expect("serialize").as_bytes()).expect("save");
}
Cmd::Format => {
+ let jmdict = fs::read_to_string("data/JMdict_e.xml")
+ .expect("read_jmdict");
+ let jmdict = roxmltree::Document::parse_with_options(
+ &jmdict,
+ roxmltree::ParsingOptions {
+ allow_dtd: true,
+ ..Default::default()
+ })
+ .expect("parse_jmdict");
+ let jmdict_idx = index_jmdict(&jmdict);
+
let batches = fs::read("data/batches.json")
.map_err(anyhow::Error::from)
.and_then(|x| Ok(serde_json::from_slice::<Vec<Batch>>(&x)?))
.expect("read/parse");
- batches.par_iter()
+ batches.iter()
.enumerate()
- .for_each(|x| format_batch(batches.len(), x));
+ .for_each(|x| format_batch(&jmdict_idx, batches.len(), x));
let kanji_levels = read_kanji_levels().expect("read_kanji_levels");
format_index(&batches, &kanji_levels).expect("format_index");
@@ -75,6 +86,23 @@ fn main() {
}
}
+type DictIndex<'a> = HashMap<&'a str, Vec<roxmltree::Node<'a, 'a>>>;
+fn index_jmdict<'a>(dict: &'a roxmltree::Document) -> DictIndex<'a> {
+ let dict = dict.root().children().find(|x| x.has_tag_name("JMdict")).unwrap();
+
+ let mut ret: DictIndex<'a> = HashMap::new();
+ for x in dict.children().filter(|x| x.has_tag_name("entry")) {
+ for r in x.children().filter(|x| x.has_tag_name("k_ele")) {
+ if let Some(keb) = r.children().find(|x| x.has_tag_name("keb")) {
+ let txt = keb.text().unwrap().trim();
+ ret.entry(txt).or_default().push(x);
+ }
+ }
+ }
+
+ ret
+}
+
fn parse_kanjidic() -> Result<Vec<(i32, i32, String)>> {
let file = fs::read_to_string("data/kanjidic2.xml")?;
let xml = roxmltree::Document::parse(&file)?;
@@ -279,11 +307,11 @@ fn gen_batch(previous: &[Batch], kanji_levels: &[(String, Charset)], examples: &
Ok(batch)
}
-fn format_batch(count: usize, (i, batch): (usize, &Batch)) {
- format_batch_aux(count, i, batch).expect("format batch");
+fn format_batch<'a>(dict_idx: &DictIndex<'a>, count: usize, (i, batch): (usize, &Batch)) {
+ format_batch_aux(dict_idx, count, i, batch).expect("format batch");
}
-fn format_batch_aux(count: usize, i: usize, batch: &Batch) -> Result<()> {
+fn format_batch_aux<'a>(dict_idx: &DictIndex<'a>, count: usize, i: usize, batch: &Batch) -> Result<()> {
let mut f = io::BufWriter::new(fs::File::create(format!("html/{:03}.html", i))?);
write!(f, r#"<!DOCTYPE html>
<html>
@@ -323,7 +351,38 @@ fn format_batch_aux(count: usize, i: usize, batch: &Batch) -> Result<()> {
}
}
writeln!(f, "</p>")?;
- writeln!(f, r#"<p class="en"><span class="expl">{}</span><br />{} </p>"#, ex.expl, ex.en)?;
+ writeln!(f, r#"<p class="en">{}</p>"#, ex.en)?;
+
+ writeln!(f, r#"<details><summary>Explanation</summary>"#)?;
+ let mut expl_batch = Vec::new();
+ let mut expl_all = Vec::new();
+ for w in ex.expl.split(|c| c == ' ' || c == '~') {
+ let w = expl_clean_word(w);
+ let wchars = Charset::new(w);
+ if !wchars.intersects(&ex.chars) {
+ continue;
+ }
+ println!("{}", w);
+ if let Some(ents) = dict_idx.get(w) {
+ for ent in ents.iter() {
+ let s = dict_str(w, ent);
+ println!("{}: {}", w, s);
+ if wchars.intersects(&batch.chars) {
+ expl_batch.push(s);
+ } else {
+ expl_all.push(s);
+ }
+ }
+ }
+ }
+ for be in expl_batch {
+ writeln!(f, r#"<p>{}</p>"#, be)?;
+ }
+ writeln!(f, r#"<p class="chars">{}</p>"#, ex.chars.inter(&batch.chars).to_string())?;
+ for be in expl_all {
+ writeln!(f, r#"<p>{}</p>"#, be)?;
+ }
+ writeln!(f, r#"</details>"#)?;
}
write!(f, "</body></html>")?;
@@ -331,6 +390,34 @@ fn format_batch_aux(count: usize, i: usize, batch: &Batch) -> Result<()> {
Ok(())
}
+fn expl_clean_word(w: &str) -> &str {
+ let mut ret = w;
+ for delim in ['(', '{', '['] {
+ if let Some((s, _)) = ret.split_once(delim) {
+ ret = s;
+ }
+ }
+ ret
+}
+
+fn dict_str<'a>(w: &str, ent: &roxmltree::Node<'a, 'a>) -> String {
+ let r_ele = ent.children().find(|x| x.has_tag_name("r_ele")).unwrap();
+ let reb = r_ele.children().find(|x| x.has_tag_name("reb")).unwrap();
+
+ let mut ret = format!("{} [{}]", w, reb.text().unwrap().trim());
+
+ for sense in ent.children().filter(|x| x.has_tag_name("sense")) {
+ if let Some(s) = sense.children().find(|x| x.has_tag_name("gloss")) {
+ ret.extend(format!(" {};", s.text().unwrap().trim()).chars());
+ }
+ }
+
+ if ret.chars().rev().next() == Some(';') {
+ ret.pop();
+ }
+ ret
+}
+
fn format_index(batches: &[Batch], kanji_levels: &[(String, String)]) -> Result<()> {
let mut f = io::BufWriter::new(fs::File::create("html/index.html")?);
write!(f, r#"<!DOCTYPE html>