author    | Alex Auvolat <alex@adnab.me> | 2023-11-28 16:22:16 +0100
committer | Alex Auvolat <alex@adnab.me> | 2023-11-28 16:22:16 +0100
commit    | b78034ad5bf65f1dfe390861f72bed827e2ab1b8
tree      | aa42b4edc8d2a44bb5c41890089cb754ca7d40d7 /src
parent    | 64568528b13d08ceaa0c36c20b3aa20d966cfdcb
add furigana to main examples and persist furigana in batches.json
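
The furigana persisted in batches.json uses an intermediate [[word||reading]] notation; Example::furigana_markup (added in src/example.rs below) expands it into HTML ruby markup when the pages are formatted. A minimal sketch of that expansion, mirroring the replace chain introduced by this commit (the sample sentence is illustrative, not taken from the data set):

    // Expand the [[word||reading]] notation into HTML <ruby> annotations,
    // following the same replace chain as Example::furigana_markup.
    fn ruby_markup(furigana: &str) -> String {
        furigana
            .replace("[[", "<ruby>")
            .replace("||", "<rt>")
            .replace("]]", "</rt></ruby>")
    }

    fn main() {
        // Hypothetical stored value; real ones are produced by Example::gen_furigana.
        let stored = "[[日本||にほん]]の[[料理||りょうり]]が好きです。";
        assert_eq!(
            ruby_markup(stored),
            "<ruby>日本<rt>にほん</rt></ruby>の<ruby>料理<rt>りょうり</rt></ruby>が好きです。"
        );
        println!("{}", ruby_markup(stored));
    }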
Diffstat (limited to 'src')
-rw-r--r-- | src/datafiles.rs |   4
-rw-r--r-- | src/example.rs   | 203
-rw-r--r-- | src/format.rs    | 233
-rw-r--r-- | src/main.rs      | 120
4 files changed, 293 insertions, 267 deletions
diff --git a/src/datafiles.rs b/src/datafiles.rs
index 629badf..0e526ef 100644
--- a/src/datafiles.rs
+++ b/src/datafiles.rs
@@ -12,6 +12,8 @@ pub struct Example {
     pub ja: String,
     pub en: String,
     pub expl: String,
+    #[serde(default)]
+    pub furigana: Option<String>,
     pub id: Option<String>,
     pub chars: Charset,
 }
@@ -151,6 +153,7 @@ pub fn read_examples(all_kanji: &Charset) -> Result<Vec<Example>> {
                 expl: b.to_string(),
                 id: Some(id.to_string()),
                 chars: Charset::new(ja).inter(all_kanji),
+                furigana: None,
             });
         } else {
             ret.push(Example {
@@ -159,6 +162,7 @@ pub fn read_examples(all_kanji: &Charset) -> Result<Vec<Example>> {
                 expl: b.to_string(),
                 id: None,
                 chars: Charset::new(ja).inter(all_kanji),
+                furigana: None,
             });
         }
     }
diff --git a/src/example.rs b/src/example.rs
new file mode 100644
index 0000000..71f3f13
--- /dev/null
+++ b/src/example.rs
@@ -0,0 +1,203 @@
+use std::collections::HashSet;
+
+use crate::charset::Charset;
+use crate::*;
+
+impl Example {
+    pub fn gen_furigana<'a>(&mut self, dict_idx: &DictIndex<'a>) {
+        use std::fmt::Write;
+
+        let mut remainder = self.ja.as_str();
+        let mut ret = String::new();
+
+        for word in self.expl.split(|c| c == ' ' || c == '~') {
+            let (keb, reb) = expl_clean_word(word);
+            let word = word
+                .split_once('{')
+                .and_then(|(_, r)| r.split_once('}'))
+                .map(|(p, _)| p)
+                .unwrap_or(keb);
+
+            if let Some(i) = remainder.find(word) {
+                ret += &remainder[..i];
+                remainder = &remainder[i..];
+            }
+
+            let mut new_word = String::new();
+            for c in word.chars() {
+                if remainder.starts_with(c) {
+                    remainder = remainder.strip_prefix(c).unwrap();
+                    new_word.push(c);
+                } else {
+                    eprintln!("!!!! Char {} is not in remainder !!!!", c);
+                }
+            }
+            let word = &new_word;
+
+            if !Charset::new(word).intersects(&self.chars) {
+                ret += word;
+                continue;
+            }
+
+            let reb = match reb {
+                Some(reb) if reb.starts_with('#') => {
+                    let ents = dict_idx.get(keb).map(|x| &x[..]).unwrap_or_default();
+                    if let Some(ent) = ents.iter().find(|ent| {
+                        let ent_seq = ent.children().find(|x| x.has_tag_name("ent_seq")).unwrap();
+                        ent_seq.text().unwrap().trim() == reb.strip_prefix('#').unwrap()
+                    }) {
+                        let r_ele = ent.children().find(|x| x.has_tag_name("r_ele")).unwrap();
+                        let reb = r_ele.children().find(|x| x.has_tag_name("reb")).unwrap();
+                        reb.text().unwrap().trim()
+                    } else {
+                        println!("- entry id not found: {}", reb);
+                        ret += &word;
+                        continue;
+                    }
+                }
+                Some(reb) => reb,
+                None => {
+                    let ents = dict_idx.get(keb).map(|x| &x[..]).unwrap_or_default();
+                    let matches = ents
+                        .iter()
+                        .map(|ent| {
+                            let r_ele = ent.children().find(|x| x.has_tag_name("r_ele")).unwrap();
+                            let reb = r_ele.children().find(|x| x.has_tag_name("reb")).unwrap();
+                            reb.text().unwrap().trim()
+                        })
+                        .collect::<HashSet<_>>();
+                    if matches.len() == 1 {
+                        *matches.iter().next().unwrap()
+                    } else {
+                        println!("- word without reb: {}", word);
+                        ret += &word;
+                        continue;
+                    }
+                }
+            };
+
+            //println!("+ word: {}, keb: {}, reb: {}", word, keb, reb);
+            let common_cnt = word
+                .chars()
+                .zip(keb.chars())
+                .take_while(|(x, y)| x == y)
+                .count();
+            if common_cnt == 0 {
+                // Strange cases
+                write!(&mut ret, "[[{}||{}]]", word, reb).unwrap();
+                continue;
+            }
+
+            let keb_suffix = keb.chars().skip(common_cnt).collect::<String>();
+            let word_suffix = word.chars().skip(common_cnt).collect::<String>();
+            let reb = reb
+                .strip_suffix(&keb_suffix)
+                .or(reb.strip_suffix(&word_suffix))
+                .unwrap_or(reb);
+            //println!("  common reb: {}, common word: {}", reb, word.chars().take(common_cnt).collect::<String>());
+
+            let wchars = Vec::from_iter(word.chars().take(common_cnt));
+            let rchars = Vec::from_iter(reb.chars());
+
+            // We shall invoke Levhenstein distance
+            let mut dynrow0 = vec![(0, 0, 0, false)];
+            for ri in 0..rchars.len() {
+                dynrow0.push((0, ri, 100 + ri + 1, false));
+            }
+            let mut dyntab = vec![dynrow0];
+
+            for (wi, wc) in wchars.iter().enumerate() {
+                let mut dynrow = vec![(wi, 0, 100 + wi + 1, false)];
+
+                for (ri, rc) in rchars.iter().enumerate() {
+                    let mut x = vec![];
+                    if dyntab[wi][ri + 1].3 {
+                        x.push((wi, ri + 1, dyntab[wi][ri + 1].2 + 1, true));
+                    }
+                    if dynrow[ri].3 {
+                        x.push((wi + 1, ri, dynrow[ri].2 + 1, true));
+                    }
+                    if wc == rc {
+                        x.push((wi, ri, dyntab[wi][ri].2, false));
+                    } else {
+                        x.push((wi, ri, dyntab[wi][ri].2 + 1, true));
+                    }
+                    dynrow.push(x.into_iter().min_by_key(|(_, _, w, _)| *w).unwrap());
+                }
+                dyntab.push(dynrow);
+            }
+            //eprintln!("DYN TAB: {:?}", dyntab);
+
+            let mut path = vec![(wchars.len(), rchars.len())];
+            loop {
+                let (wi, ri) = *path.last().unwrap();
+                let (wi2, ri2, _, _) = dyntab[wi][ri];
+                path.push((wi2, ri2));
+                if wi2 == 0 && ri2 == 0 {
+                    break;
+                }
+            }
+            path.reverse();
+            //eprintln!("DYN PATH: {:?}", path);
+
+            let mut wbuf = String::new();
+            let mut rbuf = String::new();
+            for ((wi1, ri1), (wi2, ri2)) in path.iter().copied().zip(path.iter().copied().skip(1)) {
+                if wi2 > wi1 && ri2 > ri1 && wchars[wi1] == rchars[ri1] {
+                    if !wbuf.is_empty() || !rbuf.is_empty() {
+                        write!(&mut ret, "[[{}||{}]]", wbuf, rbuf).unwrap();
+                        wbuf.clear();
+                        rbuf.clear();
+                    }
+                    ret.push(wchars[wi1]);
+                } else {
+                    if wi2 > wi1 {
+                        wbuf.push(wchars[wi1]);
+                    }
+                    if ri2 > ri1 {
+                        rbuf.push(rchars[ri1]);
+                    }
+                }
+            }
+            if !wbuf.is_empty() || !rbuf.is_empty() {
+                write!(&mut ret, "[[{}||{}]]", wbuf, rbuf).unwrap();
+            }
+
+            ret += &word_suffix;
+        }
+        ret += remainder;
+
+        // CHECK
+        let re = regex::Regex::new(r#"\|\|\w+\]\]"#).unwrap();
+        let back_to_ja = re.replace_all(&ret, "").replace("[[", "");
+        if self.ja != back_to_ja {
+            eprintln!("!!!! {} != {}", self.ja, back_to_ja);
+        }
+
+        self.furigana = Some(ret);
+    }
+
+    pub fn furigana_markup(&self) -> String {
+        if let Some(furi) = &self.furigana {
+            furi.replace("[[", "<ruby>")
+                .replace("||", "<rt>")
+                .replace("]]", "</rt></ruby>")
+        } else {
+            self.ja.to_string()
+        }
+    }
+}
+
+pub fn expl_clean_word(w: &str) -> (&str, Option<&str>) {
+    let mut ret = w;
+    for delim in ['(', '{', '['] {
+        if let Some((s, _)) = ret.split_once(delim) {
+            ret = s;
+        }
+    }
+    let p = w
+        .split_once('(')
+        .and_then(|(_, r)| r.split_once(')'))
+        .map(|(p, _)| p);
+    (ret, p)
+}
diff --git a/src/format.rs b/src/format.rs
index a556677..caed70a 100644
--- a/src/format.rs
+++ b/src/format.rs
@@ -1,9 +1,9 @@
-use std::collections::HashSet;
 use std::fs;
 
 use anyhow::Result;
 
 use crate::charset::Charset;
+use crate::example::expl_clean_word;
 use crate::*;
 
 // =====================================================================
@@ -62,16 +62,26 @@ fn format_batch_aux<'a>(
     for ex in batch.examples.iter() {
         writeln!(f, "<hr />")?;
 
-        write!(f, r#"<p class="ja">"#)?;
-        for c in ex.ja.chars() {
-            if batch.chars.contains(c) {
-                write!(f, r#"<span class="char_cur">{}</span>"#, c)?;
+        write!(f, r#"<p class="ja ja_main">"#)?;
+        let furi = ex.furigana_markup();
+        for c in furi.chars() {
+            let class = if batch.chars.contains(c) {
+                Some("char_cur")
             } else if batch.chars_p1.contains(c) {
-                write!(f, r#"<span class="char_p1">{}</span>"#, c)?;
+                Some("char_p1")
             } else if batch.chars_p2.contains(c) {
-                write!(f, r#"<span class="char_p2">{}</span>"#, c)?;
+                Some("char_p2")
             } else if batch.chars_bad.contains(c) {
-                write!(f, r#"<span class="char_bad">{}</span>"#, c)?;
+                Some("char_bad")
+            } else {
+                None
+            };
+            if let Some(cls) = class {
+                write!(
+                    f,
+                    r#"<a href="https://jisho.org/search/{}%20%23kanji" class="{}">{}</a>"#,
+                    c, cls, c
+                )?;
             } else {
                 write!(f, "{}", c)?;
             }
@@ -140,25 +150,17 @@ fn format_batch_aux<'a>(
 
     writeln!(
         f,
-        r#"<details><summary>Extra examples (reading practice)</summary><table class="extratable">"#
+        r#"<p><strong>Extra examples (reading practice)</strong></p><table class="extratable">"#
     )?;
     for ex in batch.extra_examples.iter() {
-        let furi = format_ex_furigana(dict_idx, ex);
-        // println!(
-        //     "FURIGANA: {}\n  => {}",
-        //     ex.ja,
-        //     format_ex_furigana(dict_idx, ex)
-        // );
         writeln!(
             f,
             r#"<tr><td><div class="extra_example"><div class="extra_ja font_ja">{}</div><div class="extra_en">{}</div></div></td></tr>"#,
-            furi.replace("[[", "<ruby>")
-                .replace("||", "<rt>")
-                .replace("]]", "</rt></ruby>"),
+            ex.furigana_markup(),
            ex.en
         )?;
     }
-    writeln!(f, r#"</table></details>"#)?;
+    writeln!(f, r#"</table>"#)?;
 
     writeln!(f, "<hr />")?;
     writeln!(f, "<p>\(≧▽≦)/</p>")?;
@@ -172,208 +174,21 @@ fn format_vocab(f: &mut impl Write, vocab: &[&JlptVocab], t: &str) -> Result<()>
     if !vocab.is_empty() {
         writeln!(
             f,
-            r#"<details><summary>{}</summary><table class="vocabtable">"#,
+            r#"<p><strong>{}</strong></p><table class="vocabtable">"#,
             t
         )?;
         for v in vocab {
             writeln!(
                 f,
-                r#"<tr><td>{}</td><td> <span class="tab_large font_ja">{}</span> </td><td>{}</td><td class="font_ja">{}</td></tr>"#,
+                r#"<tr><td>{}</td><td style="word-break: keep-all"> <span class="tab_large font_ja">{}</span> </td><td>{}</td><td class="font_ja" style="word-break: keep-all">{}</td></tr>"#,
                 v.level, v.kanji, v.en, v.kana
             )?;
         }
-        writeln!(f, "</table></details>")?;
+        writeln!(f, "</table>")?;
     }
     Ok(())
 }
 
-fn format_ex_furigana<'a>(dict_idx: &DictIndex<'a>, ex: &Example) -> String {
-    use std::fmt::Write;
-
-    let mut remainder = ex.ja.as_str();
-    let mut ret = String::new();
-
-    for word in ex.expl.split(|c| c == ' ' || c == '~') {
-        let (keb, reb) = expl_clean_word(word);
-        let word = word
-            .split_once('{')
-            .and_then(|(_, r)| r.split_once('}'))
-            .map(|(p, _)| p)
-            .unwrap_or(keb);
-
-        if let Some(i) = remainder.find(word) {
-            ret += &remainder[..i];
-            remainder = &remainder[i..];
-        }
-
-        let mut new_word = String::new();
-        for c in word.chars() {
-            if remainder.starts_with(c) {
-                remainder = remainder.strip_prefix(c).unwrap();
-                new_word.push(c);
-            } else {
-                eprintln!("!!!! Char {} is not in remainder !!!!", c);
-            }
-        }
-        let word = &new_word;
-
-        if !Charset::new(word).intersects(&ex.chars) {
-            ret += word;
-            continue;
-        }
-
-        let reb = match reb {
-            Some(reb) if reb.starts_with('#') => {
-                let ents = dict_idx.get(keb).map(|x| &x[..]).unwrap_or_default();
-                if let Some(ent) = ents.iter().find(|ent| {
-                    let ent_seq = ent.children().find(|x| x.has_tag_name("ent_seq")).unwrap();
-                    ent_seq.text().unwrap().trim() == reb.strip_prefix('#').unwrap()
-                }) {
-                    let r_ele = ent.children().find(|x| x.has_tag_name("r_ele")).unwrap();
-                    let reb = r_ele.children().find(|x| x.has_tag_name("reb")).unwrap();
-                    reb.text().unwrap().trim()
-                } else {
-                    println!("- entry id not found: {}", reb);
-                    ret += &word;
-                    continue;
-                }
-            }
-            Some(reb) => reb,
-            None => {
-                let ents = dict_idx.get(keb).map(|x| &x[..]).unwrap_or_default();
-                let matches = ents
-                    .iter()
-                    .map(|ent| {
-                        let r_ele = ent.children().find(|x| x.has_tag_name("r_ele")).unwrap();
-                        let reb = r_ele.children().find(|x| x.has_tag_name("reb")).unwrap();
-                        reb.text().unwrap().trim()
-                    })
-                    .collect::<HashSet<_>>();
-                if matches.len() == 1 {
-                    *matches.iter().next().unwrap()
-                } else {
-                    println!("- word without reb: {}", word);
-                    ret += &word;
-                    continue;
-                }
-            }
-        };
-
-        //println!("+ word: {}, keb: {}, reb: {}", word, keb, reb);
-        let common_cnt = word
-            .chars()
-            .zip(keb.chars())
-            .take_while(|(x, y)| x == y)
-            .count();
-        if common_cnt == 0 {
-            // Strange cases
-            write!(&mut ret, "[[{}||{}]]", word, reb).unwrap();
-            continue;
-        }
-
-        let keb_suffix = keb.chars().skip(common_cnt).collect::<String>();
-        let word_suffix = word.chars().skip(common_cnt).collect::<String>();
-        let reb = reb
-            .strip_suffix(&keb_suffix)
-            .or(reb.strip_suffix(&word_suffix))
-            .unwrap_or(reb);
-        //println!("  common reb: {}, common word: {}", reb, word.chars().take(common_cnt).collect::<String>());
-
-        let wchars = Vec::from_iter(word.chars().take(common_cnt));
-        let rchars = Vec::from_iter(reb.chars());
-
-        // We shall invoke Levhenstein distance
-        let mut dynrow0 = vec![(0, 0, 0, false)];
-        for ri in 0..rchars.len() {
-            dynrow0.push((0, ri, 100 + ri + 1, false));
-        }
-        let mut dyntab = vec![dynrow0];
-
-        for (wi, wc) in wchars.iter().enumerate() {
-            let mut dynrow = vec![(wi, 0, 100 + wi + 1, false)];
-
-            for (ri, rc) in rchars.iter().enumerate() {
-                let mut x = vec![];
-                if dyntab[wi][ri + 1].3 {
-                    x.push((wi, ri + 1, dyntab[wi][ri + 1].2 + 1, true));
-                }
-                if dynrow[ri].3 {
-                    x.push((wi + 1, ri, dynrow[ri].2 + 1, true));
-                }
-                if wc == rc {
-                    x.push((wi, ri, dyntab[wi][ri].2, false));
-                } else {
-                    x.push((wi, ri, dyntab[wi][ri].2 + 1, true));
-                }
-                dynrow.push(x.into_iter().min_by_key(|(_, _, w, _)| *w).unwrap());
-            }
-            dyntab.push(dynrow);
-        }
-        //eprintln!("DYN TAB: {:?}", dyntab);
-
-        let mut path = vec![(wchars.len(), rchars.len())];
-        loop {
-            let (wi, ri) = *path.last().unwrap();
-            let (wi2, ri2, _, _) = dyntab[wi][ri];
-            path.push((wi2, ri2));
-            if wi2 == 0 && ri2 == 0 {
-                break;
-            }
-        }
-        path.reverse();
-        //eprintln!("DYN PATH: {:?}", path);
-
-        let mut wbuf = String::new();
-        let mut rbuf = String::new();
-        for ((wi1, ri1), (wi2, ri2)) in path.iter().copied().zip(path.iter().copied().skip(1)) {
-            if wi2 > wi1 && ri2 > ri1 && wchars[wi1] == rchars[ri1] {
-                if !wbuf.is_empty() || !rbuf.is_empty() {
-                    write!(&mut ret, "[[{}||{}]]", wbuf, rbuf).unwrap();
-                    wbuf.clear();
-                    rbuf.clear();
-                }
-                ret.push(wchars[wi1]);
-            } else {
-                if wi2 > wi1 {
-                    wbuf.push(wchars[wi1]);
-                }
-                if ri2 > ri1 {
-                    rbuf.push(rchars[ri1]);
-                }
-            }
-        }
-        if !wbuf.is_empty() || !rbuf.is_empty() {
-            write!(&mut ret, "[[{}||{}]]", wbuf, rbuf).unwrap();
-        }
-
-        ret += &word_suffix;
-    }
-    ret += remainder;
-
-    // CHECK
-    let re = regex::Regex::new(r#"\|\|\w+\]\]"#).unwrap();
-    let back_to_ja = re.replace_all(&ret, "").replace("[[", "");
-    if ex.ja != back_to_ja {
-        eprintln!("!!!! {} != {}", ex.ja, back_to_ja);
-    }
-
-    ret
-}
-
-fn expl_clean_word(w: &str) -> (&str, Option<&str>) {
-    let mut ret = w;
-    for delim in ['(', '{', '['] {
-        if let Some((s, _)) = ret.split_once(delim) {
-            ret = s;
-        }
-    }
-    let p = w
-        .split_once('(')
-        .and_then(|(_, r)| r.split_once(')'))
-        .map(|(p, _)| p);
-    (ret, p)
-}
-
 fn dict_str_short<'a>(
     qkeb: &str,
     qreb: Option<&str>,
diff --git a/src/main.rs b/src/main.rs
index b8996e8..85b278a 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -10,6 +10,7 @@ use structopt::StructOpt;
 
 mod charset;
 mod datafiles;
+mod example;
 mod format;
 use charset::Charset;
 use datafiles::*;
@@ -36,6 +37,7 @@ enum Cmd {
     Cleanup,
     AddVocab,
     AddExamples,
+    AddFurigana,
     Format,
 }
 
@@ -70,73 +72,46 @@ fn main() {
                 .collect::<Vec<_>>();
             let mut ex = read_examples(&all_kanji).expect("read_examples");
             ex.retain(|e| (5..=25).contains(&e.ja.chars().count()));
-            let mut batches: Vec<Batch> = fs::read("data/batches.json")
-                .map_err(anyhow::Error::from)
-                .and_then(|x| Ok(serde_json::from_slice(&x)?))
-                .unwrap_or_default();
+
+            let mut batches = read_batches().unwrap_or_default();
+
             if let Some(t) = truncate {
                 batches.truncate(t);
             }
             println!("---- starting after {} batches ----", batches.len());
             let target_len = batches.len() + count;
             gen_batches(&mut batches, target_len, &kanji_levels, &ex);
-            fs::write(
-                "data/batches.json",
-                serde_json::to_string_pretty(&batches)
-                    .expect("serialize")
-                    .as_bytes(),
-            )
-            .expect("save");
+
+            save_batches(batches).expect("save_batches");
         }
         Cmd::Simplify => {
-            let mut batches: Vec<Batch> = fs::read("data/batches.json")
-                .map_err(anyhow::Error::from)
-                .and_then(|x| Ok(serde_json::from_slice(&x)?))
-                .expect("failed to decode batches.json");
+            let mut batches = read_batches().expect("read_batches");
+
             for batch in batches.iter_mut() {
                 simplify_batch(batch);
             }
-            fs::write(
-                "data/batches.json",
-                serde_json::to_string_pretty(&batches)
-                    .expect("serialize")
-                    .as_bytes(),
-            )
-            .expect("save");
+
+            save_batches(batches).expect("save_batches");
         }
         Cmd::Cleanup => {
-            let mut batches: Vec<Batch> = fs::read("data/batches.json")
-                .map_err(anyhow::Error::from)
-                .and_then(|x| Ok(serde_json::from_slice(&x)?))
-                .expect("failed to decode batches.json");
+            let mut batches = read_batches().expect("read_batches");
+
             let kanji_levels = read_kanji_levels().expect("read_kanji_levels");
             let kanji_levels = kanji_levels
                 .into_iter()
                 .map(|(l, x)| (l, Charset::new(x)))
                 .collect::<Vec<_>>();
             cleanup_batches(&mut batches, &kanji_levels);
-            fs::write(
-                "data/batches.json",
-                serde_json::to_string_pretty(&batches)
-                    .expect("serialize")
-                    .as_bytes(),
-            )
-            .expect("save");
+
+            save_batches(batches).expect("save_batches");
         }
         Cmd::AddVocab => {
-            let mut batches: Vec<Batch> = fs::read("data/batches.json")
-                .map_err(anyhow::Error::from)
-                .and_then(|x| Ok(serde_json::from_slice(&x)?))
-                .expect("failed to decode batches.json");
+            let mut batches = read_batches().expect("read_batches");
+
             let jlpt_vocab = load_jlpt_vocab().expect("load_jlpt_vocab");
             add_vocab(&mut batches, &jlpt_vocab);
-            fs::write(
-                "data/batches.json",
-                serde_json::to_string_pretty(&batches)
-                    .expect("serialize")
-                    .as_bytes(),
-            )
-            .expect("save");
+
+            save_batches(batches).expect("save_batches");
         }
         Cmd::AddExamples => {
             let kanji_levels = read_kanji_levels().expect("read_kanji_levels");
@@ -151,20 +126,37 @@ fn main() {
 
             let mut ex = read_examples(&all_kanji).expect("read_examples");
             ex.retain(|e| (5..=25).contains(&e.ja.chars().count()));
-            let mut batches: Vec<Batch> = fs::read("data/batches.json")
-                .map_err(anyhow::Error::from)
-                .and_then(|x| Ok(serde_json::from_slice(&x)?))
-                .expect("failed to decode batches.json");
+            let mut batches = read_batches().expect("read_batches");
 
             add_extra_examples(&mut batches, &ex);
 
-            fs::write(
-                "data/batches.json",
-                serde_json::to_string_pretty(&batches)
-                    .expect("serialize")
-                    .as_bytes(),
+            save_batches(batches).expect("save_batches");
+        }
+        Cmd::AddFurigana => {
+            let mut batches = read_batches().expect("read_batches");
+
+            let jmdict = fs::read_to_string("data/JMdict_e.xml").expect("read_jmdict");
+            let jmdict = roxmltree::Document::parse_with_options(
+                &jmdict,
+                roxmltree::ParsingOptions {
+                    allow_dtd: true,
+                    ..Default::default()
+                },
             )
-            .expect("save");
+            .expect("parse_jmdict");
+            let jmdict_idx = index_jmdict(&jmdict);
+
+            for batch in batches.iter_mut() {
+                for ex in batch
+                    .examples
+                    .iter_mut()
+                    .chain(batch.extra_examples.iter_mut())
+                {
+                    ex.gen_furigana(&jmdict_idx);
+                }
+            }
+
+            save_batches(batches).expect("save_batches");
         }
         Cmd::Format => {
             let jmdict = fs::read_to_string("data/JMdict_e.xml").expect("read_jmdict");
@@ -178,10 +170,7 @@ fn main() {
             .expect("parse_jmdict");
             let jmdict_idx = index_jmdict(&jmdict);
 
-            let batches = fs::read("data/batches.json")
-                .map_err(anyhow::Error::from)
-                .and_then(|x| Ok(serde_json::from_slice::<Vec<Batch>>(&x)?))
-                .expect("read/parse");
+            let batches = read_batches().expect("read/parse");
 
             fs::create_dir_all("public").expect("mkdir public");
             fs::copy("static/style.css", "public/style.css").expect("copy style.css");
@@ -200,6 +189,21 @@ fn main() {
     }
 }
 
+// ----
+
+fn read_batches() -> anyhow::Result<Vec<Batch>> {
+    let json = fs::read("data/batches.json")?;
+    Ok(serde_json::from_slice::<Vec<Batch>>(&json)?)
+}
+
+fn save_batches(batches: Vec<Batch>) -> anyhow::Result<()> {
+    fs::write(
+        "data/batches.json",
+        serde_json::to_string_pretty(&batches)?.as_bytes(),
+    )?;
+    Ok(())
+}
+
 // =====================================================================
 // BATCH STRUCTURES AND GENERATION
 // =====================================================================
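
Note on validation: gen_furigana ends with a round-trip check, stripping the ||reading]] and [[ markers from the generated string and comparing the result against the original ja sentence. A small sketch of the same invariant, assuming the regex crate as the commit does (the sample strings are illustrative):

    // Stripping "||reading]]" and "[[" from the furigana string must reproduce
    // the plain Japanese sentence, as in the CHECK block of gen_furigana.
    fn roundtrip_ok(ja: &str, furigana: &str) -> bool {
        let re = regex::Regex::new(r#"\|\|\w+\]\]"#).unwrap();
        let back_to_ja = re.replace_all(furigana, "").replace("[[", "");
        back_to_ja == ja
    }

    // e.g. roundtrip_ok("日本の料理が好きです。", "[[日本||にほん]]の[[料理||りょうり]]が好きです。") == true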