about summary refs log tree commit diff
path: root/src/format.rs
diff options
context:
space:
mode:
author Alex Auvolat <alex@adnab.me> 2023-11-28 16:22:16 +0100
committer Alex Auvolat <alex@adnab.me> 2023-11-28 16:22:16 +0100
commit b78034ad5bf65f1dfe390861f72bed827e2ab1b8 (patch)
tree aa42b4edc8d2a44bb5c41890089cb754ca7d40d7 /src/format.rs
parent 64568528b13d08ceaa0c36c20b3aa20d966cfdcb (diff)
download datagengo-b78034ad5bf65f1dfe390861f72bed827e2ab1b8.tar.gz
datagengo-b78034ad5bf65f1dfe390861f72bed827e2ab1b8.zip
add furigana to main examples and persist furigana in batches.json
Diffstat (limited to 'src/format.rs')
-rw-r--r-- src/format.rs 233
1 files changed, 24 insertions, 209 deletions
diff --git a/src/format.rs b/src/format.rs
index a556677..caed70a 100644
--- a/src/format.rs
+++ b/src/format.rs
@@ -1,9 +1,9 @@
-use std::collections::HashSet;
use std::fs;
use anyhow::Result;
use crate::charset::Charset;
+use crate::example::expl_clean_word;
use crate::*;
// =====================================================================
@@ -62,16 +62,26 @@ fn format_batch_aux<'a>(
for ex in batch.examples.iter() {
writeln!(f, "<hr />")?;
- write!(f, r#"<p class="ja">"#)?;
- for c in ex.ja.chars() {
- if batch.chars.contains(c) {
- write!(f, r#"<span class="char_cur">{}</span>"#, c)?;
+ write!(f, r#"<p class="ja ja_main">"#)?;
+ let furi = ex.furigana_markup();
+ for c in furi.chars() {
+ let class = if batch.chars.contains(c) {
+ Some("char_cur")
} else if batch.chars_p1.contains(c) {
- write!(f, r#"<span class="char_p1">{}</span>"#, c)?;
+ Some("char_p1")
} else if batch.chars_p2.contains(c) {
- write!(f, r#"<span class="char_p2">{}</span>"#, c)?;
+ Some("char_p2")
} else if batch.chars_bad.contains(c) {
- write!(f, r#"<span class="char_bad">{}</span>"#, c)?;
+ Some("char_bad")
+ } else {
+ None
+ };
+ if let Some(cls) = class {
+ write!(
+ f,
+ r#"<a href="https://jisho.org/search/{}%20%23kanji" class="{}">{}</a>"#,
+ c, cls, c
+ )?;
} else {
write!(f, "{}", c)?;
}
@@ -140,25 +150,17 @@ fn format_batch_aux<'a>(
writeln!(
f,
- r#"<details><summary>Extra examples (reading practice)</summary><table class="extratable">"#
+ r#"<p><strong>Extra examples (reading practice)</strong></p><table class="extratable">"#
)?;
for ex in batch.extra_examples.iter() {
- let furi = format_ex_furigana(dict_idx, ex);
- // println!(
- // "FURIGANA: {}\n => {}",
- // ex.ja,
- // format_ex_furigana(dict_idx, ex)
- // );
writeln!(
f,
r#"<tr><td><div class="extra_example"><div class="extra_ja font_ja">{}</div><div class="extra_en">{}</div></div></td></tr>"#,
- furi.replace("[[", "<ruby>")
- .replace("||", "<rt>")
- .replace("]]", "</rt></ruby>"),
+ ex.furigana_markup(),
ex.en
)?;
}
- writeln!(f, r#"</table></details>"#)?;
+ writeln!(f, r#"</table>"#)?;
writeln!(f, "<hr />")?;
writeln!(f, "<p>\(≧▽≦)/</p>")?;
@@ -172,208 +174,21 @@ fn format_vocab(f: &mut impl Write, vocab: &[&JlptVocab], t: &str) -> Result<()>
if !vocab.is_empty() {
writeln!(
f,
- r#"<details><summary>{}</summary><table class="vocabtable">"#,
+ r#"<p><strong>{}</strong></p><table class="vocabtable">"#,
t
)?;
for v in vocab {
writeln!(
f,
- r#"<tr><td>{}</td><td>&nbsp;&nbsp;<span class="tab_large font_ja">{}</span>&nbsp;&nbsp;</td><td>{}</td><td class="font_ja">{}</td></tr>"#,
+ r#"<tr><td>{}</td><td style="word-break: keep-all">&nbsp;&nbsp;<span class="tab_large font_ja">{}</span>&nbsp;&nbsp;</td><td>{}</td><td class="font_ja" style="word-break: keep-all">{}</td></tr>"#,
v.level, v.kanji, v.en, v.kana
)?;
}
- writeln!(f, "</table></details>")?;
+ writeln!(f, "</table>")?;
}
Ok(())
}
-fn format_ex_furigana<'a>(dict_idx: &DictIndex<'a>, ex: &Example) -> String {
- use std::fmt::Write;
-
- let mut remainder = ex.ja.as_str();
- let mut ret = String::new();
-
- for word in ex.expl.split(|c| c == ' ' || c == '~') {
- let (keb, reb) = expl_clean_word(word);
- let word = word
- .split_once('{')
- .and_then(|(_, r)| r.split_once('}'))
- .map(|(p, _)| p)
- .unwrap_or(keb);
-
- if let Some(i) = remainder.find(word) {
- ret += &remainder[..i];
- remainder = &remainder[i..];
- }
-
- let mut new_word = String::new();
- for c in word.chars() {
- if remainder.starts_with(c) {
- remainder = remainder.strip_prefix(c).unwrap();
- new_word.push(c);
- } else {
- eprintln!("!!!! Char {} is not in remainder !!!!", c);
- }
- }
- let word = &new_word;
-
- if !Charset::new(word).intersects(&ex.chars) {
- ret += word;
- continue;
- }
-
- let reb = match reb {
- Some(reb) if reb.starts_with('#') => {
- let ents = dict_idx.get(keb).map(|x| &x[..]).unwrap_or_default();
- if let Some(ent) = ents.iter().find(|ent| {
- let ent_seq = ent.children().find(|x| x.has_tag_name("ent_seq")).unwrap();
- ent_seq.text().unwrap().trim() == reb.strip_prefix('#').unwrap()
- }) {
- let r_ele = ent.children().find(|x| x.has_tag_name("r_ele")).unwrap();
- let reb = r_ele.children().find(|x| x.has_tag_name("reb")).unwrap();
- reb.text().unwrap().trim()
- } else {
- println!("- entry id not found: {}", reb);
- ret += &word;
- continue;
- }
- }
- Some(reb) => reb,
- None => {
- let ents = dict_idx.get(keb).map(|x| &x[..]).unwrap_or_default();
- let matches = ents
- .iter()
- .map(|ent| {
- let r_ele = ent.children().find(|x| x.has_tag_name("r_ele")).unwrap();
- let reb = r_ele.children().find(|x| x.has_tag_name("reb")).unwrap();
- reb.text().unwrap().trim()
- })
- .collect::<HashSet<_>>();
- if matches.len() == 1 {
- *matches.iter().next().unwrap()
- } else {
- println!("- word without reb: {}", word);
- ret += &word;
- continue;
- }
- }
- };
-
- //println!("+ word: {}, keb: {}, reb: {}", word, keb, reb);
- let common_cnt = word
- .chars()
- .zip(keb.chars())
- .take_while(|(x, y)| x == y)
- .count();
- if common_cnt == 0 {
- // Strange cases
- write!(&mut ret, "[[{}||{}]]", word, reb).unwrap();
- continue;
- }
-
- let keb_suffix = keb.chars().skip(common_cnt).collect::<String>();
- let word_suffix = word.chars().skip(common_cnt).collect::<String>();
- let reb = reb
- .strip_suffix(&keb_suffix)
- .or(reb.strip_suffix(&word_suffix))
- .unwrap_or(reb);
- //println!(" common reb: {}, common word: {}", reb, word.chars().take(common_cnt).collect::<String>());
-
- let wchars = Vec::from_iter(word.chars().take(common_cnt));
- let rchars = Vec::from_iter(reb.chars());
-
- // We shall invoke Levhenstein distance
- let mut dynrow0 = vec![(0, 0, 0, false)];
- for ri in 0..rchars.len() {
- dynrow0.push((0, ri, 100 + ri + 1, false));
- }
- let mut dyntab = vec![dynrow0];
-
- for (wi, wc) in wchars.iter().enumerate() {
- let mut dynrow = vec![(wi, 0, 100 + wi + 1, false)];
-
- for (ri, rc) in rchars.iter().enumerate() {
- let mut x = vec![];
- if dyntab[wi][ri + 1].3 {
- x.push((wi, ri + 1, dyntab[wi][ri + 1].2 + 1, true));
- }
- if dynrow[ri].3 {
- x.push((wi + 1, ri, dynrow[ri].2 + 1, true));
- }
- if wc == rc {
- x.push((wi, ri, dyntab[wi][ri].2, false));
- } else {
- x.push((wi, ri, dyntab[wi][ri].2 + 1, true));
- }
- dynrow.push(x.into_iter().min_by_key(|(_, _, w, _)| *w).unwrap());
- }
- dyntab.push(dynrow);
- }
- //eprintln!("DYN TAB: {:?}", dyntab);
-
- let mut path = vec![(wchars.len(), rchars.len())];
- loop {
- let (wi, ri) = *path.last().unwrap();
- let (wi2, ri2, _, _) = dyntab[wi][ri];
- path.push((wi2, ri2));
- if wi2 == 0 && ri2 == 0 {
- break;
- }
- }
- path.reverse();
- //eprintln!("DYN PATH: {:?}", path);
-
- let mut wbuf = String::new();
- let mut rbuf = String::new();
- for ((wi1, ri1), (wi2, ri2)) in path.iter().copied().zip(path.iter().copied().skip(1)) {
- if wi2 > wi1 && ri2 > ri1 && wchars[wi1] == rchars[ri1] {
- if !wbuf.is_empty() || !rbuf.is_empty() {
- write!(&mut ret, "[[{}||{}]]", wbuf, rbuf).unwrap();
- wbuf.clear();
- rbuf.clear();
- }
- ret.push(wchars[wi1]);
- } else {
- if wi2 > wi1 {
- wbuf.push(wchars[wi1]);
- }
- if ri2 > ri1 {
- rbuf.push(rchars[ri1]);
- }
- }
- }
- if !wbuf.is_empty() || !rbuf.is_empty() {
- write!(&mut ret, "[[{}||{}]]", wbuf, rbuf).unwrap();
- }
-
- ret += &word_suffix;
- }
- ret += remainder;
-
- // CHECK
- let re = regex::Regex::new(r#"\|\|\w+\]\]"#).unwrap();
- let back_to_ja = re.replace_all(&ret, "").replace("[[", "");
- if ex.ja != back_to_ja {
- eprintln!("!!!! {} != {}", ex.ja, back_to_ja);
- }
-
- ret
-}
-
-fn expl_clean_word(w: &str) -> (&str, Option<&str>) {
- let mut ret = w;
- for delim in ['(', '{', '['] {
- if let Some((s, _)) = ret.split_once(delim) {
- ret = s;
- }
- }
- let p = w
- .split_once('(')
- .and_then(|(_, r)| r.split_once(')'))
- .map(|(p, _)| p);
- (ret, p)
-}
-
fn dict_str_short<'a>(
qkeb: &str,
qreb: Option<&str>,