use std::collections::{HashMap, HashSet}; use crate::charset::Charset; use crate::*; impl Example { pub fn gen_furigana<'a>( &mut self, dict_idx: &DictIndex<'a>, overrides: &HashMap, ) { use std::fmt::Write; if let Some(v) = overrides.get(&self.ja) { self.furigana = Some(v.to_string()); return; } let mut remainder = self.ja.as_str(); let mut ret = String::new(); for word in self.expl.split(|c| c == ' ' || c == '~') { let (keb, reb) = expl_clean_word(word); let word = word .split_once('{') .and_then(|(_, r)| r.split_once('}')) .map(|(p, _)| p) .unwrap_or(keb); if let Some(i) = remainder.find(word) { ret += &remainder[..i]; remainder = &remainder[i..]; } let mut new_word = String::new(); for c in word.chars() { if remainder.starts_with(c) { remainder = remainder.strip_prefix(c).unwrap(); new_word.push(c); } else { eprintln!("!!!! Char {} is not in remainder !!!!", c); } } let word = &new_word; if !Charset::new(word).intersects(&self.chars) { ret += word; continue; } let reb = match reb { Some(reb) if reb.starts_with('#') => { let ents = dict_idx.get(keb).map(|x| &x[..]).unwrap_or_default(); if let Some(ent) = ents.iter().find(|ent| { let ent_seq = ent.children().find(|x| x.has_tag_name("ent_seq")).unwrap(); ent_seq.text().unwrap().trim() == reb.strip_prefix('#').unwrap() }) { let r_ele = ent.children().find(|x| x.has_tag_name("r_ele")).unwrap(); let reb = r_ele.children().find(|x| x.has_tag_name("reb")).unwrap(); reb.text().unwrap().trim() } else { println!("- entry id not found: {}", reb); ret += &word; continue; } } Some(reb) => reb, None => { let ents = dict_idx.get(keb).map(|x| &x[..]).unwrap_or_default(); let matches = ents .iter() .map(|ent| { let r_ele = ent.children().find(|x| x.has_tag_name("r_ele")).unwrap(); let reb = r_ele.children().find(|x| x.has_tag_name("reb")).unwrap(); reb.text().unwrap().trim() }) .collect::>(); if matches.len() == 1 { *matches.iter().next().unwrap() } else { println!("- word without reb: {}", word); ret += &word; continue; } } }; //println!("+ word: {}, keb: {}, reb: {}", word, keb, reb); let common_cnt = word .chars() .zip(keb.chars()) .take_while(|(x, y)| x == y) .count(); if common_cnt == 0 { // Strange cases write!(&mut ret, "[[{}||{}]]", word, reb).unwrap(); continue; } let keb_suffix = keb.chars().skip(common_cnt).collect::(); let word_suffix = word.chars().skip(common_cnt).collect::(); let reb = reb .strip_suffix(&keb_suffix) .or(reb.strip_suffix(&word_suffix)) .unwrap_or(reb); //println!(" common reb: {}, common word: {}", reb, word.chars().take(common_cnt).collect::()); let wchars = Vec::from_iter(word.chars().take(common_cnt)); let rchars = Vec::from_iter(reb.chars()); // We shall invoke Levhenstein distance let mut dynrow0 = vec![(0, 0, 0, false)]; for ri in 0..rchars.len() { dynrow0.push((0, ri, 100 + ri + 1, false)); } let mut dyntab = vec![dynrow0]; for (wi, wc) in wchars.iter().enumerate() { let mut dynrow = vec![(wi, 0, 100 + wi + 1, false)]; for (ri, rc) in rchars.iter().enumerate() { let mut x = vec![]; if dyntab[wi][ri + 1].3 { x.push((wi, ri + 1, dyntab[wi][ri + 1].2 + 1, true)); } if dynrow[ri].3 { x.push((wi + 1, ri, dynrow[ri].2 + 1, true)); } if wc == rc { x.push((wi, ri, dyntab[wi][ri].2, false)); } else { x.push((wi, ri, dyntab[wi][ri].2 + 1, true)); } dynrow.push(x.into_iter().min_by_key(|(_, _, w, _)| *w).unwrap()); } dyntab.push(dynrow); } //eprintln!("DYN TAB: {:?}", dyntab); let mut path = vec![(wchars.len(), rchars.len())]; loop { let (wi, ri) = *path.last().unwrap(); let (wi2, ri2, _, _) = dyntab[wi][ri]; path.push((wi2, ri2)); if wi2 == 0 && ri2 == 0 { break; } } path.reverse(); //eprintln!("DYN PATH: {:?}", path); let mut wbuf = String::new(); let mut rbuf = String::new(); for ((wi1, ri1), (wi2, ri2)) in path.iter().copied().zip(path.iter().copied().skip(1)) { if wi2 > wi1 && ri2 > ri1 && wchars[wi1] == rchars[ri1] { if !wbuf.is_empty() || !rbuf.is_empty() { write!(&mut ret, "[[{}||{}]]", wbuf, rbuf).unwrap(); wbuf.clear(); rbuf.clear(); } ret.push(wchars[wi1]); } else { if wi2 > wi1 { wbuf.push(wchars[wi1]); } if ri2 > ri1 { rbuf.push(rchars[ri1]); } } } if !wbuf.is_empty() || !rbuf.is_empty() { write!(&mut ret, "[[{}||{}]]", wbuf, rbuf).unwrap(); } ret += &word_suffix; } ret += remainder; // CHECK let re = regex::Regex::new(r#"\|\|\w+\]\]"#).unwrap(); let back_to_ja = re.replace_all(&ret, "").replace("[[", ""); if self.ja != back_to_ja { eprintln!("!!!! {} != {}", self.ja, back_to_ja); } self.furigana = Some(ret); } pub fn furigana_markup(&self) -> String { if let Some(furi) = &self.furigana { furi.replace("[[", "") .replace("||", "") .replace("]]", "") } else { self.ja.to_string() } } } pub fn expl_clean_word(w: &str) -> (&str, Option<&str>) { let mut ret = w; for delim in ['(', '{', '['] { if let Some((s, _)) = ret.split_once(delim) { ret = s; } } let p = w .split_once('(') .and_then(|(_, r)| r.split_once(')')) .map(|(p, _)| p); (ret, p) }