aboutsummaryrefslogblamecommitdiff
path: root/src/example.rs
blob: 7d20a28e25f0666345a798800cdf07924a1dd888 (plain) (tree)
1
2
3
4
5
6
7
8
9
10
11
                                         




                            




                                            

                            




                                                  

































































































































































































                                                                                                                          
use std::collections::{HashMap, HashSet};

use crate::charset::Charset;
use crate::*;

impl Example {
    /// Builds the furigana annotation for this example and stores it in `self.furigana`.
    ///
    /// If `overrides` contains an entry keyed by `self.ja`, that value is stored as-is
    /// and nothing else is done. Otherwise `self.expl` is split on spaces and `~`, each
    /// resulting word is located in `self.ja`, and the output string is assembled from:
    ///
    /// * text of `self.ja` that lies between words, copied unchanged;
    /// * words for which `Charset::new(word).intersects(&self.chars)` is false, copied
    ///   unchanged;
    /// * the remaining words, annotated with their reading as `[[text||reading]]`
    ///   groups, with characters that are identical in text and reading left bare.
    ///
    /// The reading of a word is taken from the parenthesised part returned by
    /// [`expl_clean_word`]; when that part starts with `#` it is compared against the
    /// `ent_seq` text of the entries `dict_idx` holds for the headword. Without a
    /// parenthesised part, the entries for the headword are used only if they all yield
    /// the same reading.
    ///
    /// Problems are only reported on stdout/stderr; the method returns nothing and
    /// always sets `self.furigana` to `Some(..)`.
    ///
    /// # Panics
    ///
    /// Panics if a looked-up dictionary entry has no `ent_seq`, `r_ele` or `reb` child,
    /// or if such a node has no text (all of these are `unwrap`ped).
    pub fn gen_furigana<'a>(
        &mut self,
        dict_idx: &DictIndex<'a>,
        overrides: &HashMap<String, String>,
    ) {
        use std::fmt::Write;

        // A manual override for the whole sentence wins over everything else.
        if let Some(v) = overrides.get(&self.ja) {
            self.furigana = Some(v.to_string());
            return;
        }

        // `remainder` is the part of `self.ja` not yet copied into `ret`.
        let mut remainder = self.ja.as_str();
        let mut ret = String::new();

        for word in self.expl.split(|c| c == ' ' || c == '~') {
            // `keb` is the word up to its first `(`, `{` or `[`; `reb` is the content of
            // the first `(...)`, if any.
            let (keb, reb) = expl_clean_word(word);
            // The text to look for in the sentence: the content of `{...}` when present,
            // otherwise the headword itself.
            let word = word
                .split_once('{')
                .and_then(|(_, r)| r.split_once('}'))
                .map(|(p, _)| p)
                .unwrap_or(keb);

            // If the word occurs in the remaining sentence, copy whatever precedes it
            // verbatim and move `remainder` to the start of the word.
            if let Some(i) = remainder.find(word) {
                ret += &remainder[..i];
                remainder = &remainder[i..];
            }

            // Consume the word from the front of `remainder` one character at a time.
            // Characters that do not match are reported and left out of `new_word`, so
            // `new_word` only ever contains text actually taken from the sentence.
            let mut new_word = String::new();
            for c in word.chars() {
                if remainder.starts_with(c) {
                    remainder = remainder.strip_prefix(c).unwrap();
                    new_word.push(c);
                } else {
                    eprintln!("!!!! Char {} is not in remainder !!!!", c);
                }
            }
            let word = &new_word;

            // Words that share nothing with `self.chars` are copied without annotation.
            // NOTE(review): presumably `self.chars` is the set of characters that need
            // a reading — confirm against `Charset`.
            if !Charset::new(word).intersects(&self.chars) {
                ret += word;
                continue;
            }

            // Determine the reading for this word.
            let reb = match reb {
                // `(#<id>)`: pick the entry whose `ent_seq` text equals the id and use
                // the first `reb` of its first `r_ele`.
                Some(reb) if reb.starts_with('#') => {
                    let ents = dict_idx.get(keb).map(|x| &x[..]).unwrap_or_default();
                    if let Some(ent) = ents.iter().find(|ent| {
                        let ent_seq = ent.children().find(|x| x.has_tag_name("ent_seq")).unwrap();
                        ent_seq.text().unwrap().trim() == reb.strip_prefix('#').unwrap()
                    }) {
                        let r_ele = ent.children().find(|x| x.has_tag_name("r_ele")).unwrap();
                        let reb = r_ele.children().find(|x| x.has_tag_name("reb")).unwrap();
                        reb.text().unwrap().trim()
                    } else {
                        // No entry with that id: give up on annotating this word.
                        println!("- entry id not found: {}", reb);
                        ret += &word;
                        continue;
                    }
                }
                // An explicit reading was given in parentheses.
                Some(reb) => reb,
                // No reading given: collect the first reading of every entry for the
                // headword and accept it only when they all agree (exactly one distinct
                // value). Zero entries or conflicting readings leave the word bare.
                None => {
                    let ents = dict_idx.get(keb).map(|x| &x[..]).unwrap_or_default();
                    let matches = ents
                        .iter()
                        .map(|ent| {
                            let r_ele = ent.children().find(|x| x.has_tag_name("r_ele")).unwrap();
                            let reb = r_ele.children().find(|x| x.has_tag_name("reb")).unwrap();
                            reb.text().unwrap().trim()
                        })
                        .collect::<HashSet<_>>();
                    if matches.len() == 1 {
                        *matches.iter().next().unwrap()
                    } else {
                        println!("- word without reb: {}", word);
                        ret += &word;
                        continue;
                    }
                }
            };

            //println!("+ word: {}, keb: {}, reb: {}", word, keb, reb);
            // Number of leading characters the sentence form shares with the headword.
            let common_cnt = word
                .chars()
                .zip(keb.chars())
                .take_while(|(x, y)| x == y)
                .count();
            if common_cnt == 0 {
                // Strange cases
                // Nothing in common with the headword: attach the whole reading to the
                // whole word as a single group.
                write!(&mut ret, "[[{}||{}]]", word, reb).unwrap();
                continue;
            }

            // Only the common prefix is aligned with the reading. Cut the trailing part
            // off the reading: first try the headword's leftover suffix, then the
            // sentence form's; if neither is a suffix of the reading, keep it whole.
            let keb_suffix = keb.chars().skip(common_cnt).collect::<String>();
            let word_suffix = word.chars().skip(common_cnt).collect::<String>();
            let reb = reb
                .strip_suffix(&keb_suffix)
                .or(reb.strip_suffix(&word_suffix))
                .unwrap_or(reb);
            //println!("        common reb: {}, common word: {}", reb, word.chars().take(common_cnt).collect::<String>());

            let wchars = Vec::from_iter(word.chars().take(common_cnt));
            let rchars = Vec::from_iter(reb.chars());

            // Align the word characters with the reading characters using a
            // Levenshtein-style edit-distance table.
            //
            // `dyntab[w][r]` describes the best way to reach "w word chars and r reading
            // chars consumed" as `(prev_w, prev_r, cost, flag)`: a back-pointer to the
            // previous cell, the accumulated cost, and a flag that is `true` when the
            // step into this cell was not a matching diagonal.
            //
            // Border cells (first row / first column) carry a cost of 100 + distance and
            // `flag == false`.
            // NOTE(review): the 100 offset presumably makes skipping characters at the
            // very start of either string a last resort — confirm intent.
            let mut dynrow0 = vec![(0, 0, 0, false)];
            for ri in 0..rchars.len() {
                dynrow0.push((0, ri, 100 + ri + 1, false));
            }
            let mut dyntab = vec![dynrow0];

            for (wi, wc) in wchars.iter().enumerate() {
                // `dynrow` is row `wi + 1` of the table; `dyntab[wi]` is the row above.
                let mut dynrow = vec![(wi, 0, 100 + wi + 1, false)];

                for (ri, rc) in rchars.iter().enumerate() {
                    // Candidates for cell (wi + 1, ri + 1). The push order matters:
                    // `min_by_key` returns the first of several equally cheap candidates.
                    let mut x = vec![];
                    // Step from the cell above (consume a word char only); allowed only
                    // if that cell is itself flagged.
                    if dyntab[wi][ri + 1].3 {
                        x.push((wi, ri + 1, dyntab[wi][ri + 1].2 + 1, true));
                    }
                    // Step from the cell to the left (consume a reading char only);
                    // allowed only if that cell is itself flagged.
                    if dynrow[ri].3 {
                        x.push((wi + 1, ri, dynrow[ri].2 + 1, true));
                    }
                    // Diagonal step: free and unflagged when the characters are equal,
                    // otherwise cost + 1 and flagged. Always present, so `x` is never
                    // empty and the `unwrap` below cannot fail.
                    if wc == rc {
                        x.push((wi, ri, dyntab[wi][ri].2, false));
                    } else {
                        x.push((wi, ri, dyntab[wi][ri].2 + 1, true));
                    }
                    dynrow.push(x.into_iter().min_by_key(|(_, _, w, _)| *w).unwrap());
                }
                dyntab.push(dynrow);
            }
            //eprintln!("DYN TAB: {:?}", dyntab);

            // Follow the back-pointers from the bottom-right cell to (0, 0), then
            // reverse so the path runs from start to end.
            let mut path = vec![(wchars.len(), rchars.len())];
            loop {
                let (wi, ri) = *path.last().unwrap();
                let (wi2, ri2, _, _) = dyntab[wi][ri];
                path.push((wi2, ri2));
                if wi2 == 0 && ri2 == 0 {
                    break;
                }
            }
            path.reverse();
            //eprintln!("DYN PATH: {:?}", path);

            // Walk the path step by step. `wbuf`/`rbuf` collect the word and reading
            // characters of the group currently being built.
            let mut wbuf = String::new();
            let mut rbuf = String::new();
            for ((wi1, ri1), (wi2, ri2)) in path.iter().copied().zip(path.iter().copied().skip(1)) {
                if wi2 > wi1 && ri2 > ri1 && wchars[wi1] == rchars[ri1] {
                    // Diagonal step over identical characters: close any pending group
                    // and emit the character without annotation.
                    if !wbuf.is_empty() || !rbuf.is_empty() {
                        write!(&mut ret, "[[{}||{}]]", wbuf, rbuf).unwrap();
                        wbuf.clear();
                        rbuf.clear();
                    }
                    ret.push(wchars[wi1]);
                } else {
                    // Any other step: add whichever characters this step consumed to the
                    // pending group.
                    if wi2 > wi1 {
                        wbuf.push(wchars[wi1]);
                    }
                    if ri2 > ri1 {
                        rbuf.push(rchars[ri1]);
                    }
                }
            }
            // Flush the group still pending at the end of the path.
            if !wbuf.is_empty() || !rbuf.is_empty() {
                write!(&mut ret, "[[{}||{}]]", wbuf, rbuf).unwrap();
            }

            // The part of the word beyond the common prefix is appended unannotated.
            ret += &word_suffix;
        }
        // Whatever is left of the sentence after the last word.
        ret += remainder;

        // CHECK
        // Sanity check: removing every `||reading]]` tail and every `[[` should give
        // back the original sentence. The pattern only matches readings made of one or
        // more `\w` characters. A mismatch is only logged; the result is stored anyway.
        let re = regex::Regex::new(r#"\|\|\w+\]\]"#).unwrap();
        let back_to_ja = re.replace_all(&ret, "").replace("[[", "");
        if self.ja != back_to_ja {
            eprintln!("!!!! {} != {}", self.ja, back_to_ja);
        }

        self.furigana = Some(ret);
    }

    /// Renders the example as HTML ruby markup.
    ///
    /// When `self.furigana` is set, its `[[text||reading]]` groups are turned into
    /// `<ruby>text<rt>reading</rt></ruby>` by plain string replacement of the three
    /// markers (`[[`, `||`, `]]`); no other escaping is performed. When it is `None`,
    /// a copy of `self.ja` is returned.
    pub fn furigana_markup(&self) -> String {
        if let Some(furi) = &self.furigana {
            furi.replace("[[", "<ruby>")
                .replace("||", "<rt>")
                .replace("]]", "</rt></ruby>")
        } else {
            self.ja.to_string()
        }
    }
}

/// Splits one annotated word into its headword and optional parenthesised part.
///
/// Returns a pair of slices borrowed from `w`:
///
/// * the text before the first `(`, `{` or `[` (all of `w` when none occurs);
/// * the text between the first `(` and the first `)` that follows it, or `None`
///   when `w` has no `(` or no `)` after it.
pub fn expl_clean_word(w: &str) -> (&str, Option<&str>) {
    // The headword ends at the earliest annotation opener, whichever kind it is.
    let headword = match w.find(|c: char| matches!(c, '(' | '{' | '[')) {
        Some(cut) => &w[..cut],
        None => w,
    };

    // The parenthesised part is looked up in the full input, not in the headword.
    let parenthesised = w.find('(').and_then(|open| {
        // '(' is one byte in UTF-8, so `open + 1` is a valid char boundary.
        let after_open = &w[open + 1..];
        after_open.find(')').map(|close| &after_open[..close])
    });

    (headword, parenthesised)
}