From ab232ceb32b51ac8553692cf8a2b1f86fa975f7d Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Tue, 28 Nov 2023 16:31:28 +0100 Subject: add furigana override file to fix edge cases --- src/datafiles.rs | 20 ++++++++++++++++++++ src/example.rs | 9 +++++++-- src/main.rs | 5 ++++- 3 files changed, 31 insertions(+), 3 deletions(-) (limited to 'src') diff --git a/src/datafiles.rs b/src/datafiles.rs index 0e526ef..3065fbf 100644 --- a/src/datafiles.rs +++ b/src/datafiles.rs @@ -176,6 +176,26 @@ pub fn read_examples(all_kanji: &Charset) -> Result> { Ok(ret) } +pub fn read_furigana_overrides() -> Result<HashMap<String, String>> { + let file = fs::File::open("data/furigana_overrides")?; + + let mut ret = HashMap::new(); + let re = regex::Regex::new(r#"\|\|\w+\]\]"#)?; + + for line in io::BufReader::new(file).lines() { + let line = line?; + let line = line.trim(); + if !line.is_empty() { + let clean = re.replace_all(line, "").replace("[[", ""); + if clean != line { + ret.insert(clean, line.to_string()); + } + } + } + + Ok(ret) +} + #[derive(Clone, Debug, Serialize, Deserialize, PartialEq)] pub struct JlptVocab { pub level: String, diff --git a/src/example.rs b/src/example.rs index 71f3f13..770ac8a 100644 --- a/src/example.rs +++ b/src/example.rs @@ -1,12 +1,17 @@ -use std::collections::HashSet; +use std::collections::{HashSet, HashMap}; use crate::charset::Charset; use crate::*; impl Example { - pub fn gen_furigana<'a>(&mut self, dict_idx: &DictIndex<'a>) { + pub fn gen_furigana<'a>(&mut self, dict_idx: &DictIndex<'a>, overrides: &HashMap<String, String>) { use std::fmt::Write; + if let Some(v) = overrides.get(&self.ja) { + self.furigana = Some(v.to_string()); + return; + } + let mut remainder = self.ja.as_str(); let mut ret = String::new(); diff --git a/src/main.rs b/src/main.rs index 85b278a..b0c46c0 100644 --- a/src/main.rs +++ b/src/main.rs @@ -146,16 +146,19 @@ fn main() { .expect("parse_jmdict"); let jmdict_idx = index_jmdict(&jmdict); + let overrides =
read_furigana_overrides().expect("read_furigana_overrides"); + for batch in batches.iter_mut() { for ex in batch .examples .iter_mut() .chain(batch.extra_examples.iter_mut()) { - ex.gen_furigana(&jmdict_idx); + ex.gen_furigana(&jmdict_idx, &overrides); } } + save_batches(batches).expect("save_batches"); } Cmd::Format => { -- cgit v1.2.3