diff options
author | Alex Auvolat <alex@adnab.me> | 2023-11-28 16:31:28 +0100 |
---|---|---|
committer | Alex Auvolat <alex@adnab.me> | 2023-11-28 16:31:28 +0100 |
commit | ab232ceb32b51ac8553692cf8a2b1f86fa975f7d (patch) | |
tree | b159521d949ebe8e2e7913a8a7c39d116a32ffb2 /src | |
parent | b78034ad5bf65f1dfe390861f72bed827e2ab1b8 (diff) | |
download | datagengo-ab232ceb32b51ac8553692cf8a2b1f86fa975f7d.tar.gz datagengo-ab232ceb32b51ac8553692cf8a2b1f86fa975f7d.zip |
add furigana override file to fix edge cases
Diffstat (limited to 'src')
-rw-r--r-- | src/datafiles.rs | 20 | ||||
-rw-r--r-- | src/example.rs | 9 | ||||
-rw-r--r-- | src/main.rs | 5 |
3 files changed, 31 insertions, 3 deletions
```diff
diff --git a/src/datafiles.rs b/src/datafiles.rs
index 0e526ef..3065fbf 100644
--- a/src/datafiles.rs
+++ b/src/datafiles.rs
@@ -176,6 +176,26 @@ pub fn read_examples(all_kanji: &Charset) -> Result<Vec<Example>> {
     Ok(ret)
 }
 
+pub fn read_furigana_overrides() -> Result<HashMap<String, String>> {
+    let file = fs::File::open("data/furigana_overrides")?;
+
+    let mut ret = HashMap::new();
+    let re = regex::Regex::new(r#"\|\|\w+\]\]"#)?;
+
+    for line in io::BufReader::new(file).lines() {
+        let line = line?;
+        let line = line.trim();
+        if !line.is_empty() {
+            let clean = re.replace_all(line, "").replace("[[", "");
+            if clean != line {
+                ret.insert(clean, line.to_string());
+            }
+        }
+    }
+
+    Ok(ret)
+}
+
 #[derive(Clone, Debug, Serialize, Deserialize, PartialEq)]
 pub struct JlptVocab {
     pub level: String,
diff --git a/src/example.rs b/src/example.rs
index 71f3f13..770ac8a 100644
--- a/src/example.rs
+++ b/src/example.rs
@@ -1,12 +1,17 @@
-use std::collections::HashSet;
+use std::collections::{HashSet, HashMap};
 
 use crate::charset::Charset;
 use crate::*;
 
 impl Example {
-    pub fn gen_furigana<'a>(&mut self, dict_idx: &DictIndex<'a>) {
+    pub fn gen_furigana<'a>(&mut self, dict_idx: &DictIndex<'a>, overrides: &HashMap<String, String>) {
         use std::fmt::Write;
 
+        if let Some(v) = overrides.get(&self.ja) {
+            self.furigana = Some(v.to_string());
+            return;
+        }
+
         let mut remainder = self.ja.as_str();
         let mut ret = String::new();
 
diff --git a/src/main.rs b/src/main.rs
index 85b278a..b0c46c0 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -146,16 +146,19 @@ fn main() {
                 .expect("parse_jmdict");
             let jmdict_idx = index_jmdict(&jmdict);
 
+            let overrides = read_furigana_overrides().expect("read_furigana_overrides");
+
             for batch in batches.iter_mut() {
                 for ex in batch
                     .examples
                     .iter_mut()
                     .chain(batch.extra_examples.iter_mut())
                 {
-                    ex.gen_furigana(&jmdict_idx);
+                    ex.gen_furigana(&jmdict_idx, &overrides);
                 }
             }
 
+
             save_batches(batches).expect("save_batches");
         }
         Cmd::Format => {
```