From ab232ceb32b51ac8553692cf8a2b1f86fa975f7d Mon Sep 17 00:00:00 2001
From: Alex Auvolat
Date: Tue, 28 Nov 2023 16:31:28 +0100
Subject: add furigana override file to fix edge cases

---
 data/batches.json       |  2 +-
 data/furigana_overrides |  1 +
 src/datafiles.rs        | 20 ++++++++++++++++++++
 src/example.rs          |  9 +++++++--
 src/main.rs             |  5 ++++-
 5 files changed, 33 insertions(+), 4 deletions(-)
 create mode 100644 data/furigana_overrides

diff --git a/data/batches.json b/data/batches.json
index e01dd28..dcb3730 100644
--- a/data/batches.json
+++ b/data/batches.json
@@ -31916,7 +31916,7 @@
         "ja": "受け付け係は私に無理矢理紙に署名させた。",
         "en": "The receptionist forced me to sign my name on the paper.",
         "expl": "受付係{受け付け係} は 私(わたし)[01] に 無理やり{無理矢理} 紙(かみ) に 署名 為せる{させた}",
-        "furigana": "[[受||うけつけがかり]]け付け係は[[私||わたし]]に[[無理||むり]]矢理[[紙||かみ]]に[[署名||しょめい]]させた。",
+        "furigana": "[[受||う]]け[[付||つ]]け[[係||がかり]]は[[私||わたし]]に[[無理||むり]]矢理[[紙||かみ]]に[[署名||しょめい]]させた。",
         "id": "ID=266187_148372",
         "chars": "付係受名無理矢私紙署"
       },
diff --git a/data/furigana_overrides b/data/furigana_overrides
new file mode 100644
index 0000000..46c3c46
--- /dev/null
+++ b/data/furigana_overrides
@@ -0,0 +1 @@
+[[受||う]]け[[付||つ]]け[[係||がかり]]は[[私||わたし]]に[[無理||むり]]矢理[[紙||かみ]]に[[署名||しょめい]]させた。
diff --git a/src/datafiles.rs b/src/datafiles.rs
index 0e526ef..3065fbf 100644
--- a/src/datafiles.rs
+++ b/src/datafiles.rs
@@ -176,6 +176,26 @@ pub fn read_examples(all_kanji: &Charset) -> Result> {
     Ok(ret)
 }
 
+pub fn read_furigana_overrides() -> Result<HashMap<String, String>> {
+    let file = fs::File::open("data/furigana_overrides")?;
+
+    let mut ret = HashMap::new();
+    let re = regex::Regex::new(r#"\|\|\w+\]\]"#)?;
+
+    for line in io::BufReader::new(file).lines() {
+        let line = line?;
+        let line = line.trim();
+        if !line.is_empty() {
+            let clean = re.replace_all(line, "").replace("[[", "");
+            if clean != line {
+                ret.insert(clean, line.to_string());
+            }
+        }
+    }
+
+    Ok(ret)
+}
+
 #[derive(Clone, Debug, Serialize, Deserialize, PartialEq)]
 pub struct JlptVocab {
     pub level: String,
diff --git a/src/example.rs b/src/example.rs
index 71f3f13..770ac8a 100644
--- a/src/example.rs
+++ b/src/example.rs
@@ -1,12 +1,17 @@
-use std::collections::HashSet;
+use std::collections::{HashSet, HashMap};
 
 use crate::charset::Charset;
 use crate::*;
 
 impl Example {
-    pub fn gen_furigana<'a>(&mut self, dict_idx: &DictIndex<'a>) {
+    pub fn gen_furigana<'a>(&mut self, dict_idx: &DictIndex<'a>, overrides: &HashMap<String, String>) {
         use std::fmt::Write;
 
+        if let Some(v) = overrides.get(&self.ja) {
+            self.furigana = Some(v.to_string());
+            return;
+        }
+
         let mut remainder = self.ja.as_str();
         let mut ret = String::new();
 
diff --git a/src/main.rs b/src/main.rs
index 85b278a..b0c46c0 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -146,16 +146,19 @@ fn main() {
                 .expect("parse_jmdict");
             let jmdict_idx = index_jmdict(&jmdict);
 
+            let overrides = read_furigana_overrides().expect("read_furigana_overrides");
+
             for batch in batches.iter_mut() {
                 for ex in batch
                     .examples
                     .iter_mut()
                     .chain(batch.extra_examples.iter_mut())
                 {
-                    ex.gen_furigana(&jmdict_idx);
+                    ex.gen_furigana(&jmdict_idx, &overrides);
                 }
             }
 
+
             save_batches(batches).expect("save_batches");
         }
         Cmd::Format => {
-- 
cgit v1.2.3