aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Alex Auvolat <alex@adnab.me> 2023-11-28 16:31:28 +0100
committer Alex Auvolat <alex@adnab.me> 2023-11-28 16:31:28 +0100
commit ab232ceb32b51ac8553692cf8a2b1f86fa975f7d (patch)
tree b159521d949ebe8e2e7913a8a7c39d116a32ffb2
parent b78034ad5bf65f1dfe390861f72bed827e2ab1b8 (diff)
download datagengo-ab232ceb32b51ac8553692cf8a2b1f86fa975f7d.tar.gz
datagengo-ab232ceb32b51ac8553692cf8a2b1f86fa975f7d.zip
add furigana override file to fix edge cases
-rw-r--r-- data/batches.json 2
-rw-r--r-- data/furigana_overrides 1
-rw-r--r-- src/datafiles.rs 20
-rw-r--r-- src/example.rs 9
-rw-r--r-- src/main.rs 5
5 files changed, 33 insertions, 4 deletions
diff --git a/data/batches.json b/data/batches.json
index e01dd28..dcb3730 100644
--- a/data/batches.json
+++ b/data/batches.json
@@ -31916,7 +31916,7 @@
"ja": "受け付け係は私に無理矢理紙に署名させた。",
"en": "The receptionist forced me to sign my name on the paper.",
"expl": "受付係{受け付け係} は 私(わたし)[01] に 無理やり{無理矢理} 紙(かみ) に 署名 為せる{させた}",
- "furigana": "[[受||うけつけがかり]]け付け係は[[私||わたし]]に[[無理||むり]]矢理[[紙||かみ]]に[[署名||しょめい]]させた。",
+ "furigana": "[[受||う]]け[[付||つ]]け[[係||がかり]]は[[私||わたし]]に[[無理||むり]]矢理[[紙||かみ]]に[[署名||しょめい]]させた。",
"id": "ID=266187_148372",
"chars": "付係受名無理矢私紙署"
},
diff --git a/data/furigana_overrides b/data/furigana_overrides
new file mode 100644
index 0000000..46c3c46
--- /dev/null
+++ b/data/furigana_overrides
@@ -0,0 +1 @@
+[[受||う]]け[[付||つ]]け[[係||がかり]]は[[私||わたし]]に[[無理||むり]]矢理[[紙||かみ]]に[[署名||しょめい]]させた。
diff --git a/src/datafiles.rs b/src/datafiles.rs
index 0e526ef..3065fbf 100644
--- a/src/datafiles.rs
+++ b/src/datafiles.rs
@@ -176,6 +176,26 @@ pub fn read_examples(all_kanji: &Charset) -> Result<Vec<Example>> {
Ok(ret)
}
+/// Loads the hand-written furigana overrides from `data/furigana_overrides`.
+///
+/// Every non-empty line of that file is expected to be a sentence annotated
+/// with `[[text||reading]]` markup. The markup is stripped to recover the plain
+/// sentence, which becomes the map key; the trimmed annotated line is stored as
+/// the value. Lines on which stripping changes nothing (no markup present) are
+/// ignored.
+///
+/// # Errors
+///
+/// Returns an error if the file cannot be opened, if a line cannot be read, or
+/// if the stripping pattern fails to compile.
+pub fn read_furigana_overrides() -> Result<HashMap<String, String>> {
+    let reader = io::BufReader::new(fs::File::open("data/furigana_overrides")?);
+
+    // Matches the `||reading]]` tail of one annotation.
+    let reading_tail = regex::Regex::new(r#"\|\|\w+\]\]"#)?;
+
+    let mut overrides = HashMap::new();
+    for raw in reader.lines() {
+        let raw = raw?;
+        let annotated = raw.trim();
+        if annotated.is_empty() {
+            continue;
+        }
+
+        // Remove the reading tails first, then the leftover opening brackets,
+        // which leaves only the base text of the sentence.
+        let plain = reading_tail.replace_all(annotated, "").replace("[[", "");
+        if plain != annotated {
+            overrides.insert(plain, annotated.to_string());
+        }
+    }
+
+    Ok(overrides)
+}
+
#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)]
pub struct JlptVocab {
pub level: String,
diff --git a/src/example.rs b/src/example.rs
index 71f3f13..770ac8a 100644
--- a/src/example.rs
+++ b/src/example.rs
@@ -1,12 +1,17 @@
-use std::collections::HashSet;
+use std::collections::{HashSet, HashMap};
use crate::charset::Charset;
use crate::*;
impl Example {
- pub fn gen_furigana<'a>(&mut self, dict_idx: &DictIndex<'a>) {
+ pub fn gen_furigana<'a>(&mut self, dict_idx: &DictIndex<'a>, overrides: &HashMap<String, String>) {
use std::fmt::Write;
+ if let Some(v) = overrides.get(&self.ja) {
+ self.furigana = Some(v.to_string());
+ return;
+ }
+
let mut remainder = self.ja.as_str();
let mut ret = String::new();
diff --git a/src/main.rs b/src/main.rs
index 85b278a..b0c46c0 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -146,16 +146,19 @@ fn main() {
.expect("parse_jmdict");
let jmdict_idx = index_jmdict(&jmdict);
+ let overrides = read_furigana_overrides().expect("read_furigana_overrides");
+
for batch in batches.iter_mut() {
for ex in batch
.examples
.iter_mut()
.chain(batch.extra_examples.iter_mut())
{
- ex.gen_furigana(&jmdict_idx);
+ ex.gen_furigana(&jmdict_idx, &overrides);
}
}
+
save_batches(batches).expect("save_batches");
}
Cmd::Format => {