aboutsummaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
author Alex Auvolat <alex@adnab.me> 2023-11-28 16:31:28 +0100
committer Alex Auvolat <alex@adnab.me> 2023-11-28 16:31:28 +0100
commit ab232ceb32b51ac8553692cf8a2b1f86fa975f7d (patch)
tree b159521d949ebe8e2e7913a8a7c39d116a32ffb2 /src
parent b78034ad5bf65f1dfe390861f72bed827e2ab1b8 (diff)
download datagengo-ab232ceb32b51ac8553692cf8a2b1f86fa975f7d.tar.gz
datagengo-ab232ceb32b51ac8553692cf8a2b1f86fa975f7d.zip
add furigana override file to fix edge cases
Diffstat (limited to 'src')
-rw-r--r-- src/datafiles.rs 20
-rw-r--r-- src/example.rs 9
-rw-r--r-- src/main.rs 5
3 files changed, 31 insertions, 3 deletions
diff --git a/src/datafiles.rs b/src/datafiles.rs
index 0e526ef..3065fbf 100644
--- a/src/datafiles.rs
+++ b/src/datafiles.rs
@@ -176,6 +176,26 @@ pub fn read_examples(all_kanji: &Charset) -> Result<Vec<Example>> {
Ok(ret)
}
+pub fn read_furigana_overrides() -> Result<HashMap<String, String>> { // Builds a map from stripped text to the annotated line, read from "data/furigana_overrides".
+ let file = fs::File::open("data/furigana_overrides")?; // Open failure is propagated to the caller via `?`.
+ // Keys are the annotation-free text; values are the trimmed lines as written in the file.
+ let mut ret = HashMap::new();
+ let re = regex::Regex::new(r#"\|\|\w+\]\]"#)?; // Matches a literal `||`, one or more word characters, then a literal `]]`.
+ // NOTE(review): presumably this targets the `||reading]]` tail of a `[[base||reading]]` furigana annotation; confirm against the data file.
+ for line in io::BufReader::new(file).lines() {
+ let line = line?; // A read error on any line aborts the whole load.
+ let line = line.trim();
+ if !line.is_empty() {
+ let clean = re.replace_all(line, "").replace("[[", ""); // Drop every regex match, then every remaining `[[`.
+ if clean != line { // Only keep lines where the stripping removed something.
+ ret.insert(clean, line.to_string()); // A later line with the same stripped text replaces the earlier entry.
+ }
+ }
+ }
+
+ Ok(ret)
+}
+
#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)]
pub struct JlptVocab {
pub level: String,
diff --git a/src/example.rs b/src/example.rs
index 71f3f13..770ac8a 100644
--- a/src/example.rs
+++ b/src/example.rs
@@ -1,12 +1,17 @@
-use std::collections::HashSet;
+use std::collections::{HashSet, HashMap};
use crate::charset::Charset;
use crate::*;
impl Example {
- pub fn gen_furigana<'a>(&mut self, dict_idx: &DictIndex<'a>) {
+ pub fn gen_furigana<'a>(&mut self, dict_idx: &DictIndex<'a>, overrides: &HashMap<String, String>) {
use std::fmt::Write;
+ if let Some(v) = overrides.get(&self.ja) {
+ self.furigana = Some(v.to_string());
+ return;
+ }
+
let mut remainder = self.ja.as_str();
let mut ret = String::new();
diff --git a/src/main.rs b/src/main.rs
index 85b278a..b0c46c0 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -146,16 +146,19 @@ fn main() {
.expect("parse_jmdict");
let jmdict_idx = index_jmdict(&jmdict);
+ let overrides = read_furigana_overrides().expect("read_furigana_overrides");
+
for batch in batches.iter_mut() {
for ex in batch
.examples
.iter_mut()
.chain(batch.extra_examples.iter_mut())
{
- ex.gen_furigana(&jmdict_idx);
+ ex.gen_furigana(&jmdict_idx, &overrides);
}
}
+
save_batches(batches).expect("save_batches");
}
Cmd::Format => {