aboutsummaryrefslogtreecommitdiff
path: root/src/datafiles.rs
diff options
context:
space:
mode:
author: Alex Auvolat <alex@adnab.me> 2023-11-27 17:26:59 +0100
committer: Alex Auvolat <alex@adnab.me> 2023-11-27 17:26:59 +0100
commit: d2a46c25219c21ac4f128da8512302935654d38e (patch)
tree: a6d66ac4639e4d68fe57f9e8da72b08ecfb14d9f /src/datafiles.rs
parent: b15723f33b486124a50408873d30998bb9d31b3b (diff)
download: datagengo-d2a46c25219c21ac4f128da8512302935654d38e.tar.gz
datagengo-d2a46c25219c21ac4f128da8512302935654d38e.zip
split code into several files
Diffstat (limited to 'src/datafiles.rs')
-rw-r--r-- src/datafiles.rs 321
1 files changed, 321 insertions, 0 deletions
diff --git a/src/datafiles.rs b/src/datafiles.rs
new file mode 100644
index 0000000..629badf
--- /dev/null
+++ b/src/datafiles.rs
@@ -0,0 +1,321 @@
+use std::collections::HashMap;
+use std::fs;
+use std::io::{self, BufRead};
+
+use anyhow::Result;
+use serde::{Deserialize, Serialize};
+
+use crate::charset::Charset;
+
+/// One example sentence, as built by `read_examples` from an `A:` line and the
+/// `B:` line that follows it.
+#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
+pub struct Example {
+    /// Text of the `A:` line before the tab separator.
+    pub ja: String,
+    /// Text of the `A:` line after the tab, up to the first `#` when there is one.
+    pub en: String,
+    /// Contents of the `B:` line, without its `B: ` prefix.
+    pub expl: String,
+    /// Whatever follows the first `#` on the `A:` line, if any.
+    pub id: Option<String>,
+    /// `Charset::new(ja).inter(all_kanji)`, computed when the example is read.
+    pub chars: Charset,
+}
+
+// =====================================================================
+// PARSING DATA FILES
+// =====================================================================
+
+/// Maps a trimmed `<keb>` text to every `<entry>` node that contains it.
+pub type DictIndex<'a> = HashMap<&'a str, Vec<roxmltree::Node<'a, 'a>>>;
+
+/// Builds a [`DictIndex`] over the `<entry>` children of the document's
+/// `<JMdict>` element.
+///
+/// For every `<k_ele>` child of an entry, the first `<keb>` child (if any)
+/// provides the key; the entry node is appended to that key's list.
+///
+/// # Panics
+///
+/// Panics if the document root has no `<JMdict>` child, or if a `<keb>`
+/// element has no text.
+pub fn index_jmdict<'a>(dict: &'a roxmltree::Document) -> DictIndex<'a> {
+    let jmdict = dict
+        .root()
+        .children()
+        .find(|node| node.has_tag_name("JMdict"))
+        .unwrap();
+
+    let mut index: DictIndex<'a> = HashMap::new();
+    for entry in jmdict.children().filter(|node| node.has_tag_name("entry")) {
+        // At most one <keb> is taken per <k_ele>: the first one found.
+        let kebs = entry
+            .children()
+            .filter(|node| node.has_tag_name("k_ele"))
+            .filter_map(|k_ele| k_ele.children().find(|node| node.has_tag_name("keb")));
+        for keb in kebs {
+            let key = keb.text().unwrap().trim();
+            index.entry(key).or_default().push(entry);
+        }
+    }
+
+    index
+}
+
+/// Groups the characters of `data/kanjidic2.xml` by JLPT level and school grade.
+///
+/// For each `<character>` element the `<literal>` text and the `<jlpt>` and
+/// `<grade>` values found under `<misc>` are read (values that do not parse as
+/// `i32` are treated as absent). Two adjustments are then applied:
+///
+/// * grades `<= 6` are merged into the single value `7`;
+/// * `jlpt` 4 becomes 5, 3 becomes 4, and 2 becomes 3 when the character is
+///   listed in `data/n3_kanji.txt`; other values are kept as-is (presumably this
+///   converts the older four-level numbering to N5..N1 -- TODO confirm).
+///
+/// The resulting groups are ordered by decreasing JLPT number, then increasing
+/// grade, with a missing grade sorting as 15; groups without a JLPT value come
+/// last. Each group is named `N{jlpt}` (or `N0` without JLPT) followed by `a`
+/// for grade 7, `b` for grade 8, `-{grade}` for any other grade, or `+` when
+/// there is no grade. Each group's charset is passed through `Charset::diff`
+/// against the union of all previously emitted groups.
+///
+/// # Errors
+///
+/// Returns an error if either data file cannot be read, if the XML does not
+/// parse, if its root element is not `<kanjidic2>`, or if a `<literal>` is not
+/// exactly one character long.
+pub fn parse_kanjidic() -> Result<Vec<(String, Charset)>> {
+    let n3_kanji = Charset::new(&fs::read_to_string("data/n3_kanji.txt")?.trim());
+
+    let file = fs::read_to_string("data/kanjidic2.xml")?;
+    let xml = roxmltree::Document::parse(&file)?;
+    // `root_element` skips any comment or processing instruction that precedes
+    // the root element, which `root().first_child()` would trip over.
+    let kanjidic = xml.root_element();
+    anyhow::ensure!(
+        kanjidic.has_tag_name("kanjidic2"),
+        "data/kanjidic2.xml: root element is not <kanjidic2>"
+    );
+
+    let mut levels: HashMap<(Option<i32>, Option<i32>), String> = HashMap::new();
+
+    for character in kanjidic.children().filter(|n| n.has_tag_name("character")) {
+        let mut literal = None;
+        let mut jlpt = None;
+        let mut grade = None;
+        for field in character.children() {
+            if field.has_tag_name("literal") {
+                literal = field.text();
+            }
+            if field.has_tag_name("misc") {
+                for misc in field.children() {
+                    if misc.has_tag_name("jlpt") {
+                        jlpt = misc.text().and_then(|t| t.parse::<i32>().ok());
+                    }
+                    if misc.has_tag_name("grade") {
+                        grade = misc.text().and_then(|t| t.parse::<i32>().ok());
+                    }
+                }
+            }
+        }
+
+        // Grades up to 6 are not distinguished from each other: fold them into 7.
+        let grade = grade.map(|g| if g <= 6 { 7 } else { g });
+
+        if let Some(lit) = literal {
+            let mut lit_chars = lit.chars();
+            let ch = match (lit_chars.next(), lit_chars.next()) {
+                (Some(c), None) => c,
+                _ => anyhow::bail!(
+                    "data/kanjidic2.xml: <literal> {:?} is not exactly one character",
+                    lit
+                ),
+            };
+            let jlpt = match jlpt {
+                Some(4) => Some(5),
+                Some(3) => Some(4),
+                Some(2) if n3_kanji.contains(ch) => Some(3),
+                other => other,
+            };
+            levels.entry((jlpt, grade)).or_default().push(ch);
+        }
+    }
+
+    let mut levels = levels.into_iter().collect::<Vec<_>>();
+    // Higher JLPT numbers first, then lower grades; "no grade" counts as 15 and
+    // "no JLPT level" is pushed to the end with an offset of 1000.
+    levels.sort_by_key(|((j, g), _)| match (j, g) {
+        (Some(j), Some(g)) => (10 - *j) * 20 + *g,
+        (Some(j), None) => (10 - *j) * 20 + 15,
+        (None, Some(g)) => 1000 + *g,
+        (None, None) => 1015,
+    });
+
+    let mut ret = Vec::new();
+    // Union of every charset emitted so far.
+    let mut pc = Charset::default();
+    for ((j, g), chars) in levels.into_iter() {
+        let name = match (j, g) {
+            (Some(j), Some(7)) => format!("N{}a", j),
+            (Some(j), Some(8)) => format!("N{}b", j),
+            (Some(j), Some(g)) => format!("N{}-{}", j, g),
+            (Some(j), None) => format!("N{}+", j),
+            (None, Some(7)) => "N0a".to_string(),
+            (None, Some(8)) => "N0b".to_string(),
+            (None, Some(g)) => format!("N0-{}", g),
+            (None, None) => "N0+".to_string(),
+        };
+        let chars = Charset::new(chars).diff(&pc);
+        pc = pc.union(&chars);
+        ret.push((name, chars));
+    }
+
+    Ok(ret)
+}
+
+/// Reads `data/kanji_levels.txt` as a list of `(level, kanji)` pairs.
+///
+/// Each line is split at its first `": "`; lines without that separator are
+/// skipped.
+///
+/// # Errors
+///
+/// Returns an error if the file cannot be read.
+pub fn read_kanji_levels() -> Result<Vec<(String, String)>> {
+    let contents = fs::read_to_string("data/kanji_levels.txt")?;
+
+    let mut levels = Vec::new();
+    for line in contents.lines() {
+        if let Some((name, kanji)) = line.split_once(": ") {
+            levels.push((name.to_string(), kanji.to_string()));
+        }
+    }
+
+    Ok(levels)
+}
+
+/// Reads the example sentences stored in `data/examples.utf`.
+///
+/// The most recent line starting with `A:` is remembered. When a line starting
+/// with `B: ` is met, the remembered line (which must start with `A: `) is
+/// split at its first tab into the Japanese text and the rest; the rest is
+/// split at its first `#` into the English text and an id, the id being `None`
+/// when there is no `#`. Pairs that do not fit this shape are skipped.
+///
+/// A progress message is written to stderr every 10000 lines.
+///
+/// # Errors
+///
+/// Returns an error if the file cannot be opened or a line cannot be read.
+pub fn read_examples(all_kanji: &Charset) -> Result<Vec<Example>> {
+    let reader = io::BufReader::new(fs::File::open("data/examples.utf")?);
+
+    let mut examples = Vec::new();
+    // Last line seen that starts with "A:", kept whole (prefix included).
+    let mut pending = String::new();
+
+    for (lineno, line) in reader.lines().enumerate() {
+        let line = line?;
+        if line.starts_with("A:") {
+            pending = line;
+        } else if let Some(expl) = line.strip_prefix("B: ") {
+            let sentence = pending
+                .strip_prefix("A: ")
+                .and_then(|rest| rest.split_once("\t"));
+            if let Some((ja, rest)) = sentence {
+                let (en, id) = match rest.split_once("#") {
+                    Some((en, id)) => (en, Some(id.to_string())),
+                    None => (rest, None),
+                };
+                examples.push(Example {
+                    ja: ja.to_string(),
+                    en: en.to_string(),
+                    expl: expl.to_string(),
+                    id,
+                    chars: Charset::new(ja).inter(all_kanji),
+                });
+            }
+        }
+        if lineno % 10000 == 0 {
+            eprintln!("read examples: {}/300 (x1000)", lineno / 1000);
+        }
+    }
+
+    Ok(examples)
+}
+
+/// One entry of the JLPT vocabulary lists.
+#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)]
+pub struct JlptVocab {
+    /// Level label such as `"N5"`.
+    pub level: String,
+    /// Charset associated with `kanji`; the parsers in this file compute it as
+    /// `Charset::new(kanji).inter(all_kanji)`.
+    pub chars: Charset,
+    /// Written form of the word (one `/`-separated alternative per entry).
+    pub kanji: String,
+    /// Kana reading as found in the source list; may be empty.
+    pub kana: String,
+    /// English gloss.
+    pub en: String,
+}
+
+/// Formats the entry as five tab-separated fields, in the order
+/// `level`, `chars`, `kanji`, `kana`, `en` -- the same layout that
+/// `load_jlpt_vocab` splits on.
+///
+/// Implementing `Display` (rather than an inherent `to_string`) keeps
+/// `entry.to_string()` working through the standard `ToString` blanket impl
+/// while also allowing the entry to be used directly in `format!`/`println!`.
+impl std::fmt::Display for JlptVocab {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(
+            f,
+            "{}\t{}\t{}\t{}\t{}",
+            self.level,
+            self.chars.to_string(),
+            self.kanji,
+            self.kana,
+            self.en
+        )
+    }
+}
+
+/// Parses the per-level JLPT vocabulary source files and prints every entry
+/// to stdout, one `JlptVocab::to_string` line per entry.
+///
+/// The N5 list is a single combined file; N4 down to N1 are each stored as a
+/// kana file plus an English file. Entries are printed in that order.
+///
+/// # Errors
+///
+/// Returns the first error reported while reading any of the source files.
+pub fn parse_jlpt_vocab(all_kanji: &Charset) -> Result<()> {
+    // (kana file, English file, level label) for each list stored as two files.
+    let split_sources = [
+        ("data/n4_vocab_hiragana.txt", "data/n4_vocab_eng.txt", "N4"),
+        ("data/n3_vocab_hiragana.txt", "data/n3_vocab_eng.txt", "N3"),
+        ("data/n2_vocab_hiragana.txt", "data/n2_vocab_eng.txt", "N2"),
+        ("data/n1_vocab_hiragana.txt", "data/n1_vocab_eng.txt", "N1"),
+    ];
+
+    let mut entries = parse_jlpt_vocab_combined("data/n5_vocab.txt", "N5", all_kanji)?;
+    for &(kana_file, eng_file, level) in split_sources.iter() {
+        entries.extend(parse_jlpt_vocab_split(kana_file, eng_file, level, all_kanji)?);
+    }
+
+    for entry in &entries {
+        println!("{}", entry.to_string());
+    }
+    Ok(())
+}
+
+/// Parses a vocabulary file whose answer column holds both the English gloss
+/// and the kana reading.
+///
+/// Each `(kanji, answer)` pair returned by `jlpt_vocab_read_file` is handled as
+/// follows: the answer is split at its first newline into the English text
+/// (before) and the kana (after, trimmed); without a newline the whole trimmed
+/// answer is the English text and the kana is empty. The kanji column is split
+/// on `/` and one entry is produced per alternative.
+fn parse_jlpt_vocab_combined(
+    file: &str,
+    level: &str,
+    all_kanji: &Charset,
+) -> Result<Vec<JlptVocab>> {
+    let mut entries = Vec::new();
+    for (headword, answer) in jlpt_vocab_read_file(file)? {
+        let (eng, kana) = match answer.split_once('\n') {
+            Some((before, after)) => (before, after.trim()),
+            None => (answer.trim(), ""),
+        };
+        entries.extend(headword.split('/').map(|variant| JlptVocab {
+            level: level.to_string(),
+            chars: Charset::new(variant).inter(all_kanji),
+            kanji: variant.to_string(),
+            kana: kana.to_string(),
+            en: eng.to_string(),
+        }));
+    }
+    Ok(entries)
+}
+
+/// Parses a vocabulary list stored as two files: one mapping each word to its
+/// kana reading and one mapping words to their English gloss.
+///
+/// The English file is loaded into a map first. For every `(kanji, kana)` pair
+/// of the kana file, the gloss is looked up by the kanji column and, failing
+/// that, by the kana column; pairs with no gloss are dropped. The kanji column
+/// is split on `/` and one entry is produced per alternative.
+fn parse_jlpt_vocab_split(
+    kana_file: &str,
+    eng_file: &str,
+    level: &str,
+    all_kanji: &Charset,
+) -> Result<Vec<JlptVocab>> {
+    let glosses: HashMap<String, String> = jlpt_vocab_read_file(eng_file)?.into_iter().collect();
+
+    let mut entries = Vec::new();
+    for (headword, kana) in jlpt_vocab_read_file(kana_file)? {
+        let eng = match glosses.get(&headword).or_else(|| glosses.get(&kana)) {
+            Some(eng) => eng,
+            None => continue,
+        };
+        entries.extend(headword.split('/').map(|variant| JlptVocab {
+            level: level.to_string(),
+            chars: Charset::new(variant).inter(all_kanji),
+            kanji: variant.to_string(),
+            kana: kana.to_string(),
+            en: eng.to_string(),
+        }));
+    }
+    Ok(entries)
+}
+
+/// Reads a `|`-separated vocabulary file into `(left, right)` pairs.
+///
+/// On every line, `<br>` is replaced by a newline character, and `</span>` and
+/// any `<span class="...">` opening tag are removed. The line is then split at
+/// its first `|`; both halves are trimmed. Lines without `|` are skipped.
+///
+/// # Errors
+///
+/// Returns an error if the file cannot be opened or a line cannot be read.
+fn jlpt_vocab_read_file(file: &str) -> Result<Vec<(String, String)>> {
+    let span_open = regex::Regex::new(r#"<span class="\w+">"#)?;
+    let reader = io::BufReader::new(fs::File::open(file)?);
+
+    let mut pairs = Vec::new();
+    for line in reader.lines() {
+        let raw = line?;
+        let without_markup = raw.replace("<br>", "\n").replace("</span>", "");
+        let cleaned = span_open.replace_all(&without_markup, "");
+        if let Some((left, right)) = cleaned.split_once('|') {
+            pairs.push((left.trim().to_string(), right.trim().to_string()));
+        }
+    }
+
+    Ok(pairs)
+}
+
+/// Loads `data/jlpt_vocab.txt`, the tab-separated format written by
+/// `JlptVocab::to_string`.
+///
+/// Each line is split into at most five fields at tab characters, so the last
+/// field (`en`) keeps any further tabs. Lines with fewer than five fields are
+/// skipped.
+///
+/// # Errors
+///
+/// Returns an error if the file cannot be opened or a line cannot be read.
+pub fn load_jlpt_vocab() -> Result<Vec<JlptVocab>> {
+    let reader = io::BufReader::new(fs::File::open("data/jlpt_vocab.txt")?);
+
+    let mut entries = Vec::new();
+    for line in reader.lines() {
+        let line = line?;
+        let mut fields = line.splitn(5, "\t");
+        // All five `next()` calls must succeed, i.e. the line has exactly five fields.
+        if let (Some(level), Some(chars), Some(kanji), Some(kana), Some(en)) = (
+            fields.next(),
+            fields.next(),
+            fields.next(),
+            fields.next(),
+            fields.next(),
+        ) {
+            entries.push(JlptVocab {
+                level: level.to_string(),
+                chars: Charset::new(chars),
+                kanji: kanji.to_string(),
+                kana: kana.to_string(),
+                en: en.to_string(),
+            });
+        }
+    }
+    Ok(entries)
+}