use std::collections::HashMap;
use std::fmt;
use std::fs;
use std::io::{self, BufRead};
use anyhow::Result;
use serde::{Deserialize, Serialize};
use crate::charset::Charset;
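/// A Japanese/English example sentence pair, parsed from the
/// Tanaka-corpus-style file `data/examples.utf` by [`read_examples`].
/// `expl` holds the raw `B:` annotation line; `chars` is the set of
/// tracked kanji appearing in the Japanese sentence.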
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub struct Example {
pub ja: String,
pub en: String,
pub expl: String,
#[serde(default)]
pub furigana: Option<String>,
pub id: Option<String>,
pub chars: Charset,
}
// =====================================================================
// PARSING DATA FILES
// =====================================================================
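/// A single `<entry>` element of the JMdict XML document.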
pub type DictEntry<'a> = roxmltree::Node<'a, 'a>;
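/// Maps each kanji spelling (`<keb>` text) to the JMdict entries that use it.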
pub type DictIndex<'a> = HashMap<&'a str, Vec<DictEntry<'a>>>;
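/// Builds a [`DictIndex`] over a parsed JMdict document, keyed by the text of
/// each entry's `<keb>` (kanji spelling) elements.
///
/// Panics if the document has no `JMdict` root element.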
pub fn index_jmdict<'a>(dict: &'a roxmltree::Document) -> DictIndex<'a> {
let dict = dict
.root()
.children()
.find(|x| x.has_tag_name("JMdict"))
.unwrap();
let mut ret: DictIndex<'a> = HashMap::new();
for x in dict.children().filter(|x| x.has_tag_name("entry")) {
for r in x.children().filter(|x| x.has_tag_name("k_ele")) {
            if let Some(keb) = r.children().find(|x| x.has_tag_name("keb")) {
                // A <keb> element always carries text in well-formed JMdict;
                // skip it defensively instead of panicking if it does not.
                if let Some(txt) = keb.text() {
                    ret.entry(txt.trim()).or_default().push(x);
                }
            }
}
}
ret
}
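/// Groups every kanji in `data/kanjidic2.xml` into ordered difficulty levels,
/// keyed by JLPT level (remapped to the post-2010 five-level scale) and a
/// collapsed school grade. Returns `(level name, characters)` pairs in
/// increasing difficulty, with each kanji appearing exactly once.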
pub fn parse_kanjidic() -> Result<Vec<(String, Charset)>> {
let n3_kanji = Charset::new(&fs::read_to_string("data/n3_kanji.txt")?.trim());
let file = fs::read_to_string("data/kanjidic2.xml")?;
let xml = roxmltree::Document::parse(&file)?;
    let kanjidic = xml
        .root()
        .children()
        .find(|x| x.has_tag_name("kanjidic2"))
        .unwrap();
let mut levels = HashMap::new();
for x in kanjidic.children() {
if !x.has_tag_name("character") {
continue;
}
let mut literal = None;
let mut jlpt = None;
let mut grade = None;
for y in x.children() {
if y.has_tag_name("literal") {
literal = y.text();
}
if y.has_tag_name("misc") {
for z in y.children() {
if z.has_tag_name("jlpt") {
jlpt = z.text().and_then(|x| str::parse::<i32>(x).ok());
}
if z.has_tag_name("grade") {
grade = z.text().and_then(|x| str::parse::<i32>(x).ok());
}
}
}
}
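        // Collapse school grades 1-6 (kyouiku kanji) into a single bucket, 7.
        // Kanjidic2 uses grade 8 for the remaining jouyou kanji and 9-10 for
        // jinmeiyou kanji, so 7 is otherwise unused.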
match grade {
Some(i) if i <= 6 => grade = Some(7),
_ => (),
}
if let Some(lit) = literal {
assert_eq!(lit.chars().count(), 1);
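            // Kanjidic2 still reports the pre-2010 four-level JLPT scale.
            // Remap it to the current five-level scale: old 4 -> N5,
            // old 3 -> N4, and old 2 split into N3/N2 via the n3_kanji list.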
let jlpt = match jlpt {
Some(4) => Some(5),
Some(3) => Some(4),
Some(2) if n3_kanji.contains(lit.chars().next().unwrap()) => Some(3),
x => x,
};
levels
.entry((jlpt, grade))
.or_insert(String::new())
.extend(lit.chars());
}
}
let mut levels = levels.into_iter().collect::<Vec<_>>();
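    // Sort easiest first: by JLPT level descending (N5 before N1), then by
    // grade; kanji with no JLPT level sort after all leveled ones.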
levels.sort_by_key(|((j, g), _)| match (j, g) {
(Some(j), Some(g)) => (10 - *j) * 20 + *g,
(Some(j), None) => (10 - *j) * 20 + 15,
(None, Some(g)) => 1000 + *g,
(None, None) => 1015,
});
let mut ret = Vec::new();
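    // Characters already emitted for an earlier level; used to keep the
    // per-level charsets disjoint.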
let mut pc = Charset::default();
for ((j, g), chars) in levels.into_iter() {
        let name = match (j, g) {
            (Some(j), Some(7)) => format!("N{}a", j),
            (Some(j), Some(8)) => format!("N{}b", j),
            (Some(j), Some(g)) => format!("N{}-{}", j, g),
            (Some(j), None) => format!("N{}+", j),
            (None, Some(7)) => "N0a".to_string(),
            (None, Some(8)) => "N0b".to_string(),
            (None, Some(g)) => format!("N0-{}", g),
            (None, None) => "N0+".to_string(),
        };
let chars = Charset::new(chars).diff(&pc);
pc = pc.union(&chars);
ret.push((name, chars));
}
Ok(ret)
}
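/// Reads the precomputed kanji level list from `data/kanji_levels.txt`,
/// where each line has the form `level: kanji`.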
pub fn read_kanji_levels() -> Result<Vec<(String, String)>> {
Ok(fs::read_to_string("data/kanji_levels.txt")?
.lines()
.filter_map(|l| l.split_once(": "))
.map(|(l, k)| (l.to_string(), k.to_string()))
.collect::<Vec<_>>())
}
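/// Reads example sentences from `data/examples.utf`. Each example is an
/// `A:` line of the form `japanese<TAB>english[#id]`, followed by a `B:`
/// line carrying a word-level annotation of the sentence.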
pub fn read_examples(all_kanji: &Charset) -> Result<Vec<Example>> {
let file = fs::File::open("data/examples.utf")?;
let mut ret = Vec::new();
    let mut a = String::new();
for (i, line) in io::BufReader::new(file).lines().enumerate() {
let line = line?;
if line.starts_with("A:") {
a = line;
} else if line.starts_with("B:") {
            let s = a.strip_prefix("A: ");
            let t = line.strip_prefix("B: ");
            if let (Some(a), Some(b)) = (s, t) {
                if let Some((ja, eng)) = a.split_once('\t') {
                    // The English half may end in "#<id>", a sentence identifier.
                    let (eng, id) = match eng.split_once('#') {
                        Some((eng, id)) => (eng, Some(id.to_string())),
                        None => (eng, None),
                    };
                    ret.push(Example {
                        ja: ja.to_string(),
                        en: eng.to_string(),
                        expl: b.to_string(),
                        id,
                        chars: Charset::new(ja).inter(all_kanji),
                        furigana: None,
                    });
                }
            }
}
}
        if i % 10000 == 0 {
            // Rough progress log; the file is on the order of 300k lines.
            eprintln!("read examples: {}k/~300k lines", i / 1000);
        }
}
Ok(ret)
}
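/// Loads manual furigana overrides from `data/furigana_overrides`, mapping
/// each plain sentence to its annotated form.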
pub fn read_furigana_overrides() -> Result<HashMap<String, String>> {
let file = fs::File::open("data/furigana_overrides")?;
let mut ret = HashMap::new();
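    // Override lines annotate readings as `[[kanji||reading]]`; stripping the
    // `||reading]]` suffix and the `[[` prefix recovers the plain sentence,
    // which serves as the lookup key.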
let re = regex::Regex::new(r#"\|\|\w+\]\]"#)?;
for line in io::BufReader::new(file).lines() {
let line = line?;
let line = line.trim();
if !line.is_empty() {
let clean = re.replace_all(line, "").replace("[[", "");
if clean != line {
ret.insert(clean, line.to_string());
}
}
}
Ok(ret)
}
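/// A JLPT vocabulary entry: a kanji spelling with its kana reading, English
/// gloss, and the set of tracked kanji it contains.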
#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)]
pub struct JlptVocab {
pub level: String,
pub chars: Charset,
pub kanji: String,
pub kana: String,
pub en: String,
}
impl fmt::Display for JlptVocab {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(
            f,
            "{}\t{}\t{}\t{}\t{}",
            self.level,
            self.chars.to_string(),
            self.kanji,
            self.kana,
            self.en
        )
    }
}
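/// Parses the per-level JLPT vocabulary source files and prints the merged
/// list to stdout, one tab-separated entry per line in the format read back
/// by [`load_jlpt_vocab`].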
pub fn parse_jlpt_vocab(all_kanji: &Charset) -> Result<()> {
let mut vocab = vec![];
vocab.extend(parse_jlpt_vocab_combined(
"data/n5_vocab.txt",
"N5",
all_kanji,
)?);
vocab.extend(parse_jlpt_vocab_split(
"data/n4_vocab_hiragana.txt",
"data/n4_vocab_eng.txt",
"N4",
all_kanji,
)?);
vocab.extend(parse_jlpt_vocab_split(
"data/n3_vocab_hiragana.txt",
"data/n3_vocab_eng.txt",
"N3",
all_kanji,
)?);
vocab.extend(parse_jlpt_vocab_split(
"data/n2_vocab_hiragana.txt",
"data/n2_vocab_eng.txt",
"N2",
all_kanji,
)?);
vocab.extend(parse_jlpt_vocab_split(
"data/n1_vocab_hiragana.txt",
"data/n1_vocab_eng.txt",
"N1",
all_kanji,
)?);
    for v in vocab.iter() {
        println!("{}", v);
    }
Ok(())
}
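/// Parses a vocabulary file whose answer cell combines the English gloss and
/// the kana reading, separated by a line break.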
fn parse_jlpt_vocab_combined(
file: &str,
level: &str,
all_kanji: &Charset,
) -> Result<Vec<JlptVocab>> {
let lines = jlpt_vocab_read_file(file)?;
let mut ret = vec![];
for (kanji, answer) in lines {
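        // `jlpt_vocab_read_file` turns `<br>` into '\n', so the answer cell
        // is either `english` alone or `english\nkana`.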
let (eng, kana) = match answer.split_once('\n') {
Some((a, b)) => (a, b.trim()),
None => (answer.trim(), ""),
};
for kanji in kanji.split('/') {
ret.push(JlptVocab {
level: level.to_string(),
chars: Charset::new(kanji).inter(all_kanji),
kanji: kanji.to_string(),
kana: kana.to_string(),
en: eng.to_string(),
});
}
}
Ok(ret)
}
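/// Parses a level split across two files: one mapping kanji to kana readings,
/// the other mapping kanji (or kana, for kana-only words) to English glosses.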
fn parse_jlpt_vocab_split(
kana_file: &str,
eng_file: &str,
level: &str,
all_kanji: &Charset,
) -> Result<Vec<JlptVocab>> {
let eng_lines = jlpt_vocab_read_file(eng_file)?
.into_iter()
.collect::<HashMap<String, String>>();
let lines = jlpt_vocab_read_file(kana_file)?;
let mut ret = vec![];
for (kanji, kana) in lines {
        // Some source lists key the English glosses by kana rather than kanji.
        let eng = eng_lines.get(&kanji).or_else(|| eng_lines.get(&kana));
if let Some(eng) = eng {
for kanji in kanji.split('/') {
ret.push(JlptVocab {
level: level.to_string(),
chars: Charset::new(kanji).inter(all_kanji),
kanji: kanji.to_string(),
kana: kana.to_string(),
en: eng.to_string(),
});
}
}
}
Ok(ret)
}
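/// Reads `key|value` lines from a vocabulary source file, dropping HTML
/// markup (`<span>` tags are stripped, `<br>` becomes a newline).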
fn jlpt_vocab_read_file(file: &str) -> Result<Vec<(String, String)>> {
let re = regex::Regex::new(r#"<span class="\w+">"#)?;
let file = fs::File::open(file)?;
let mut ret = vec![];
for line in io::BufReader::new(file).lines() {
let line = line?.replace("<br>", "\n").replace("</span>", "");
let line = re.replace_all(&line, "");
if let Some((a, b)) = line.split_once('|') {
ret.push((a.trim().to_string(), b.trim().to_string()));
}
}
Ok(ret)
}
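/// Loads the tab-separated vocabulary list (presumably the output of
/// [`parse_jlpt_vocab`]) from `data/jlpt_vocab.txt`.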
pub fn load_jlpt_vocab() -> Result<Vec<JlptVocab>> {
let file = fs::File::open("data/jlpt_vocab.txt")?;
let mut ret = vec![];
for line in io::BufReader::new(file).lines() {
let line = line?;
        let line = line.splitn(5, '\t').collect::<Vec<_>>();
if line.len() == 5 {
ret.push(JlptVocab {
level: line[0].to_string(),
chars: Charset::new(line[1]),
kanji: line[2].to_string(),
kana: line[3].to_string(),
en: line[4].to_string(),
});
}
}
Ok(ret)
}