diff options
Diffstat (limited to 'src/main.rs')
-rw-r--r-- | src/main.rs | 664 |
1 files changed, 6 insertions, 658 deletions
diff --git a/src/main.rs b/src/main.rs index 4ec20d0..5d6b7d7 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,15 +1,19 @@ use std::collections::HashMap; use std::fs; -use std::io::{self, BufRead, Write}; +use std::io::{self, Write}; -use anyhow::Result; +//use anyhow::Result; use rand::prelude::*; use rayon::prelude::*; use serde::{Deserialize, Serialize}; use structopt::StructOpt; mod charset; +mod datafiles; +mod format; use charset::Charset; +use datafiles::*; +use format::*; #[derive(Debug, StructOpt)] #[structopt(name = "datagengo", about = "Japanese example practice maker")] @@ -195,325 +199,12 @@ fn main() { } // ===================================================================== -// PARSING DATA FILES -// ===================================================================== - -type DictIndex<'a> = HashMap<&'a str, Vec<roxmltree::Node<'a, 'a>>>; -fn index_jmdict<'a>(dict: &'a roxmltree::Document) -> DictIndex<'a> { - let dict = dict - .root() - .children() - .find(|x| x.has_tag_name("JMdict")) - .unwrap(); - - let mut ret: DictIndex<'a> = HashMap::new(); - for x in dict.children().filter(|x| x.has_tag_name("entry")) { - for r in x.children().filter(|x| x.has_tag_name("k_ele")) { - if let Some(keb) = r.children().find(|x| x.has_tag_name("keb")) { - let txt = keb.text().unwrap().trim(); - ret.entry(txt).or_default().push(x); - } - } - } - - ret -} - -fn parse_kanjidic() -> Result<Vec<(String, Charset)>> { - let n3_kanji = Charset::new(&fs::read_to_string("data/n3_kanji.txt")?.trim()); - - let file = fs::read_to_string("data/kanjidic2.xml")?; - let xml = roxmltree::Document::parse(&file)?; - let kanjidic = xml.root().first_child().unwrap(); - assert!(kanjidic.has_tag_name("kanjidic2")); - - let mut levels = HashMap::new(); - - for x in kanjidic.children() { - if !x.has_tag_name("character") { - continue; - } - let mut literal = None; - let mut jlpt = None; - let mut grade = None; - for y in x.children() { - if y.has_tag_name("literal") { - literal = y.text(); - } - if y.has_tag_name("misc") { - for z in y.children() { - if z.has_tag_name("jlpt") { - jlpt = z.text().and_then(|x| str::parse::<i32>(x).ok()); - } - if z.has_tag_name("grade") { - grade = z.text().and_then(|x| str::parse::<i32>(x).ok()); - } - } - } - } - match grade { - Some(i) if i <= 6 => grade = Some(7), - _ => (), - } - if let Some(lit) = literal { - assert_eq!(lit.chars().count(), 1); - let jlpt = match jlpt { - Some(4) => Some(5), - Some(3) => Some(4), - Some(2) if n3_kanji.contains(lit.chars().next().unwrap()) => Some(3), - x => x, - }; - levels - .entry((jlpt, grade)) - .or_insert(String::new()) - .extend(lit.chars()); - } - } - - let mut levels = levels.into_iter().collect::<Vec<_>>(); - levels.sort_by_key(|((j, g), _)| match (j, g) { - (Some(j), Some(g)) => (10 - *j) * 20 + *g, - (Some(j), None) => (10 - *j) * 20 + 15, - (None, Some(g)) => 1000 + *g, - (None, None) => 1015, - }); - - let mut ret = Vec::new(); - let mut pc = Charset::default(); - for ((j, g), chars) in levels.into_iter() { - let name = match (j, g) { - (Some(j), Some(7)) => format!("N{}a", j), - (Some(j), Some(8)) => format!("N{}b", j), - (Some(j), Some(g)) => format!("N{}-{}", j, g), - (Some(j), None) => format!("N{}+", j), - (None, Some(7)) => format!("N0a"), - (None, Some(8)) => format!("N0b"), - (None, Some(g)) => format!("N0-{}", g), - (None, None) => format!("N0+"), - }; - let chars = Charset::new(chars).diff(&pc); - pc = pc.union(&chars); - ret.push((name, chars)); - } - - Ok(ret) -} - -fn read_kanji_levels() -> Result<Vec<(String, String)>> { - Ok(fs::read_to_string("data/kanji_levels.txt")? - .lines() - .filter_map(|l| l.split_once(": ")) - .map(|(l, k)| (l.to_string(), k.to_string())) - .collect::<Vec<_>>()) -} - -fn read_examples(all_kanji: &Charset) -> Result<Vec<Example>> { - let file = fs::File::open("data/examples.utf")?; - - let mut ret = Vec::new(); - let mut a = "".to_string(); - - for (i, line) in io::BufReader::new(file).lines().enumerate() { - let line = line?; - if line.starts_with("A:") { - a = line; - } else if line.starts_with("B:") { - let s = a.strip_prefix("A: "); - let t = line.strip_prefix("B: "); - if let (Some(a), Some(b)) = (s, t) { - if let Some((ja, eng)) = a.split_once("\t") { - if let Some((eng, id)) = eng.split_once("#") { - ret.push(Example { - ja: ja.to_string(), - en: eng.to_string(), - expl: b.to_string(), - id: Some(id.to_string()), - chars: Charset::new(ja).inter(all_kanji), - }); - } else { - ret.push(Example { - ja: ja.to_string(), - en: eng.to_string(), - expl: b.to_string(), - id: None, - chars: Charset::new(ja).inter(all_kanji), - }); - } - } - } - } - if i % 10000 == 0 { - eprintln!("read examples: {}/300 (x1000)", i / 1000); - } - } - - Ok(ret) -} - -#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)] -struct JlptVocab { - level: String, - chars: Charset, - kanji: String, - kana: String, - en: String, -} - -impl JlptVocab { - fn to_string(&self) -> String { - format!( - "{}\t{}\t{}\t{}\t{}", - self.level, - self.chars.to_string(), - self.kanji, - self.kana, - self.en - ) - } -} - -fn parse_jlpt_vocab(all_kanji: &Charset) -> Result<()> { - let mut vocab = vec![]; - vocab.extend(parse_jlpt_vocab_combined( - "data/n5_vocab.txt", - "N5", - all_kanji, - )?); - vocab.extend(parse_jlpt_vocab_split( - "data/n4_vocab_hiragana.txt", - "data/n4_vocab_eng.txt", - "N4", - all_kanji, - )?); - vocab.extend(parse_jlpt_vocab_split( - "data/n3_vocab_hiragana.txt", - "data/n3_vocab_eng.txt", - "N3", - all_kanji, - )?); - vocab.extend(parse_jlpt_vocab_split( - "data/n2_vocab_hiragana.txt", - "data/n2_vocab_eng.txt", - "N2", - all_kanji, - )?); - vocab.extend(parse_jlpt_vocab_split( - "data/n1_vocab_hiragana.txt", - "data/n1_vocab_eng.txt", - "N1", - all_kanji, - )?); - for v in vocab.iter() { - println!("{}", v.to_string()); - } - Ok(()) -} - -fn parse_jlpt_vocab_combined( - file: &str, - level: &str, - all_kanji: &Charset, -) -> Result<Vec<JlptVocab>> { - let lines = jlpt_vocab_read_file(file)?; - let mut ret = vec![]; - for (kanji, answer) in lines { - let (eng, kana) = match answer.split_once('\n') { - Some((a, b)) => (a, b.trim()), - None => (answer.trim(), ""), - }; - for kanji in kanji.split('/') { - ret.push(JlptVocab { - level: level.to_string(), - chars: Charset::new(kanji).inter(all_kanji), - kanji: kanji.to_string(), - kana: kana.to_string(), - en: eng.to_string(), - }); - } - } - Ok(ret) -} - -fn parse_jlpt_vocab_split( - kana_file: &str, - eng_file: &str, - level: &str, - all_kanji: &Charset, -) -> Result<Vec<JlptVocab>> { - let eng_lines = jlpt_vocab_read_file(eng_file)? - .into_iter() - .collect::<HashMap<String, String>>(); - - let lines = jlpt_vocab_read_file(kana_file)?; - let mut ret = vec![]; - for (kanji, kana) in lines { - let eng = eng_lines.get(&kanji).or(eng_lines.get(&kana)); - if let Some(eng) = eng { - for kanji in kanji.split('/') { - ret.push(JlptVocab { - level: level.to_string(), - chars: Charset::new(kanji).inter(all_kanji), - kanji: kanji.to_string(), - kana: kana.to_string(), - en: eng.to_string(), - }); - } - } - } - Ok(ret) -} - -fn jlpt_vocab_read_file(file: &str) -> Result<Vec<(String, String)>> { - let re = regex::Regex::new(r#"<span class="\w+">"#)?; - - let file = fs::File::open(file)?; - let mut ret = vec![]; - for line in io::BufReader::new(file).lines() { - let line = line?.replace("<br>", "\n").replace("</span>", ""); - let line = re.replace_all(&line, ""); - if let Some((a, b)) = line.split_once('|') { - ret.push((a.trim().to_string(), b.trim().to_string())); - } - } - - Ok(ret) -} - -fn load_jlpt_vocab() -> Result<Vec<JlptVocab>> { - let file = fs::File::open("data/jlpt_vocab.txt")?; - let mut ret = vec![]; - for line in io::BufReader::new(file).lines() { - let line = line?; - let line = line.splitn(5, "\t").collect::<Vec<_>>(); - if line.len() == 5 { - ret.push(JlptVocab { - level: line[0].to_string(), - chars: Charset::new(line[1]), - kanji: line[2].to_string(), - kana: line[3].to_string(), - en: line[4].to_string(), - }); - } - } - Ok(ret) -} - -// ===================================================================== // BATCH STRUCTURES AND GENERATION // ===================================================================== const CHARS_PER_BATCH: usize = 20; const MAX_NEW_CHARS_PER_EX: usize = 5; -#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] -struct Example { - ja: String, - en: String, - expl: String, - id: Option<String>, - chars: Charset, -} - #[derive(Debug, Clone, Serialize, Deserialize, Default, PartialEq)] struct Batch { level: String, @@ -1109,346 +800,3 @@ fn add_extra_examples(all_batches: &mut [Batch], examples: &[Example]) { ); } } - -// ===================================================================== -// FORMATTING TO HTML -// ===================================================================== - -fn format_batch<'a>(dict_idx: &DictIndex<'a>, count: usize, (i, batch): (usize, &Batch)) { - format_batch_aux(dict_idx, count, i, batch).expect("format batch"); -} - -fn format_batch_aux<'a>( - dict_idx: &DictIndex<'a>, - count: usize, - i: usize, - batch: &Batch, -) -> Result<()> { - let mut f = io::BufWriter::new(fs::File::create(format!("public/{:03}.html", i))?); - write!( - f, - r#"<!DOCTYPE html> - <html> - <head> - <meta charset=\"UTF-8\" /> - <title>Batch #{:03}</title> - <link rel="stylesheet" type="text/css" href="style.css" /> - </head> - <body><div class="batch_page">"#, - i - )?; - - writeln!(f, r#"<p><a href="index.html">index</a>"#)?; - for j in 0..count { - if j != i { - writeln!(f, r#" <a href="{:03}.html">{:03}</a>"#, j, j)?; - } else { - writeln!(f, " {:03}", j)?; - } - } - writeln!(f, r#"</p>"#)?; - writeln!(f, "<p>Level: {}</p>", batch.level)?; - - write!(f, r#"<p class="ja">"#)?; - let mut ex_prev = Charset::default(); - for ex in batch.examples.iter() { - let ex_chars = ex.chars.inter(&batch.chars); - for c in ex_chars.diff(&ex_prev).chars().iter() { - write!( - f, - r#"<a href="https://jisho.org/search/{}%20%23kanji">{}</a>"#, - c, c - )?; - } - ex_prev = ex_prev.union(&ex_chars); - } - writeln!(f, r#"</p>"#)?; - - for ex in batch.examples.iter() { - writeln!(f, "<hr />")?; - write!(f, r#"<p class="ja">"#)?; - for c in ex.ja.chars() { - if batch.chars.contains(c) { - write!(f, r#"<span class="char_cur">{}</span>"#, c)?; - } else if batch.chars_p1.contains(c) { - write!(f, r#"<span class="char_p1">{}</span>"#, c)?; - } else if batch.chars_p2.contains(c) { - write!(f, r#"<span class="char_p2">{}</span>"#, c)?; - } else if batch.chars_bad.contains(c) { - write!(f, r#"<span class="char_bad">{}</span>"#, c)?; - } else { - write!(f, "{}", c)?; - } - } - writeln!(f, "</p>")?; - writeln!(f, r#"<p class="en">{}</p>"#, ex.en)?; - - writeln!(f, r#"<details><summary>Explanation</summary>"#)?; - let mut expl_batch = Vec::new(); - let mut expl_all = Vec::new(); - for word in ex.expl.split(|c| c == ' ' || c == '~') { - let (keb, reb) = expl_clean_word(word); - let wchars = Charset::new(keb); - if !wchars.intersects(&ex.chars) { - continue; - } - if let Some(ents) = dict_idx.get(keb) { - for ent in ents.iter() { - if let Some(s) = dict_str(keb, reb, ent) { - if wchars.intersects(&batch.chars) { - expl_batch.push(s); - } else { - expl_all.push(s); - } - } - } - } - } - for be in expl_batch { - writeln!(f, r#"<p>{}</p>"#, be)?; - } - writeln!(f, r#"<p class="chars">"#)?; - for c in ex.chars.inter(&batch.chars).chars().iter() { - writeln!( - f, - r#"<a href="https://jisho.org/search/{}%20%23kanji">{}</a>"#, - c, c - )?; - } - writeln!(f, r#"</p>"#)?; - for be in expl_all { - writeln!(f, r#"<p>{}</p>"#, be)?; - } - writeln!(f, r#"</details>"#)?; - } - - writeln!(f, "<hr />")?; - format_vocab( - &mut f, - &batch - .extra_vocab - .iter() - .filter(|v| batch.level.contains(&v.level)) - .collect::<Vec<_>>(), - "Extra vocabulary (this level)", - )?; - format_vocab( - &mut f, - &batch - .extra_vocab - .iter() - .filter(|v| !batch.level.contains(&v.level)) - .collect::<Vec<_>>(), - "Extra vocabulary (previous levels)", - )?; - - writeln!( - f, - r#"<details><summary>Extra examples (reading practice)</summary><table class="extratable">"# - )?; - for ex in batch.extra_examples.iter() { - let mut expl1 = Vec::new(); - let mut expl2 = Vec::new(); - for word in ex.expl.split(|c| c == ' ' || c == '~') { - let (keb, reb) = expl_clean_word(word); - let wchars = Charset::new(keb); - if !wchars.intersects(&ex.chars) { - continue; - } - if let Some(ents) = dict_idx.get(keb) { - for ent in ents.iter() { - if let Some(s) = dict_str_short(keb, reb, ent) { - if wchars.intersects(&batch.chars) { - expl1.push(s); - } else { - expl2.push(s); - } - } - } - } - } - expl1.extend(expl2.into_iter()); - let expl = expl1.join("<br />"); - writeln!( - f, - r#"<tr><td><details><summary class="tab_large2 font_ja"> {} </summary><div style="text-align: center">{}<br />{}</div></details></td></tr>"#, - ex.ja, ex.en, expl - )?; - } - writeln!(f, r#"</table></details>"#)?; - - writeln!(f, "<hr />")?; - writeln!(f, "<p>\(≧▽≦)/</p>")?; - - write!(f, "<div></body></html>")?; - f.flush()?; - Ok(()) -} - -fn format_vocab(f: &mut impl Write, vocab: &[&JlptVocab], t: &str) -> Result<()> { - if !vocab.is_empty() { - writeln!( - f, - r#"<details><summary>{}</summary><table class="vocabtable">"#, - t - )?; - for v in vocab { - writeln!( - f, - r#"<tr><td>{}</td><td> <span class="tab_large font_ja">{}</span> </td><td>{}</td><td class="font_ja">{}</td></tr>"#, - v.level, v.kanji, v.en, v.kana - )?; - } - writeln!(f, "</table></details>")?; - } - Ok(()) -} - -fn expl_clean_word(w: &str) -> (&str, Option<&str>) { - let mut ret = w; - for delim in ['(', '{', '['] { - if let Some((s, _)) = ret.split_once(delim) { - ret = s; - } - } - let p = w - .split_once('(') - .and_then(|(_, r)| r.split_once(')')) - .map(|(p, _)| p); - (ret, p) -} - -fn dict_str_short<'a>( - qkeb: &str, - qreb: Option<&str>, - ent: &roxmltree::Node<'a, 'a>, -) -> Option<String> { - let r_ele = ent.children().find(|x| x.has_tag_name("r_ele")).unwrap(); - let reb = r_ele.children().find(|x| x.has_tag_name("reb")).unwrap(); - let reb = reb.text().unwrap().trim(); - - if qreb.map(|x| x != reb).unwrap_or(false) { - return None; - } - - Some(format!( - r#"<span class="font_ja">{} 【{}】</span>"#, - qkeb, reb - )) -} - -fn dict_str<'a>(qkeb: &str, qreb: Option<&str>, ent: &roxmltree::Node<'a, 'a>) -> Option<String> { - let mut ret = dict_str_short(qkeb, qreb, ent)?; - - for sense in ent.children().filter(|x| x.has_tag_name("sense")) { - if let Some(s) = sense.children().find(|x| x.has_tag_name("gloss")) { - ret.extend(format!(" {};", s.text().unwrap().trim()).chars()); - } - } - - if ret.chars().rev().next() == Some(';') { - ret.pop(); - } - Some(ret) -} - -fn format_index(batches: &[Batch], kanji_levels: &[(String, String)]) -> Result<()> { - let mut f = io::BufWriter::new(fs::File::create("public/index.html")?); - write!( - f, - r#"<!DOCTYPE html> - <html> - <head> - <meta charset=\"UTF-8\" /> - <title>List of batches</title> - <link rel="stylesheet" type="text/css" href="style.css" /> - </head> - <body><div class="index_page">"# - )?; - - writeln!(f, r#"<p><a href="about.html">About / How-to</a></p><hr />"#)?; - - writeln!(f, "<table>")?; - writeln!(f, "<tr><th>Num</th><th>Level</th><th>Kanji</th><th>Examples</th><th>Lesson-1</th><th>Lesson-2</th><th>Ignore</th></tr>")?; - for (i, batch) in batches.iter().enumerate() { - writeln!( - f, - r#"<tr><td><a href="{:03}.html">{:03}</a></td><td>{}</td><td class="font_ja">{}</td><td> {}</td><td class="font_ja">{}</td><td class="font_ja">{}</td><td class="font_ja">{}</td></tr>"#, - i, - i, - batch.level, - batch.chars.to_string(), - batch.examples.len(), - batch.chars_p1.to_string(), - batch.chars_p2.to_string(), - batch.chars_bad.to_string() - )?; - } - writeln!(f, r#"</table>"#)?; - - writeln!(f, "<hr />")?; - - let all_chars = Charset::from_iter( - batches - .iter() - .map(|x| x.chars.chars().iter().copied()) - .flatten(), - ); - writeln!(f, "<table>")?; - writeln!( - f, - r#"<tr><th>Level</th><th>Count</th><th width="60%">Kanji</th><th>Missing kanji</th></tr>"# - )?; - for (lvl, chars) in kanji_levels.iter() { - if lvl == "N0+" || lvl.ends_with("-10") { - continue; - } - let chars = Charset::new(chars); - let missing = chars.diff(&all_chars); - writeln!( - f, - r#"<tr><td>{}</td><td>{}</td><td class="font_ja">{}</td><td><span class="font_ja">{}</span> ({})</td></tr>"#, - lvl, - chars.len(), - chars.to_string(), - missing.to_string(), - missing.len() - )?; - } - writeln!(f, "</table>")?; - - write!(f, "</div></body></html>")?; - f.flush()?; - Ok(()) -} - -fn format_about() -> Result<()> { - let mut f = io::BufWriter::new(fs::File::create("public/about.html")?); - write!( - f, - r#"<!DOCTYPE html> - <html> - <head> - <meta charset=\"UTF-8\" /> - <title>Datagengo README</title> - <link rel="stylesheet" type="text/css" href="style.css" /> - </head> - <body>"# - )?; - - writeln!(f, r#"<div class="about_page">"#)?; - writeln!( - f, - r#"<p><a href="index.html">Back to lessons</a></p><hr />"# - )?; - - writeln!( - f, - "{}", - markdown::to_html(&fs::read_to_string("README.md")?) - )?; - - writeln!(f, r#"</div></body></html>"#)?; - - Ok(()) -} |