diff options
author | Alex Auvolat <alex@adnab.me> | 2023-11-27 17:26:59 +0100 |
---|---|---|
committer | Alex Auvolat <alex@adnab.me> | 2023-11-27 17:26:59 +0100 |
commit | d2a46c25219c21ac4f128da8512302935654d38e (patch) | |
tree | a6d66ac4639e4d68fe57f9e8da72b08ecfb14d9f /src | |
parent | b15723f33b486124a50408873d30998bb9d31b3b (diff) | |
download | datagengo-d2a46c25219c21ac4f128da8512302935654d38e.tar.gz datagengo-d2a46c25219c21ac4f128da8512302935654d38e.zip |
split code into several files
Diffstat (limited to 'src')
-rw-r--r-- | src/datafiles.rs | 321 | ||||
-rw-r--r-- | src/format.rs | 349 | ||||
-rw-r--r-- | src/main.rs | 664 |
3 files changed, 676 insertions, 658 deletions
diff --git a/src/datafiles.rs b/src/datafiles.rs new file mode 100644 index 0000000..629badf --- /dev/null +++ b/src/datafiles.rs @@ -0,0 +1,321 @@ +use std::collections::HashMap; +use std::fs; +use std::io::{self, BufRead}; + +use anyhow::Result; +use serde::{Deserialize, Serialize}; + +use crate::charset::Charset; + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +pub struct Example { + pub ja: String, + pub en: String, + pub expl: String, + pub id: Option<String>, + pub chars: Charset, +} + +// ===================================================================== +// PARSING DATA FILES +// ===================================================================== + +pub type DictIndex<'a> = HashMap<&'a str, Vec<roxmltree::Node<'a, 'a>>>; +pub fn index_jmdict<'a>(dict: &'a roxmltree::Document) -> DictIndex<'a> { + let dict = dict + .root() + .children() + .find(|x| x.has_tag_name("JMdict")) + .unwrap(); + + let mut ret: DictIndex<'a> = HashMap::new(); + for x in dict.children().filter(|x| x.has_tag_name("entry")) { + for r in x.children().filter(|x| x.has_tag_name("k_ele")) { + if let Some(keb) = r.children().find(|x| x.has_tag_name("keb")) { + let txt = keb.text().unwrap().trim(); + ret.entry(txt).or_default().push(x); + } + } + } + + ret +} + +pub fn parse_kanjidic() -> Result<Vec<(String, Charset)>> { + let n3_kanji = Charset::new(&fs::read_to_string("data/n3_kanji.txt")?.trim()); + + let file = fs::read_to_string("data/kanjidic2.xml")?; + let xml = roxmltree::Document::parse(&file)?; + let kanjidic = xml.root().first_child().unwrap(); + assert!(kanjidic.has_tag_name("kanjidic2")); + + let mut levels = HashMap::new(); + + for x in kanjidic.children() { + if !x.has_tag_name("character") { + continue; + } + let mut literal = None; + let mut jlpt = None; + let mut grade = None; + for y in x.children() { + if y.has_tag_name("literal") { + literal = y.text(); + } + if y.has_tag_name("misc") { + for z in y.children() { + if z.has_tag_name("jlpt") { + jlpt = z.text().and_then(|x| str::parse::<i32>(x).ok()); + } + if z.has_tag_name("grade") { + grade = z.text().and_then(|x| str::parse::<i32>(x).ok()); + } + } + } + } + match grade { + Some(i) if i <= 6 => grade = Some(7), + _ => (), + } + if let Some(lit) = literal { + assert_eq!(lit.chars().count(), 1); + let jlpt = match jlpt { + Some(4) => Some(5), + Some(3) => Some(4), + Some(2) if n3_kanji.contains(lit.chars().next().unwrap()) => Some(3), + x => x, + }; + levels + .entry((jlpt, grade)) + .or_insert(String::new()) + .extend(lit.chars()); + } + } + + let mut levels = levels.into_iter().collect::<Vec<_>>(); + levels.sort_by_key(|((j, g), _)| match (j, g) { + (Some(j), Some(g)) => (10 - *j) * 20 + *g, + (Some(j), None) => (10 - *j) * 20 + 15, + (None, Some(g)) => 1000 + *g, + (None, None) => 1015, + }); + + let mut ret = Vec::new(); + let mut pc = Charset::default(); + for ((j, g), chars) in levels.into_iter() { + let name = match (j, g) { + (Some(j), Some(7)) => format!("N{}a", j), + (Some(j), Some(8)) => format!("N{}b", j), + (Some(j), Some(g)) => format!("N{}-{}", j, g), + (Some(j), None) => format!("N{}+", j), + (None, Some(7)) => format!("N0a"), + (None, Some(8)) => format!("N0b"), + (None, Some(g)) => format!("N0-{}", g), + (None, None) => format!("N0+"), + }; + let chars = Charset::new(chars).diff(&pc); + pc = pc.union(&chars); + ret.push((name, chars)); + } + + Ok(ret) +} + +pub fn read_kanji_levels() -> Result<Vec<(String, String)>> { + Ok(fs::read_to_string("data/kanji_levels.txt")? + .lines() + .filter_map(|l| l.split_once(": ")) + .map(|(l, k)| (l.to_string(), k.to_string())) + .collect::<Vec<_>>()) +} + +pub fn read_examples(all_kanji: &Charset) -> Result<Vec<Example>> { + let file = fs::File::open("data/examples.utf")?; + + let mut ret = Vec::new(); + let mut a = "".to_string(); + + for (i, line) in io::BufReader::new(file).lines().enumerate() { + let line = line?; + if line.starts_with("A:") { + a = line; + } else if line.starts_with("B:") { + let s = a.strip_prefix("A: "); + let t = line.strip_prefix("B: "); + if let (Some(a), Some(b)) = (s, t) { + if let Some((ja, eng)) = a.split_once("\t") { + if let Some((eng, id)) = eng.split_once("#") { + ret.push(Example { + ja: ja.to_string(), + en: eng.to_string(), + expl: b.to_string(), + id: Some(id.to_string()), + chars: Charset::new(ja).inter(all_kanji), + }); + } else { + ret.push(Example { + ja: ja.to_string(), + en: eng.to_string(), + expl: b.to_string(), + id: None, + chars: Charset::new(ja).inter(all_kanji), + }); + } + } + } + } + if i % 10000 == 0 { + eprintln!("read examples: {}/300 (x1000)", i / 1000); + } + } + + Ok(ret) +} + +#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)] +pub struct JlptVocab { + pub level: String, + pub chars: Charset, + pub kanji: String, + pub kana: String, + pub en: String, +} + +impl JlptVocab { + pub fn to_string(&self) -> String { + format!( + "{}\t{}\t{}\t{}\t{}", + self.level, + self.chars.to_string(), + self.kanji, + self.kana, + self.en + ) + } +} + +pub fn parse_jlpt_vocab(all_kanji: &Charset) -> Result<()> { + let mut vocab = vec![]; + vocab.extend(parse_jlpt_vocab_combined( + "data/n5_vocab.txt", + "N5", + all_kanji, + )?); + vocab.extend(parse_jlpt_vocab_split( + "data/n4_vocab_hiragana.txt", + "data/n4_vocab_eng.txt", + "N4", + all_kanji, + )?); + vocab.extend(parse_jlpt_vocab_split( + "data/n3_vocab_hiragana.txt", + "data/n3_vocab_eng.txt", + "N3", + all_kanji, + )?); + vocab.extend(parse_jlpt_vocab_split( + "data/n2_vocab_hiragana.txt", + "data/n2_vocab_eng.txt", + "N2", + all_kanji, + )?); + vocab.extend(parse_jlpt_vocab_split( + "data/n1_vocab_hiragana.txt", + "data/n1_vocab_eng.txt", + "N1", + all_kanji, + )?); + for v in vocab.iter() { + println!("{}", v.to_string()); + } + Ok(()) +} + +fn parse_jlpt_vocab_combined( + file: &str, + level: &str, + all_kanji: &Charset, +) -> Result<Vec<JlptVocab>> { + let lines = jlpt_vocab_read_file(file)?; + let mut ret = vec![]; + for (kanji, answer) in lines { + let (eng, kana) = match answer.split_once('\n') { + Some((a, b)) => (a, b.trim()), + None => (answer.trim(), ""), + }; + for kanji in kanji.split('/') { + ret.push(JlptVocab { + level: level.to_string(), + chars: Charset::new(kanji).inter(all_kanji), + kanji: kanji.to_string(), + kana: kana.to_string(), + en: eng.to_string(), + }); + } + } + Ok(ret) +} + +fn parse_jlpt_vocab_split( + kana_file: &str, + eng_file: &str, + level: &str, + all_kanji: &Charset, +) -> Result<Vec<JlptVocab>> { + let eng_lines = jlpt_vocab_read_file(eng_file)? + .into_iter() + .collect::<HashMap<String, String>>(); + + let lines = jlpt_vocab_read_file(kana_file)?; + let mut ret = vec![]; + for (kanji, kana) in lines { + let eng = eng_lines.get(&kanji).or(eng_lines.get(&kana)); + if let Some(eng) = eng { + for kanji in kanji.split('/') { + ret.push(JlptVocab { + level: level.to_string(), + chars: Charset::new(kanji).inter(all_kanji), + kanji: kanji.to_string(), + kana: kana.to_string(), + en: eng.to_string(), + }); + } + } + } + Ok(ret) +} + +fn jlpt_vocab_read_file(file: &str) -> Result<Vec<(String, String)>> { + let re = regex::Regex::new(r#"<span class="\w+">"#)?; + + let file = fs::File::open(file)?; + let mut ret = vec![]; + for line in io::BufReader::new(file).lines() { + let line = line?.replace("<br>", "\n").replace("</span>", ""); + let line = re.replace_all(&line, ""); + if let Some((a, b)) = line.split_once('|') { + ret.push((a.trim().to_string(), b.trim().to_string())); + } + } + + Ok(ret) +} + +pub fn load_jlpt_vocab() -> Result<Vec<JlptVocab>> { + let file = fs::File::open("data/jlpt_vocab.txt")?; + let mut ret = vec![]; + for line in io::BufReader::new(file).lines() { + let line = line?; + let line = line.splitn(5, "\t").collect::<Vec<_>>(); + if line.len() == 5 { + ret.push(JlptVocab { + level: line[0].to_string(), + chars: Charset::new(line[1]), + kanji: line[2].to_string(), + kana: line[3].to_string(), + en: line[4].to_string(), + }); + } + } + Ok(ret) +} diff --git a/src/format.rs b/src/format.rs new file mode 100644 index 0000000..1cdde1b --- /dev/null +++ b/src/format.rs @@ -0,0 +1,349 @@ +use std::fs; + +use anyhow::Result; + +use crate::charset::Charset; +use crate::*; + +// ===================================================================== +// FORMATTING TO HTML +// ===================================================================== + +pub fn format_batch<'a>(dict_idx: &DictIndex<'a>, count: usize, (i, batch): (usize, &Batch)) { + format_batch_aux(dict_idx, count, i, batch).expect("format batch"); +} + +fn format_batch_aux<'a>( + dict_idx: &DictIndex<'a>, + count: usize, + i: usize, + batch: &Batch, +) -> Result<()> { + let mut f = io::BufWriter::new(fs::File::create(format!("public/{:03}.html", i))?); + write!( + f, + r#"<!DOCTYPE html> + <html> + <head> + <meta charset=\"UTF-8\" /> + <title>Batch #{:03}</title> + <link rel="stylesheet" type="text/css" href="style.css" /> + </head> + <body><div class="batch_page">"#, + i + )?; + + writeln!(f, r#"<p><a href="index.html">index</a>"#)?; + for j in 0..count { + if j != i { + writeln!(f, r#" <a href="{:03}.html">{:03}</a>"#, j, j)?; + } else { + writeln!(f, " {:03}", j)?; + } + } + writeln!(f, r#"</p>"#)?; + writeln!(f, "<p>Level: {}</p>", batch.level)?; + + write!(f, r#"<p class="ja">"#)?; + let mut ex_prev = Charset::default(); + for ex in batch.examples.iter() { + let ex_chars = ex.chars.inter(&batch.chars); + for c in ex_chars.diff(&ex_prev).chars().iter() { + write!( + f, + r#"<a href="https://jisho.org/search/{}%20%23kanji">{}</a>"#, + c, c + )?; + } + ex_prev = ex_prev.union(&ex_chars); + } + writeln!(f, r#"</p>"#)?; + + for ex in batch.examples.iter() { + writeln!(f, "<hr />")?; + write!(f, r#"<p class="ja">"#)?; + for c in ex.ja.chars() { + if batch.chars.contains(c) { + write!(f, r#"<span class="char_cur">{}</span>"#, c)?; + } else if batch.chars_p1.contains(c) { + write!(f, r#"<span class="char_p1">{}</span>"#, c)?; + } else if batch.chars_p2.contains(c) { + write!(f, r#"<span class="char_p2">{}</span>"#, c)?; + } else if batch.chars_bad.contains(c) { + write!(f, r#"<span class="char_bad">{}</span>"#, c)?; + } else { + write!(f, "{}", c)?; + } + } + writeln!(f, "</p>")?; + writeln!(f, r#"<p class="en">{}</p>"#, ex.en)?; + + writeln!(f, r#"<details><summary>Explanation</summary>"#)?; + let mut expl_batch = Vec::new(); + let mut expl_all = Vec::new(); + for word in ex.expl.split(|c| c == ' ' || c == '~') { + let (keb, reb) = expl_clean_word(word); + let wchars = Charset::new(keb); + if !wchars.intersects(&ex.chars) { + continue; + } + if let Some(ents) = dict_idx.get(keb) { + for ent in ents.iter() { + if let Some(s) = dict_str(keb, reb, ent) { + if wchars.intersects(&batch.chars) { + expl_batch.push(s); + } else { + expl_all.push(s); + } + } + } + } + } + for be in expl_batch { + writeln!(f, r#"<p>{}</p>"#, be)?; + } + writeln!(f, r#"<p class="chars">"#)?; + for c in ex.chars.inter(&batch.chars).chars().iter() { + writeln!( + f, + r#"<a href="https://jisho.org/search/{}%20%23kanji">{}</a>"#, + c, c + )?; + } + writeln!(f, r#"</p>"#)?; + for be in expl_all { + writeln!(f, r#"<p>{}</p>"#, be)?; + } + writeln!(f, r#"</details>"#)?; + } + + writeln!(f, "<hr />")?; + format_vocab( + &mut f, + &batch + .extra_vocab + .iter() + .filter(|v| batch.level.contains(&v.level)) + .collect::<Vec<_>>(), + "Extra vocabulary (this level)", + )?; + format_vocab( + &mut f, + &batch + .extra_vocab + .iter() + .filter(|v| !batch.level.contains(&v.level)) + .collect::<Vec<_>>(), + "Extra vocabulary (previous levels)", + )?; + + writeln!( + f, + r#"<details><summary>Extra examples (reading practice)</summary><table class="extratable">"# + )?; + for ex in batch.extra_examples.iter() { + let mut expl1 = Vec::new(); + let mut expl2 = Vec::new(); + for word in ex.expl.split(|c| c == ' ' || c == '~') { + let (keb, reb) = expl_clean_word(word); + let wchars = Charset::new(keb); + if !wchars.intersects(&ex.chars) { + continue; + } + if let Some(ents) = dict_idx.get(keb) { + for ent in ents.iter() { + if let Some(s) = dict_str_short(keb, reb, ent) { + if wchars.intersects(&batch.chars) { + expl1.push(s); + } else { + expl2.push(s); + } + } + } + } + } + expl1.extend(expl2.into_iter()); + let expl = expl1.join("<br />"); + writeln!( + f, + r#"<tr><td><details><summary class="tab_large2 font_ja"> {} </summary><div style="text-align: center">{}<br />{}</div></details></td></tr>"#, + ex.ja, ex.en, expl + )?; + } + writeln!(f, r#"</table></details>"#)?; + + writeln!(f, "<hr />")?; + writeln!(f, "<p>\(≧▽≦)/</p>")?; + + write!(f, "<div></body></html>")?; + f.flush()?; + Ok(()) +} + +fn format_vocab(f: &mut impl Write, vocab: &[&JlptVocab], t: &str) -> Result<()> { + if !vocab.is_empty() { + writeln!( + f, + r#"<details><summary>{}</summary><table class="vocabtable">"#, + t + )?; + for v in vocab { + writeln!( + f, + r#"<tr><td>{}</td><td> <span class="tab_large font_ja">{}</span> </td><td>{}</td><td class="font_ja">{}</td></tr>"#, + v.level, v.kanji, v.en, v.kana + )?; + } + writeln!(f, "</table></details>")?; + } + Ok(()) +} + +fn expl_clean_word(w: &str) -> (&str, Option<&str>) { + let mut ret = w; + for delim in ['(', '{', '['] { + if let Some((s, _)) = ret.split_once(delim) { + ret = s; + } + } + let p = w + .split_once('(') + .and_then(|(_, r)| r.split_once(')')) + .map(|(p, _)| p); + (ret, p) +} + +fn dict_str_short<'a>( + qkeb: &str, + qreb: Option<&str>, + ent: &roxmltree::Node<'a, 'a>, +) -> Option<String> { + let r_ele = ent.children().find(|x| x.has_tag_name("r_ele")).unwrap(); + let reb = r_ele.children().find(|x| x.has_tag_name("reb")).unwrap(); + let reb = reb.text().unwrap().trim(); + + if qreb.map(|x| x != reb).unwrap_or(false) { + return None; + } + + Some(format!( + r#"<span class="font_ja">{} 【{}】</span>"#, + qkeb, reb + )) +} + +fn dict_str<'a>(qkeb: &str, qreb: Option<&str>, ent: &roxmltree::Node<'a, 'a>) -> Option<String> { + let mut ret = dict_str_short(qkeb, qreb, ent)?; + + for sense in ent.children().filter(|x| x.has_tag_name("sense")) { + if let Some(s) = sense.children().find(|x| x.has_tag_name("gloss")) { + ret.extend(format!(" {};", s.text().unwrap().trim()).chars()); + } + } + + if ret.chars().rev().next() == Some(';') { + ret.pop(); + } + Some(ret) +} + +pub fn format_index(batches: &[Batch], kanji_levels: &[(String, String)]) -> Result<()> { + let mut f = io::BufWriter::new(fs::File::create("public/index.html")?); + write!( + f, + r#"<!DOCTYPE html> + <html> + <head> + <meta charset=\"UTF-8\" /> + <title>List of batches</title> + <link rel="stylesheet" type="text/css" href="style.css" /> + </head> + <body><div class="index_page">"# + )?; + + writeln!(f, r#"<p><a href="about.html">About / How-to</a></p><hr />"#)?; + + writeln!(f, "<table>")?; + writeln!(f, "<tr><th>Num</th><th>Level</th><th>Kanji</th><th>Examples</th><th>Lesson-1</th><th>Lesson-2</th><th>Ignore</th></tr>")?; + for (i, batch) in batches.iter().enumerate() { + writeln!( + f, + r#"<tr><td><a href="{:03}.html">{:03}</a></td><td>{}</td><td class="font_ja">{}</td><td> {}</td><td class="font_ja">{}</td><td class="font_ja">{}</td><td class="font_ja">{}</td></tr>"#, + i, + i, + batch.level, + batch.chars.to_string(), + batch.examples.len(), + batch.chars_p1.to_string(), + batch.chars_p2.to_string(), + batch.chars_bad.to_string() + )?; + } + writeln!(f, r#"</table>"#)?; + + writeln!(f, "<hr />")?; + + let all_chars = Charset::from_iter( + batches + .iter() + .map(|x| x.chars.chars().iter().copied()) + .flatten(), + ); + writeln!(f, "<table>")?; + writeln!( + f, + r#"<tr><th>Level</th><th>Count</th><th width="60%">Kanji</th><th>Missing kanji</th></tr>"# + )?; + for (lvl, chars) in kanji_levels.iter() { + if lvl == "N0+" || lvl.ends_with("-10") { + continue; + } + let chars = Charset::new(chars); + let missing = chars.diff(&all_chars); + writeln!( + f, + r#"<tr><td>{}</td><td>{}</td><td class="font_ja">{}</td><td><span class="font_ja">{}</span> ({})</td></tr>"#, + lvl, + chars.len(), + chars.to_string(), + missing.to_string(), + missing.len() + )?; + } + writeln!(f, "</table>")?; + + write!(f, "</div></body></html>")?; + f.flush()?; + Ok(()) +} + +pub fn format_about() -> Result<()> { + let mut f = io::BufWriter::new(fs::File::create("public/about.html")?); + write!( + f, + r#"<!DOCTYPE html> + <html> + <head> + <meta charset=\"UTF-8\" /> + <title>Datagengo README</title> + <link rel="stylesheet" type="text/css" href="style.css" /> + </head> + <body>"# + )?; + + writeln!(f, r#"<div class="about_page">"#)?; + writeln!( + f, + r#"<p><a href="index.html">Back to lessons</a></p><hr />"# + )?; + + writeln!( + f, + "{}", + markdown::to_html(&fs::read_to_string("README.md")?) + )?; + + writeln!(f, r#"</div></body></html>"#)?; + + Ok(()) +} diff --git a/src/main.rs b/src/main.rs index 4ec20d0..5d6b7d7 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,15 +1,19 @@ use std::collections::HashMap; use std::fs; -use std::io::{self, BufRead, Write}; +use std::io::{self, Write}; -use anyhow::Result; +//use anyhow::Result; use rand::prelude::*; use rayon::prelude::*; use serde::{Deserialize, Serialize}; use structopt::StructOpt; mod charset; +mod datafiles; +mod format; use charset::Charset; +use datafiles::*; +use format::*; #[derive(Debug, StructOpt)] #[structopt(name = "datagengo", about = "Japanese example practice maker")] @@ -195,325 +199,12 @@ fn main() { } // ===================================================================== -// PARSING DATA FILES -// ===================================================================== - -type DictIndex<'a> = HashMap<&'a str, Vec<roxmltree::Node<'a, 'a>>>; -fn index_jmdict<'a>(dict: &'a roxmltree::Document) -> DictIndex<'a> { - let dict = dict - .root() - .children() - .find(|x| x.has_tag_name("JMdict")) - .unwrap(); - - let mut ret: DictIndex<'a> = HashMap::new(); - for x in dict.children().filter(|x| x.has_tag_name("entry")) { - for r in x.children().filter(|x| x.has_tag_name("k_ele")) { - if let Some(keb) = r.children().find(|x| x.has_tag_name("keb")) { - let txt = keb.text().unwrap().trim(); - ret.entry(txt).or_default().push(x); - } - } - } - - ret -} - -fn parse_kanjidic() -> Result<Vec<(String, Charset)>> { - let n3_kanji = Charset::new(&fs::read_to_string("data/n3_kanji.txt")?.trim()); - - let file = fs::read_to_string("data/kanjidic2.xml")?; - let xml = roxmltree::Document::parse(&file)?; - let kanjidic = xml.root().first_child().unwrap(); - assert!(kanjidic.has_tag_name("kanjidic2")); - - let mut levels = HashMap::new(); - - for x in kanjidic.children() { - if !x.has_tag_name("character") { - continue; - } - let mut literal = None; - let mut jlpt = None; - let mut grade = None; - for y in x.children() { - if y.has_tag_name("literal") { - literal = y.text(); - } - if y.has_tag_name("misc") { - for z in y.children() { - if z.has_tag_name("jlpt") { - jlpt = z.text().and_then(|x| str::parse::<i32>(x).ok()); - } - if z.has_tag_name("grade") { - grade = z.text().and_then(|x| str::parse::<i32>(x).ok()); - } - } - } - } - match grade { - Some(i) if i <= 6 => grade = Some(7), - _ => (), - } - if let Some(lit) = literal { - assert_eq!(lit.chars().count(), 1); - let jlpt = match jlpt { - Some(4) => Some(5), - Some(3) => Some(4), - Some(2) if n3_kanji.contains(lit.chars().next().unwrap()) => Some(3), - x => x, - }; - levels - .entry((jlpt, grade)) - .or_insert(String::new()) - .extend(lit.chars()); - } - } - - let mut levels = levels.into_iter().collect::<Vec<_>>(); - levels.sort_by_key(|((j, g), _)| match (j, g) { - (Some(j), Some(g)) => (10 - *j) * 20 + *g, - (Some(j), None) => (10 - *j) * 20 + 15, - (None, Some(g)) => 1000 + *g, - (None, None) => 1015, - }); - - let mut ret = Vec::new(); - let mut pc = Charset::default(); - for ((j, g), chars) in levels.into_iter() { - let name = match (j, g) { - (Some(j), Some(7)) => format!("N{}a", j), - (Some(j), Some(8)) => format!("N{}b", j), - (Some(j), Some(g)) => format!("N{}-{}", j, g), - (Some(j), None) => format!("N{}+", j), - (None, Some(7)) => format!("N0a"), - (None, Some(8)) => format!("N0b"), - (None, Some(g)) => format!("N0-{}", g), - (None, None) => format!("N0+"), - }; - let chars = Charset::new(chars).diff(&pc); - pc = pc.union(&chars); - ret.push((name, chars)); - } - - Ok(ret) -} - -fn read_kanji_levels() -> Result<Vec<(String, String)>> { - Ok(fs::read_to_string("data/kanji_levels.txt")? - .lines() - .filter_map(|l| l.split_once(": ")) - .map(|(l, k)| (l.to_string(), k.to_string())) - .collect::<Vec<_>>()) -} - -fn read_examples(all_kanji: &Charset) -> Result<Vec<Example>> { - let file = fs::File::open("data/examples.utf")?; - - let mut ret = Vec::new(); - let mut a = "".to_string(); - - for (i, line) in io::BufReader::new(file).lines().enumerate() { - let line = line?; - if line.starts_with("A:") { - a = line; - } else if line.starts_with("B:") { - let s = a.strip_prefix("A: "); - let t = line.strip_prefix("B: "); - if let (Some(a), Some(b)) = (s, t) { - if let Some((ja, eng)) = a.split_once("\t") { - if let Some((eng, id)) = eng.split_once("#") { - ret.push(Example { - ja: ja.to_string(), - en: eng.to_string(), - expl: b.to_string(), - id: Some(id.to_string()), - chars: Charset::new(ja).inter(all_kanji), - }); - } else { - ret.push(Example { - ja: ja.to_string(), - en: eng.to_string(), - expl: b.to_string(), - id: None, - chars: Charset::new(ja).inter(all_kanji), - }); - } - } - } - } - if i % 10000 == 0 { - eprintln!("read examples: {}/300 (x1000)", i / 1000); - } - } - - Ok(ret) -} - -#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)] -struct JlptVocab { - level: String, - chars: Charset, - kanji: String, - kana: String, - en: String, -} - -impl JlptVocab { - fn to_string(&self) -> String { - format!( - "{}\t{}\t{}\t{}\t{}", - self.level, - self.chars.to_string(), - self.kanji, - self.kana, - self.en - ) - } -} - -fn parse_jlpt_vocab(all_kanji: &Charset) -> Result<()> { - let mut vocab = vec![]; - vocab.extend(parse_jlpt_vocab_combined( - "data/n5_vocab.txt", - "N5", - all_kanji, - )?); - vocab.extend(parse_jlpt_vocab_split( - "data/n4_vocab_hiragana.txt", - "data/n4_vocab_eng.txt", - "N4", - all_kanji, - )?); - vocab.extend(parse_jlpt_vocab_split( - "data/n3_vocab_hiragana.txt", - "data/n3_vocab_eng.txt", - "N3", - all_kanji, - )?); - vocab.extend(parse_jlpt_vocab_split( - "data/n2_vocab_hiragana.txt", - "data/n2_vocab_eng.txt", - "N2", - all_kanji, - )?); - vocab.extend(parse_jlpt_vocab_split( - "data/n1_vocab_hiragana.txt", - "data/n1_vocab_eng.txt", - "N1", - all_kanji, - )?); - for v in vocab.iter() { - println!("{}", v.to_string()); - } - Ok(()) -} - -fn parse_jlpt_vocab_combined( - file: &str, - level: &str, - all_kanji: &Charset, -) -> Result<Vec<JlptVocab>> { - let lines = jlpt_vocab_read_file(file)?; - let mut ret = vec![]; - for (kanji, answer) in lines { - let (eng, kana) = match answer.split_once('\n') { - Some((a, b)) => (a, b.trim()), - None => (answer.trim(), ""), - }; - for kanji in kanji.split('/') { - ret.push(JlptVocab { - level: level.to_string(), - chars: Charset::new(kanji).inter(all_kanji), - kanji: kanji.to_string(), - kana: kana.to_string(), - en: eng.to_string(), - }); - } - } - Ok(ret) -} - -fn parse_jlpt_vocab_split( - kana_file: &str, - eng_file: &str, - level: &str, - all_kanji: &Charset, -) -> Result<Vec<JlptVocab>> { - let eng_lines = jlpt_vocab_read_file(eng_file)? - .into_iter() - .collect::<HashMap<String, String>>(); - - let lines = jlpt_vocab_read_file(kana_file)?; - let mut ret = vec![]; - for (kanji, kana) in lines { - let eng = eng_lines.get(&kanji).or(eng_lines.get(&kana)); - if let Some(eng) = eng { - for kanji in kanji.split('/') { - ret.push(JlptVocab { - level: level.to_string(), - chars: Charset::new(kanji).inter(all_kanji), - kanji: kanji.to_string(), - kana: kana.to_string(), - en: eng.to_string(), - }); - } - } - } - Ok(ret) -} - -fn jlpt_vocab_read_file(file: &str) -> Result<Vec<(String, String)>> { - let re = regex::Regex::new(r#"<span class="\w+">"#)?; - - let file = fs::File::open(file)?; - let mut ret = vec![]; - for line in io::BufReader::new(file).lines() { - let line = line?.replace("<br>", "\n").replace("</span>", ""); - let line = re.replace_all(&line, ""); - if let Some((a, b)) = line.split_once('|') { - ret.push((a.trim().to_string(), b.trim().to_string())); - } - } - - Ok(ret) -} - -fn load_jlpt_vocab() -> Result<Vec<JlptVocab>> { - let file = fs::File::open("data/jlpt_vocab.txt")?; - let mut ret = vec![]; - for line in io::BufReader::new(file).lines() { - let line = line?; - let line = line.splitn(5, "\t").collect::<Vec<_>>(); - if line.len() == 5 { - ret.push(JlptVocab { - level: line[0].to_string(), - chars: Charset::new(line[1]), - kanji: line[2].to_string(), - kana: line[3].to_string(), - en: line[4].to_string(), - }); - } - } - Ok(ret) -} - -// ===================================================================== // BATCH STRUCTURES AND GENERATION // ===================================================================== const CHARS_PER_BATCH: usize = 20; const MAX_NEW_CHARS_PER_EX: usize = 5; -#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] -struct Example { - ja: String, - en: String, - expl: String, - id: Option<String>, - chars: Charset, -} - #[derive(Debug, Clone, Serialize, Deserialize, Default, PartialEq)] struct Batch { level: String, @@ -1109,346 +800,3 @@ fn add_extra_examples(all_batches: &mut [Batch], examples: &[Example]) { ); } } - -// ===================================================================== -// FORMATTING TO HTML -// ===================================================================== - -fn format_batch<'a>(dict_idx: &DictIndex<'a>, count: usize, (i, batch): (usize, &Batch)) { - format_batch_aux(dict_idx, count, i, batch).expect("format batch"); -} - -fn format_batch_aux<'a>( - dict_idx: &DictIndex<'a>, - count: usize, - i: usize, - batch: &Batch, -) -> Result<()> { - let mut f = io::BufWriter::new(fs::File::create(format!("public/{:03}.html", i))?); - write!( - f, - r#"<!DOCTYPE html> - <html> - <head> - <meta charset=\"UTF-8\" /> - <title>Batch #{:03}</title> - <link rel="stylesheet" type="text/css" href="style.css" /> - </head> - <body><div class="batch_page">"#, - i - )?; - - writeln!(f, r#"<p><a href="index.html">index</a>"#)?; - for j in 0..count { - if j != i { - writeln!(f, r#" <a href="{:03}.html">{:03}</a>"#, j, j)?; - } else { - writeln!(f, " {:03}", j)?; - } - } - writeln!(f, r#"</p>"#)?; - writeln!(f, "<p>Level: {}</p>", batch.level)?; - - write!(f, r#"<p class="ja">"#)?; - let mut ex_prev = Charset::default(); - for ex in batch.examples.iter() { - let ex_chars = ex.chars.inter(&batch.chars); - for c in ex_chars.diff(&ex_prev).chars().iter() { - write!( - f, - r#"<a href="https://jisho.org/search/{}%20%23kanji">{}</a>"#, - c, c - )?; - } - ex_prev = ex_prev.union(&ex_chars); - } - writeln!(f, r#"</p>"#)?; - - for ex in batch.examples.iter() { - writeln!(f, "<hr />")?; - write!(f, r#"<p class="ja">"#)?; - for c in ex.ja.chars() { - if batch.chars.contains(c) { - write!(f, r#"<span class="char_cur">{}</span>"#, c)?; - } else if batch.chars_p1.contains(c) { - write!(f, r#"<span class="char_p1">{}</span>"#, c)?; - } else if batch.chars_p2.contains(c) { - write!(f, r#"<span class="char_p2">{}</span>"#, c)?; - } else if batch.chars_bad.contains(c) { - write!(f, r#"<span class="char_bad">{}</span>"#, c)?; - } else { - write!(f, "{}", c)?; - } - } - writeln!(f, "</p>")?; - writeln!(f, r#"<p class="en">{}</p>"#, ex.en)?; - - writeln!(f, r#"<details><summary>Explanation</summary>"#)?; - let mut expl_batch = Vec::new(); - let mut expl_all = Vec::new(); - for word in ex.expl.split(|c| c == ' ' || c == '~') { - let (keb, reb) = expl_clean_word(word); - let wchars = Charset::new(keb); - if !wchars.intersects(&ex.chars) { - continue; - } - if let Some(ents) = dict_idx.get(keb) { - for ent in ents.iter() { - if let Some(s) = dict_str(keb, reb, ent) { - if wchars.intersects(&batch.chars) { - expl_batch.push(s); - } else { - expl_all.push(s); - } - } - } - } - } - for be in expl_batch { - writeln!(f, r#"<p>{}</p>"#, be)?; - } - writeln!(f, r#"<p class="chars">"#)?; - for c in ex.chars.inter(&batch.chars).chars().iter() { - writeln!( - f, - r#"<a href="https://jisho.org/search/{}%20%23kanji">{}</a>"#, - c, c - )?; - } - writeln!(f, r#"</p>"#)?; - for be in expl_all { - writeln!(f, r#"<p>{}</p>"#, be)?; - } - writeln!(f, r#"</details>"#)?; - } - - writeln!(f, "<hr />")?; - format_vocab( - &mut f, - &batch - .extra_vocab - .iter() - .filter(|v| batch.level.contains(&v.level)) - .collect::<Vec<_>>(), - "Extra vocabulary (this level)", - )?; - format_vocab( - &mut f, - &batch - .extra_vocab - .iter() - .filter(|v| !batch.level.contains(&v.level)) - .collect::<Vec<_>>(), - "Extra vocabulary (previous levels)", - )?; - - writeln!( - f, - r#"<details><summary>Extra examples (reading practice)</summary><table class="extratable">"# - )?; - for ex in batch.extra_examples.iter() { - let mut expl1 = Vec::new(); - let mut expl2 = Vec::new(); - for word in ex.expl.split(|c| c == ' ' || c == '~') { - let (keb, reb) = expl_clean_word(word); - let wchars = Charset::new(keb); - if !wchars.intersects(&ex.chars) { - continue; - } - if let Some(ents) = dict_idx.get(keb) { - for ent in ents.iter() { - if let Some(s) = dict_str_short(keb, reb, ent) { - if wchars.intersects(&batch.chars) { - expl1.push(s); - } else { - expl2.push(s); - } - } - } - } - } - expl1.extend(expl2.into_iter()); - let expl = expl1.join("<br />"); - writeln!( - f, - r#"<tr><td><details><summary class="tab_large2 font_ja"> {} </summary><div style="text-align: center">{}<br />{}</div></details></td></tr>"#, - ex.ja, ex.en, expl - )?; - } - writeln!(f, r#"</table></details>"#)?; - - writeln!(f, "<hr />")?; - writeln!(f, "<p>\(≧▽≦)/</p>")?; - - write!(f, "<div></body></html>")?; - f.flush()?; - Ok(()) -} - -fn format_vocab(f: &mut impl Write, vocab: &[&JlptVocab], t: &str) -> Result<()> { - if !vocab.is_empty() { - writeln!( - f, - r#"<details><summary>{}</summary><table class="vocabtable">"#, - t - )?; - for v in vocab { - writeln!( - f, - r#"<tr><td>{}</td><td> <span class="tab_large font_ja">{}</span> </td><td>{}</td><td class="font_ja">{}</td></tr>"#, - v.level, v.kanji, v.en, v.kana - )?; - } - writeln!(f, "</table></details>")?; - } - Ok(()) -} - -fn expl_clean_word(w: &str) -> (&str, Option<&str>) { - let mut ret = w; - for delim in ['(', '{', '['] { - if let Some((s, _)) = ret.split_once(delim) { - ret = s; - } - } - let p = w - .split_once('(') - .and_then(|(_, r)| r.split_once(')')) - .map(|(p, _)| p); - (ret, p) -} - -fn dict_str_short<'a>( - qkeb: &str, - qreb: Option<&str>, - ent: &roxmltree::Node<'a, 'a>, -) -> Option<String> { - let r_ele = ent.children().find(|x| x.has_tag_name("r_ele")).unwrap(); - let reb = r_ele.children().find(|x| x.has_tag_name("reb")).unwrap(); - let reb = reb.text().unwrap().trim(); - - if qreb.map(|x| x != reb).unwrap_or(false) { - return None; - } - - Some(format!( - r#"<span class="font_ja">{} 【{}】</span>"#, - qkeb, reb - )) -} - -fn dict_str<'a>(qkeb: &str, qreb: Option<&str>, ent: &roxmltree::Node<'a, 'a>) -> Option<String> { - let mut ret = dict_str_short(qkeb, qreb, ent)?; - - for sense in ent.children().filter(|x| x.has_tag_name("sense")) { - if let Some(s) = sense.children().find(|x| x.has_tag_name("gloss")) { - ret.extend(format!(" {};", s.text().unwrap().trim()).chars()); - } - } - - if ret.chars().rev().next() == Some(';') { - ret.pop(); - } - Some(ret) -} - -fn format_index(batches: &[Batch], kanji_levels: &[(String, String)]) -> Result<()> { - let mut f = io::BufWriter::new(fs::File::create("public/index.html")?); - write!( - f, - r#"<!DOCTYPE html> - <html> - <head> - <meta charset=\"UTF-8\" /> - <title>List of batches</title> - <link rel="stylesheet" type="text/css" href="style.css" /> - </head> - <body><div class="index_page">"# - )?; - - writeln!(f, r#"<p><a href="about.html">About / How-to</a></p><hr />"#)?; - - writeln!(f, "<table>")?; - writeln!(f, "<tr><th>Num</th><th>Level</th><th>Kanji</th><th>Examples</th><th>Lesson-1</th><th>Lesson-2</th><th>Ignore</th></tr>")?; - for (i, batch) in batches.iter().enumerate() { - writeln!( - f, - r#"<tr><td><a href="{:03}.html">{:03}</a></td><td>{}</td><td class="font_ja">{}</td><td> {}</td><td class="font_ja">{}</td><td class="font_ja">{}</td><td class="font_ja">{}</td></tr>"#, - i, - i, - batch.level, - batch.chars.to_string(), - batch.examples.len(), - batch.chars_p1.to_string(), - batch.chars_p2.to_string(), - batch.chars_bad.to_string() - )?; - } - writeln!(f, r#"</table>"#)?; - - writeln!(f, "<hr />")?; - - let all_chars = Charset::from_iter( - batches - .iter() - .map(|x| x.chars.chars().iter().copied()) - .flatten(), - ); - writeln!(f, "<table>")?; - writeln!( - f, - r#"<tr><th>Level</th><th>Count</th><th width="60%">Kanji</th><th>Missing kanji</th></tr>"# - )?; - for (lvl, chars) in kanji_levels.iter() { - if lvl == "N0+" || lvl.ends_with("-10") { - continue; - } - let chars = Charset::new(chars); - let missing = chars.diff(&all_chars); - writeln!( - f, - r#"<tr><td>{}</td><td>{}</td><td class="font_ja">{}</td><td><span class="font_ja">{}</span> ({})</td></tr>"#, - lvl, - chars.len(), - chars.to_string(), - missing.to_string(), - missing.len() - )?; - } - writeln!(f, "</table>")?; - - write!(f, "</div></body></html>")?; - f.flush()?; - Ok(()) -} - -fn format_about() -> Result<()> { - let mut f = io::BufWriter::new(fs::File::create("public/about.html")?); - write!( - f, - r#"<!DOCTYPE html> - <html> - <head> - <meta charset=\"UTF-8\" /> - <title>Datagengo README</title> - <link rel="stylesheet" type="text/css" href="style.css" /> - </head> - <body>"# - )?; - - writeln!(f, r#"<div class="about_page">"#)?; - writeln!( - f, - r#"<p><a href="index.html">Back to lessons</a></p><hr />"# - )?; - - writeln!( - f, - "{}", - markdown::to_html(&fs::read_to_string("README.md")?) - )?; - - writeln!(f, r#"</div></body></html>"#)?; - - Ok(()) -} |