use std::collections::HashMap; use std::fs; use std::io::{self, BufRead, Write}; use anyhow::{anyhow, Result}; use rayon::prelude::*; use serde::{Deserialize, Serialize}; use structopt::StructOpt; mod charset; use charset::Charset; #[derive(Debug, StructOpt)] #[structopt(name = "datagengo", about = "Japanese example practice maker")] struct Opt { #[structopt(subcommand)] cmd: Cmd, } #[derive(Debug, StructOpt)] enum Cmd { ParseKanjidic, New { #[structopt(default_value = "10")] count: usize, }, Simplify, Rebalance { #[structopt(default_value = "5")] start: usize, }, Format, } fn main() { let opt = Opt::from_args(); match opt.cmd { Cmd::ParseKanjidic => { let levels = parse_kanjidic().expect("error"); for (level, chars) in levels.iter() { println!("{}: {}", level, chars.to_string()); } } Cmd::New { count } => { let kanji_levels = read_kanji_levels().expect("read_kanji_levels"); let all_kanji = Charset::new( kanji_levels .iter() .map(|(_, x)| x.to_string()) .collect::>() .join(""), ); let kanji_levels = kanji_levels .into_iter() .map(|(l, x)| (l, Charset::new(x))) .collect::>(); let mut ex = read_examples(&all_kanji).expect("read_examples"); ex.retain(|e| (5..=25).contains(&e.ja.chars().count())); let mut batches: Vec = fs::read("data/batches.json") .map_err(anyhow::Error::from) .and_then(|x| Ok(serde_json::from_slice(&x)?)) .unwrap_or_default(); println!("---- starting after {} batches ----", batches.len()); for _ in 0..count { let batch = gen_batch(&batches, &kanji_levels, &ex).expect("gen_batch"); if batch.examples.is_empty() { break; } batches.push(batch); } fs::write( "data/batches.json", serde_json::to_string_pretty(&batches) .expect("serialize") .as_bytes(), ) .expect("save"); } Cmd::Simplify => { let mut batches: Vec = fs::read("data/batches.json") .map_err(anyhow::Error::from) .and_then(|x| Ok(serde_json::from_slice(&x)?)) .unwrap_or_default(); for batch in batches.iter_mut() { simplify_batch(batch); } fs::write( "data/batches.json", serde_json::to_string_pretty(&batches) .expect("serialize") .as_bytes(), ) .expect("save"); } Cmd::Rebalance { start } => { let mut batches: Vec = fs::read("data/batches.json") .map_err(anyhow::Error::from) .and_then(|x| Ok(serde_json::from_slice(&x)?)) .unwrap_or_default(); let kanji_levels = read_kanji_levels().expect("read_kanji_levels"); for (level, _) in kanji_levels.iter() { rebalance_level(level, &mut batches[start..]); } fs::write( "data/batches.json", serde_json::to_string_pretty(&batches) .expect("serialize") .as_bytes(), ) .expect("save"); } Cmd::Format => { let jmdict = fs::read_to_string("data/JMdict_e.xml").expect("read_jmdict"); let jmdict = roxmltree::Document::parse_with_options( &jmdict, roxmltree::ParsingOptions { allow_dtd: true, ..Default::default() }, ) .expect("parse_jmdict"); let jmdict_idx = index_jmdict(&jmdict); let batches = fs::read("data/batches.json") .map_err(anyhow::Error::from) .and_then(|x| Ok(serde_json::from_slice::>(&x)?)) .expect("read/parse"); fs::create_dir_all("public").expect("mkdir public"); fs::copy("static/style.css", "public/style.css").expect("copy style.css"); batches .iter() .enumerate() .for_each(|x| format_batch(&jmdict_idx, batches.len(), x)); let kanji_levels = read_kanji_levels().expect("read_kanji_levels"); format_index(&batches, &kanji_levels).expect("format_index"); } } } // ===================================================================== // PARSING DATA FILES // ===================================================================== type DictIndex<'a> = HashMap<&'a str, Vec>>; fn index_jmdict<'a>(dict: &'a roxmltree::Document) -> DictIndex<'a> { let dict = dict .root() .children() .find(|x| x.has_tag_name("JMdict")) .unwrap(); let mut ret: DictIndex<'a> = HashMap::new(); for x in dict.children().filter(|x| x.has_tag_name("entry")) { for r in x.children().filter(|x| x.has_tag_name("k_ele")) { if let Some(keb) = r.children().find(|x| x.has_tag_name("keb")) { let txt = keb.text().unwrap().trim(); ret.entry(txt).or_default().push(x); } } } ret } fn parse_kanjidic() -> Result> { let file = fs::read_to_string("data/kanjidic2.xml")?; let xml = roxmltree::Document::parse(&file)?; let kanjidic = xml.root().first_child().unwrap(); assert!(kanjidic.has_tag_name("kanjidic2")); let mut levels = HashMap::new(); for x in kanjidic.children() { if !x.has_tag_name("character") { continue; } let mut literal = None; let mut jlpt = None; let mut grade = None; for y in x.children() { if y.has_tag_name("literal") { literal = y.text(); } if y.has_tag_name("misc") { for z in y.children() { if z.has_tag_name("jlpt") { jlpt = z.text().and_then(|x| str::parse::(x).ok()); } if z.has_tag_name("grade") { grade = z.text().and_then(|x| str::parse::(x).ok()); } } } } if let Some(lit) = literal { levels .entry((jlpt, grade)) .or_insert(String::new()) .extend(lit.chars()); } } let mut levels = levels.into_iter().collect::>(); levels.sort_by_key(|((j, g), _)| match (j, g) { (Some(j), Some(g)) => (10 - *j) * 20 + *g, (Some(j), None) => (10 - *j) * 20 + 15, (None, Some(g)) => 1000 + *g, (None, None) => 1015, }); let mut ret = Vec::new(); let mut pc = Charset::default(); for ((j, g), chars) in levels.into_iter() { let name = match (j, g) { (Some(j), Some(g)) => format!("N{}-{}", j, g), (Some(j), None) => format!("N{}+", j), (None, Some(g)) => format!("N0-{}", g), (None, None) => format!("N0+"), }; let chars = Charset::new(chars).diff(&pc); pc = pc.union(&chars); ret.push((name, chars)); } Ok(ret) } fn read_kanji_levels() -> Result> { Ok(fs::read_to_string("data/kanji_levels.txt")? .lines() .filter_map(|l| l.split_once(": ")) .map(|(l, k)| (l.to_string(), k.to_string())) .collect::>()) } fn read_examples(all_kanji: &Charset) -> Result> { let file = fs::File::open("data/examples.utf")?; let mut ret = Vec::new(); let mut a = "".to_string(); for (i, line) in io::BufReader::new(file).lines().enumerate() { let line = line?; if line.starts_with("A:") { a = line; } else if line.starts_with("B:") { let s = a.strip_prefix("A: "); let t = line.strip_prefix("B: "); if let (Some(a), Some(b)) = (s, t) { if let Some((ja, eng)) = a.split_once("\t") { if let Some((eng, id)) = eng.split_once("#") { ret.push(Example { ja: ja.to_string(), en: eng.to_string(), expl: b.to_string(), id: Some(id.to_string()), chars: Charset::new(ja).inter(all_kanji), }); } else { ret.push(Example { ja: ja.to_string(), en: eng.to_string(), expl: b.to_string(), id: None, chars: Charset::new(ja).inter(all_kanji), }); } } } } if i % 10000 == 0 { eprintln!("read examples: {}/300", i / 1000); } } Ok(ret) } // ===================================================================== // BATCH STRUCTURES AND GENERATION // ===================================================================== #[derive(Debug, Clone, Serialize, Deserialize)] struct Example { ja: String, en: String, expl: String, id: Option, chars: Charset, } #[derive(Debug, Clone, Serialize, Deserialize)] struct Batch { level: String, chars: Charset, chars_p1: Charset, chars_p2: Charset, chars_bad: Charset, examples: Vec, } fn gen_batch( previous: &[Batch], kanji_levels: &[(String, Charset)], examples: &[Example], ) -> Result { let prev_chars = Charset::from_iter( previous .iter() .map(|x| x.chars.chars().iter().copied()) .flatten(), ); let (mut target_i, target_level, mut target_chars) = kanji_levels .iter() .enumerate() .map(|(i, (l, c))| (i, l, c.diff(&prev_chars))) .find(|(_, _, c)| !c.is_empty()) .ok_or(anyhow!("no more batches to make!"))?; let chars_p1 = previous .iter() .rev() .next() .map(|b| b.chars.clone()) .unwrap_or(Charset::default()); let chars_p2 = previous .iter() .rev() .skip(1) .next() .map(|b| b.chars.clone()) .unwrap_or(Charset::default()); let mut chars_late = Charset::default(); let mut chars_bad = Charset::from_iter( kanji_levels .iter() .skip(target_i + 1) .map(|(_, c)| c.chars().iter().copied()) .flatten(), ); let mut chars_bad_avoid = Charset::from_iter( kanji_levels .iter() .skip(target_i + 1) .filter(|(l, _)| !l.ends_with("-9") && !l.ends_with("-10")) .map(|(_, c)| c.chars().iter().copied()) .flatten(), ); let mut batch = Batch { level: target_level.to_string(), chars: Charset::default(), chars_p1: Charset::default(), chars_p2: Charset::default(), chars_bad: Charset::default(), examples: Vec::new(), }; let mut batch_chars = Charset::default(); eprintln!("----"); eprintln!("Level : {}", batch.level); eprintln!("Target : {}", target_chars.to_string()); eprintln!("Prev1 : {}", chars_p1.to_string()); eprintln!("Prev2 : {}", chars_p2.to_string()); eprintln!("Bad : {} characters", chars_bad.len()); let batch_len = 20; let mut stalled = false; while batch.chars.len() < batch_len && !target_chars.is_empty() { let need = batch_len - batch.chars.len(); let should_add = need > target_chars.len() && target_chars.len() <= 3; if target_i + 1 < kanji_levels.len() && (should_add || stalled) { // upgrade to next level target_i += 1; chars_late = chars_late.union(&target_chars); target_chars = target_chars.union(&kanji_levels[target_i].1.diff(&prev_chars)); chars_bad = chars_bad.diff(&target_chars); chars_bad_avoid = chars_bad_avoid.diff(&target_chars); if batch.examples.is_empty() { batch.level = kanji_levels[target_i].0.to_string(); } else { batch.level = format!("{} + {}", batch.level, kanji_levels[target_i].0); } eprintln!("Level : {}", batch.level); eprintln!("Target: {}", target_chars.to_string()); eprintln!("Late : {}", chars_late.to_string()); eprintln!("Bad : {} characters", chars_bad.len()); stalled = false; } /* this one works well enough let cost = |ex: &Example, ex_tgt_inter: usize| { 20i32 * ex_tgt_inter as i32 + 30i32 * ex.chars.inter_len(&chars_late) as i32 + 6i32 * ex.chars.inter_len(&batch.chars) as i32 + 4i32 * ex.chars.inter_len(&chars_p1) as i32 + 3i32 * ex.chars.inter_len(&chars_p2) as i32 - 40i32 * ex.chars.inter_len(&chars_bad) as i32 }; */ let cost = |ex: &Example, ex_tgt_inter: usize| { ( -(ex.chars.inter_len(&chars_bad_avoid) as i32), ex_tgt_inter, ex.chars.inter_len(&chars_late), 2 * ex.chars.inter_len(&chars_p1) + ex.chars.inter_len(&chars_p2), -(ex.ja.chars().count() as i32), ) }; let cand_1 = examples .par_iter() .map(|ex| (ex, ex.chars.inter_len(&target_chars))) .filter(|(_, ex_tgt_inter)| { (1..=4).contains(ex_tgt_inter) && *ex_tgt_inter + batch.chars.len() <= batch_len }) .max_by_key(|(ex, ex_tgt_inter)| cost(ex, *ex_tgt_inter)); let cand = cand_1.or_else(|| { examples .par_iter() .map(|ex| (ex, ex.chars.inter_len(&target_chars))) .filter(|(_, ex_tgt_inter)| *ex_tgt_inter > 0) .max_by_key(|(ex, ex_tgt_inter)| cost(ex, *ex_tgt_inter)) }); if let Some((ex, _)) = cand { eprintln!( "* add {} (rep: {}, p1: {}, p2: {}, bad: {}) {}", ex.chars.inter(&target_chars).to_string(), ex.chars.inter(&batch.chars).to_string(), ex.chars.inter(&chars_p1).to_string(), ex.chars.inter(&chars_p2).to_string(), ex.chars.inter(&chars_bad).to_string(), ex.ja ); batch.chars = batch.chars.union(&ex.chars.inter(&target_chars)); target_chars = target_chars.diff(&ex.chars); chars_late = chars_late.diff(&ex.chars); batch.examples.push(ex.clone()); batch_chars = batch_chars.union(&ex.chars); stalled = false; } else { if stalled { eprintln!( "could not find suitable sentence, stopping batch now (need {})", need ); break; } stalled = true; } } batch.chars_p1 = chars_p1.inter(&batch_chars); batch.chars_p2 = chars_p2.inter(&batch_chars); batch.chars_bad = chars_bad.inter(&batch_chars); Ok(batch) } fn simplify_batch(batch: &mut Batch) { let mut char_cnt = HashMap::::new(); for ex in batch.examples.iter() { for ch in batch.chars.inter(&ex.chars).chars() { *char_cnt.entry(*ch).or_default() += 1; } } loop { let i_opt = batch.examples.iter().position(|ex| { batch .chars .inter(&ex.chars) .chars() .iter() .all(|x| char_cnt[x] >= 2) }); if let Some(i) = i_opt { println!( "Removing {} [{}]", batch.examples[i].ja, batch.examples[i].chars.to_string() ); batch.examples.remove(i); } else { break; } } } fn rebalance_level(level: &str, batches: &mut [Batch]) { let mut i_batch = vec![]; let mut n_ex = 0; for (i, b) in batches.iter().enumerate() { if b.level == level { i_batch.push(i); n_ex += b.examples.len(); } } if i_batch.len() < 2 { return; } println!("Level {}: {} batches, {} examples, avg {:.2}", level, i_batch.len(), n_ex, n_ex as f32 / i_batch.len() as f32); todo!() } // ===================================================================== // FORMATTING TO HTML // ===================================================================== fn format_batch<'a>(dict_idx: &DictIndex<'a>, count: usize, (i, batch): (usize, &Batch)) { format_batch_aux(dict_idx, count, i, batch).expect("format batch"); } fn format_batch_aux<'a>( dict_idx: &DictIndex<'a>, count: usize, i: usize, batch: &Batch, ) -> Result<()> { let mut f = io::BufWriter::new(fs::File::create(format!("public/{:03}.html", i))?); write!( f, r#" Batch #{:03} "#, i )?; writeln!(f, r#"

index"#)?; for j in 0..count { if j != i { writeln!(f, r#" {:03}"#, j, j)?; } else { writeln!(f, " {:03}", j)?; } } writeln!(f, r#"

"#)?; writeln!(f, "

Level: {}

", batch.level)?; writeln!( f, r#"

【{}】

"#, batch.chars.to_string() )?; for ex in batch.examples.iter() { writeln!(f, "
")?; write!(f, r#"

"#)?; for c in ex.ja.chars() { if batch.chars.contains(c) { write!(f, r#"{}"#, c)?; } else if batch.chars_p1.contains(c) { write!(f, r#"{}"#, c)?; } else if batch.chars_p2.contains(c) { write!(f, r#"{}"#, c)?; } else if batch.chars_bad.contains(c) { write!(f, r#"{}"#, c)?; } else { write!(f, "{}", c)?; } } writeln!(f, "

")?; writeln!(f, r#"

{}

"#, ex.en)?; writeln!(f, r#"
Explanation"#)?; let mut expl_batch = Vec::new(); let mut expl_all = Vec::new(); for word in ex.expl.split(|c| c == ' ' || c == '~') { let (keb, reb) = expl_clean_word(word); let wchars = Charset::new(keb); if !wchars.intersects(&ex.chars) { continue; } if let Some(ents) = dict_idx.get(keb) { for ent in ents.iter() { if let Some(s) = dict_str(keb, reb, ent) { if wchars.intersects(&batch.chars) { expl_batch.push(s); } else { expl_all.push(s); } } } } } for be in expl_batch { writeln!(f, r#"

{}

"#, be)?; } writeln!(f, r#"

"#)?; for c in ex.chars.inter(&batch.chars).chars().iter() { writeln!( f, r#"{}"#, c, c )?; } writeln!(f, r#"

"#)?; for be in expl_all { writeln!(f, r#"

{}

"#, be)?; } writeln!(f, r#"
"#)?; } write!(f, "")?; f.flush()?; Ok(()) } fn expl_clean_word(w: &str) -> (&str, Option<&str>) { let mut ret = w; for delim in ['(', '{', '['] { if let Some((s, _)) = ret.split_once(delim) { ret = s; } } let p = w .split_once('(') .and_then(|(_, r)| r.split_once(')')) .map(|(p, _)| p); (ret, p) } fn dict_str<'a>(qkeb: &str, qreb: Option<&str>, ent: &roxmltree::Node<'a, 'a>) -> Option { let r_ele = ent.children().find(|x| x.has_tag_name("r_ele")).unwrap(); let reb = r_ele.children().find(|x| x.has_tag_name("reb")).unwrap(); let reb = reb.text().unwrap().trim(); if qreb.map(|x| x != reb).unwrap_or(false) { return None; } let mut ret = format!("{} [{}]", qkeb, reb); for sense in ent.children().filter(|x| x.has_tag_name("sense")) { if let Some(s) = sense.children().find(|x| x.has_tag_name("gloss")) { ret.extend(format!(" {};", s.text().unwrap().trim()).chars()); } } if ret.chars().rev().next() == Some(';') { ret.pop(); } Some(ret) } fn format_index(batches: &[Batch], kanji_levels: &[(String, String)]) -> Result<()> { let mut f = io::BufWriter::new(fs::File::create("public/index.html")?); write!( f, r#" List of batches "# )?; writeln!(f, "")?; writeln!(f, "")?; for (i, batch) in batches.iter().enumerate() { writeln!( f, r#""#, i, i, batch.level, batch.chars.to_string(), batch.examples.len(), batch.chars_p1.union(&batch.chars_p2).to_string(), batch.chars_bad.to_string() )?; } writeln!(f, r#"
NumLevelCharsExamplesReviewIgnore
{:03}{}{}  {}{}{}
"#)?; writeln!(f, "
")?; let all_chars = Charset::from_iter( batches .iter() .map(|x| x.chars.chars().iter().copied()) .flatten(), ); writeln!(f, "")?; writeln!( f, "" )?; for (lvl, chars) in kanji_levels.iter() { if lvl == "N0+" || lvl == "N0-9" || lvl.ends_with("-10") { continue; } let chars = Charset::new(chars); let missing = chars.diff(&all_chars); writeln!( f, "", lvl, chars.len(), chars.to_string(), missing.to_string(), missing.len() )?; } writeln!(f, "
LevelCountCharsMissing chars
{}{}{}{} ({})
")?; write!(f, "")?; f.flush()?; Ok(()) }