use std::collections::HashMap; use std::fs; use std::io::{self, BufRead, Write}; use anyhow::Result; use rand::prelude::*; use rayon::prelude::*; use serde::{Deserialize, Serialize}; use structopt::StructOpt; mod charset; use charset::Charset; #[derive(Debug, StructOpt)] #[structopt(name = "datagengo", about = "Japanese example practice maker")] struct Opt { #[structopt(subcommand)] cmd: Cmd, } #[derive(Debug, StructOpt)] enum Cmd { ParseKanjidic, ParseJlptVocab, New { #[structopt(default_value = "10")] count: usize, #[structopt(long = "truncate")] truncate: Option, }, Simplify, Cleanup, AddVocab, AddExamples, Format, } fn main() { let opt = Opt::from_args(); match opt.cmd { Cmd::ParseKanjidic => { let levels = parse_kanjidic().expect("error"); for (level, chars) in levels.iter() { println!("{}: {}", level, chars.to_string()); } } Cmd::ParseJlptVocab => { let kanji_levels = read_kanji_levels().expect("read_kanji_levels"); let all_kanji = Charset::from_iter(kanji_levels.iter().map(|(_, c)| c.chars()).flatten()); parse_jlpt_vocab(&all_kanji).expect("error"); } Cmd::New { truncate, count } => { let kanji_levels = read_kanji_levels().expect("read_kanji_levels"); let all_kanji = Charset::new( kanji_levels .iter() .map(|(_, x)| x.to_string()) .collect::>() .join(""), ); let kanji_levels = kanji_levels .into_iter() .map(|(l, x)| (l, Charset::new(x))) .collect::>(); let mut ex = read_examples(&all_kanji).expect("read_examples"); ex.retain(|e| (5..=25).contains(&e.ja.chars().count())); let mut batches: Vec = fs::read("data/batches.json") .map_err(anyhow::Error::from) .and_then(|x| Ok(serde_json::from_slice(&x)?)) .unwrap_or_default(); if let Some(t) = truncate { batches.truncate(t); } println!("---- starting after {} batches ----", batches.len()); let target_len = batches.len() + count; gen_batches(&mut batches, target_len, &kanji_levels, &ex); fs::write( "data/batches.json", serde_json::to_string_pretty(&batches) .expect("serialize") .as_bytes(), ) .expect("save"); } Cmd::Simplify => { let mut batches: Vec = fs::read("data/batches.json") .map_err(anyhow::Error::from) .and_then(|x| Ok(serde_json::from_slice(&x)?)) .expect("failed to decode batches.json"); for batch in batches.iter_mut() { simplify_batch(batch); } fs::write( "data/batches.json", serde_json::to_string_pretty(&batches) .expect("serialize") .as_bytes(), ) .expect("save"); } Cmd::Cleanup => { let mut batches: Vec = fs::read("data/batches.json") .map_err(anyhow::Error::from) .and_then(|x| Ok(serde_json::from_slice(&x)?)) .expect("failed to decode batches.json"); let kanji_levels = read_kanji_levels().expect("read_kanji_levels"); let kanji_levels = kanji_levels .into_iter() .map(|(l, x)| (l, Charset::new(x))) .collect::>(); cleanup_batches(&mut batches, &kanji_levels); fs::write( "data/batches.json", serde_json::to_string_pretty(&batches) .expect("serialize") .as_bytes(), ) .expect("save"); } Cmd::AddVocab => { let mut batches: Vec = fs::read("data/batches.json") .map_err(anyhow::Error::from) .and_then(|x| Ok(serde_json::from_slice(&x)?)) .expect("failed to decode batches.json"); let jlpt_vocab = load_jlpt_vocab().expect("load_jlpt_vocab"); add_vocab(&mut batches, &jlpt_vocab); fs::write( "data/batches.json", serde_json::to_string_pretty(&batches) .expect("serialize") .as_bytes(), ) .expect("save"); } Cmd::AddExamples => { let kanji_levels = read_kanji_levels().expect("read_kanji_levels"); let all_kanji = Charset::new( kanji_levels .iter() .map(|(_, x)| x.to_string()) .collect::>() .join(""), ); let mut ex = read_examples(&all_kanji).expect("read_examples"); ex.retain(|e| (5..=25).contains(&e.ja.chars().count())); let mut batches: Vec = fs::read("data/batches.json") .map_err(anyhow::Error::from) .and_then(|x| Ok(serde_json::from_slice(&x)?)) .expect("failed to decode batches.json"); add_extra_examples(&mut batches, &ex); fs::write( "data/batches.json", serde_json::to_string_pretty(&batches) .expect("serialize") .as_bytes(), ) .expect("save"); } Cmd::Format => { let jmdict = fs::read_to_string("data/JMdict_e.xml").expect("read_jmdict"); let jmdict = roxmltree::Document::parse_with_options( &jmdict, roxmltree::ParsingOptions { allow_dtd: true, ..Default::default() }, ) .expect("parse_jmdict"); let jmdict_idx = index_jmdict(&jmdict); let batches = fs::read("data/batches.json") .map_err(anyhow::Error::from) .and_then(|x| Ok(serde_json::from_slice::>(&x)?)) .expect("read/parse"); fs::create_dir_all("public").expect("mkdir public"); fs::copy("static/style.css", "public/style.css").expect("copy style.css"); batches .iter() .enumerate() .for_each(|x| format_batch(&jmdict_idx, batches.len(), x)); let kanji_levels = read_kanji_levels().expect("read_kanji_levels"); format_index(&batches, &kanji_levels).expect("format_index"); format_about().expect("format_about"); } } } // ===================================================================== // PARSING DATA FILES // ===================================================================== type DictIndex<'a> = HashMap<&'a str, Vec>>; fn index_jmdict<'a>(dict: &'a roxmltree::Document) -> DictIndex<'a> { let dict = dict .root() .children() .find(|x| x.has_tag_name("JMdict")) .unwrap(); let mut ret: DictIndex<'a> = HashMap::new(); for x in dict.children().filter(|x| x.has_tag_name("entry")) { for r in x.children().filter(|x| x.has_tag_name("k_ele")) { if let Some(keb) = r.children().find(|x| x.has_tag_name("keb")) { let txt = keb.text().unwrap().trim(); ret.entry(txt).or_default().push(x); } } } ret } fn parse_kanjidic() -> Result> { let n3_kanji = Charset::new(&fs::read_to_string("data/n3_kanji.txt")?.trim()); let file = fs::read_to_string("data/kanjidic2.xml")?; let xml = roxmltree::Document::parse(&file)?; let kanjidic = xml.root().first_child().unwrap(); assert!(kanjidic.has_tag_name("kanjidic2")); let mut levels = HashMap::new(); for x in kanjidic.children() { if !x.has_tag_name("character") { continue; } let mut literal = None; let mut jlpt = None; let mut grade = None; for y in x.children() { if y.has_tag_name("literal") { literal = y.text(); } if y.has_tag_name("misc") { for z in y.children() { if z.has_tag_name("jlpt") { jlpt = z.text().and_then(|x| str::parse::(x).ok()); } if z.has_tag_name("grade") { grade = z.text().and_then(|x| str::parse::(x).ok()); } } } } match grade { Some(i) if i <= 6 => grade = Some(7), _ => (), } if let Some(lit) = literal { assert_eq!(lit.chars().count(), 1); let jlpt = match jlpt { Some(4) => Some(5), Some(3) => Some(4), Some(2) if n3_kanji.contains(lit.chars().next().unwrap()) => Some(3), x => x, }; levels .entry((jlpt, grade)) .or_insert(String::new()) .extend(lit.chars()); } } let mut levels = levels.into_iter().collect::>(); levels.sort_by_key(|((j, g), _)| match (j, g) { (Some(j), Some(g)) => (10 - *j) * 20 + *g, (Some(j), None) => (10 - *j) * 20 + 15, (None, Some(g)) => 1000 + *g, (None, None) => 1015, }); let mut ret = Vec::new(); let mut pc = Charset::default(); for ((j, g), chars) in levels.into_iter() { let name = match (j, g) { (Some(j), Some(7)) => format!("N{}a", j), (Some(j), Some(8)) => format!("N{}b", j), (Some(j), Some(g)) => format!("N{}-{}", j, g), (Some(j), None) => format!("N{}+", j), (None, Some(7)) => format!("N0a"), (None, Some(8)) => format!("N0b"), (None, Some(g)) => format!("N0-{}", g), (None, None) => format!("N0+"), }; let chars = Charset::new(chars).diff(&pc); pc = pc.union(&chars); ret.push((name, chars)); } Ok(ret) } fn read_kanji_levels() -> Result> { Ok(fs::read_to_string("data/kanji_levels.txt")? .lines() .filter_map(|l| l.split_once(": ")) .map(|(l, k)| (l.to_string(), k.to_string())) .collect::>()) } fn read_examples(all_kanji: &Charset) -> Result> { let file = fs::File::open("data/examples.utf")?; let mut ret = Vec::new(); let mut a = "".to_string(); for (i, line) in io::BufReader::new(file).lines().enumerate() { let line = line?; if line.starts_with("A:") { a = line; } else if line.starts_with("B:") { let s = a.strip_prefix("A: "); let t = line.strip_prefix("B: "); if let (Some(a), Some(b)) = (s, t) { if let Some((ja, eng)) = a.split_once("\t") { if let Some((eng, id)) = eng.split_once("#") { ret.push(Example { ja: ja.to_string(), en: eng.to_string(), expl: b.to_string(), id: Some(id.to_string()), chars: Charset::new(ja).inter(all_kanji), }); } else { ret.push(Example { ja: ja.to_string(), en: eng.to_string(), expl: b.to_string(), id: None, chars: Charset::new(ja).inter(all_kanji), }); } } } } if i % 10000 == 0 { eprintln!("read examples: {}/300 (x1000)", i / 1000); } } Ok(ret) } #[derive(Clone, Debug, Serialize, Deserialize, PartialEq)] struct JlptVocab { level: String, chars: Charset, kanji: String, kana: String, en: String, } impl JlptVocab { fn to_string(&self) -> String { format!( "{}\t{}\t{}\t{}\t{}", self.level, self.chars.to_string(), self.kanji, self.kana, self.en ) } } fn parse_jlpt_vocab(all_kanji: &Charset) -> Result<()> { let mut vocab = vec![]; vocab.extend(parse_jlpt_vocab_combined( "data/n5_vocab.txt", "N5", all_kanji, )?); vocab.extend(parse_jlpt_vocab_split( "data/n4_vocab_hiragana.txt", "data/n4_vocab_eng.txt", "N4", all_kanji, )?); vocab.extend(parse_jlpt_vocab_split( "data/n3_vocab_hiragana.txt", "data/n3_vocab_eng.txt", "N3", all_kanji, )?); vocab.extend(parse_jlpt_vocab_split( "data/n2_vocab_hiragana.txt", "data/n2_vocab_eng.txt", "N2", all_kanji, )?); vocab.extend(parse_jlpt_vocab_split( "data/n1_vocab_hiragana.txt", "data/n1_vocab_eng.txt", "N1", all_kanji, )?); for v in vocab.iter() { println!("{}", v.to_string()); } Ok(()) } fn parse_jlpt_vocab_combined( file: &str, level: &str, all_kanji: &Charset, ) -> Result> { let lines = jlpt_vocab_read_file(file)?; let mut ret = vec![]; for (kanji, answer) in lines { let (eng, kana) = match answer.split_once('\n') { Some((a, b)) => (a, b.trim()), None => (answer.trim(), ""), }; for kanji in kanji.split('/') { ret.push(JlptVocab { level: level.to_string(), chars: Charset::new(kanji).inter(all_kanji), kanji: kanji.to_string(), kana: kana.to_string(), en: eng.to_string(), }); } } Ok(ret) } fn parse_jlpt_vocab_split( kana_file: &str, eng_file: &str, level: &str, all_kanji: &Charset, ) -> Result> { let eng_lines = jlpt_vocab_read_file(eng_file)? .into_iter() .collect::>(); let lines = jlpt_vocab_read_file(kana_file)?; let mut ret = vec![]; for (kanji, kana) in lines { let eng = eng_lines.get(&kanji).or(eng_lines.get(&kana)); if let Some(eng) = eng { for kanji in kanji.split('/') { ret.push(JlptVocab { level: level.to_string(), chars: Charset::new(kanji).inter(all_kanji), kanji: kanji.to_string(), kana: kana.to_string(), en: eng.to_string(), }); } } } Ok(ret) } fn jlpt_vocab_read_file(file: &str) -> Result> { let re = regex::Regex::new(r#""#)?; let file = fs::File::open(file)?; let mut ret = vec![]; for line in io::BufReader::new(file).lines() { let line = line?.replace("
", "\n").replace("
", ""); let line = re.replace_all(&line, ""); if let Some((a, b)) = line.split_once('|') { ret.push((a.trim().to_string(), b.trim().to_string())); } } Ok(ret) } fn load_jlpt_vocab() -> Result> { let file = fs::File::open("data/jlpt_vocab.txt")?; let mut ret = vec![]; for line in io::BufReader::new(file).lines() { let line = line?; let line = line.splitn(5, "\t").collect::>(); if line.len() == 5 { ret.push(JlptVocab { level: line[0].to_string(), chars: Charset::new(line[1]), kanji: line[2].to_string(), kana: line[3].to_string(), en: line[4].to_string(), }); } } Ok(ret) } // ===================================================================== // BATCH STRUCTURES AND GENERATION // ===================================================================== const CHARS_PER_BATCH: usize = 20; const MAX_NEW_CHARS_PER_EX: usize = 5; #[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] struct Example { ja: String, en: String, expl: String, id: Option, chars: Charset, } #[derive(Debug, Clone, Serialize, Deserialize, Default, PartialEq)] struct Batch { level: String, chars: Charset, chars_p1: Charset, chars_p2: Charset, chars_bad: Charset, examples: Vec, #[serde(default)] extra_vocab: Vec, #[serde(default)] extra_examples: Vec, } fn gen_batches( batches: &mut Vec, target_len: usize, kanji_levels: &[(String, Charset)], examples: &[Example], ) { let mut remainder = None; while batches.len() < target_len { let done = Charset::from_iter( batches .iter() .map(|x| x.chars.chars().iter().copied()) .flatten(), ); let remainder_chars = remainder .as_ref() .map(|x: &Batch| x.chars.clone()) .unwrap_or_default(); let remainder_before = remainder.clone(); let len_before = batches.len(); let mut advanced = false; for (i, (level, level_chars)) in kanji_levels.iter().enumerate() { let diff = level_chars.diff(&done).diff(&remainder_chars); if !diff.is_empty() { let avoid = Charset::from_iter( kanji_levels .iter() .skip(i + 1) .filter(|(l, _)| !l.ends_with("-9") && !l.ends_with("-10")) .map(|(_, c)| c.chars().iter().copied()) .flatten(), ); let level_examples = level_examples(&diff, &avoid, examples); let level_new_chars = Charset::from_iter( level_examples .iter() .map(|x| x.chars.chars().iter().copied()) .flatten(), ) .inter(&diff); println!( "- {} ({} chars): {} done previously, {} diff, {} ex, {} new chars", level, level_chars.len(), done.len(), diff.len(), level_examples.len(), level_new_chars.len() ); if !level_examples.is_empty() { assert!(!level_new_chars.is_empty()); remainder = gen_level( batches, level, &level_new_chars, &done, level_examples, remainder, ); advanced = true; break; } } } if let Some(r) = &remainder { assert!(r.examples.len() <= 20); } if advanced && batches.len() == len_before && remainder == remainder_before { // restart level with new rng let last_level = batches.last().unwrap().level.to_string(); println!("RESTARTING LEVEL {}, hopefully new RNG", last_level); while batches .last() .map(|x| x.level == last_level) .unwrap_or(false) { batches.pop(); remainder = None; } } else if !advanced { break; } } if let Some(r) = remainder { if batches.len() < target_len { batches.push(r); } } } fn gen_level( batches: &mut Vec, level: &str, new_chars: &Charset, prev_done: &Charset, mut examples: Vec<&Example>, mut remainder: Option, ) -> Option { examples.shuffle(&mut thread_rng()); let remainder_chars = remainder.as_ref().map(|x| x.chars.len()).unwrap_or(0); println!( "Level {}: {} characters using {} examples, remainder has {} chars and {} examples", level, new_chars.len(), examples.len(), remainder_chars, remainder.as_ref().map(|x| x.examples.len()).unwrap_or(0), ); let avg_len = examples.len() as f32 * CHARS_PER_BATCH as f32 / new_chars.len() as f32; let mut batch_count = 0; let mut sum_len = 0; let mut done = prev_done.union( remainder .as_ref() .map(|x| &x.chars) .unwrap_or(&Charset::default()), ); loop { println!("iter with {} examples", examples.len()); let mut batch = remainder.take().unwrap_or_else(|| Batch { level: level.to_string(), ..Default::default() }); let remaining_chars = new_chars.diff(&done); let todo_chars = CHARS_PER_BATCH - batch.chars.len(); if remaining_chars.len() <= todo_chars { for ex in examples.iter() { batch.examples.push((*ex).clone()); batch.chars = batch.chars.union(&ex.chars.diff(&done).inter(&new_chars)); } if batch.chars.len() == CHARS_PER_BATCH { println!( "-> all remaining examples sum up to exaclty {} chars", CHARS_PER_BATCH ); batches.push(batch); return None; } else if batch.examples.is_empty() { assert!(batch.chars.is_empty()); println!("-> done"); return None; } else { assert!(batch.chars.len() < CHARS_PER_BATCH); println!( "-> with all remaining examples, cannot make a full batch, only {} chars", batch.chars.len() ); return Some(batch); } } assert!(!examples.is_empty()); println!( "Trying to add exactly {} characters, using {} examples containing {} new chars", todo_chars, examples.len(), remaining_chars.len() ); // Compute dynamic algorithm matrix with a bunch of combinations that add `todo_chars` let mut dyn_mat: Vec)>>> = vec![]; for ex in examples.iter() { let mut dyn_row = vec![None; todo_chars + 1]; let chars_common = ex.chars.inter(&new_chars).diff(&done); if chars_common.len() > MAX_NEW_CHARS_PER_EX { dyn_mat.push(dyn_row); continue; } if chars_common.len() < dyn_row.len() { dyn_row[chars_common.len()] = Some((chars_common.clone(), None)); } for (i, dyn_prev) in dyn_mat.iter().enumerate() { for (j, dpr) in dyn_prev.iter().enumerate() { if let Some((chars_inter, _prev)) = dpr { assert_eq!(chars_inter.len(), j); let new_chars_common = chars_inter.union(&chars_common); let new_chars_common_len = new_chars_common.len(); if new_chars_common_len > chars_inter.len() && new_chars_common_len <= todo_chars { dyn_row[new_chars_common_len] = Some((new_chars_common, Some((i, j)))); } } } } dyn_mat.push(dyn_row); } // Find combination that does that with a good number of examples (tgt_len) let tgt_len = (avg_len * (batch_count as f32 + 1.)).ceil() as i64 - (sum_len + batch.examples.len()) as i64; let dyn_mat_cnt = |i| { let mut cnt = 0; let mut i: usize = i; let mut j: usize = todo_chars; loop { match &dyn_mat[i][j] { None => return None, Some((_, ij_prev)) => { cnt += 1; match ij_prev { Some((iprev, jprev)) => { i = *iprev; j = *jprev; } None => return Some(cnt), } } } } }; let i_opt = (0..dyn_mat.len()) .filter_map(|pos| dyn_mat_cnt(pos).map(|cnt| (pos, cnt))) .min_by_key(|(_, cnt)| { let x = *cnt as i64 - tgt_len; x * x }); let i = match i_opt { None => { println!( "WARNING: cannot make exactly {} chars, interrupting", todo_chars ); return None; } Some((pos, _)) => pos, }; // Take all examples from that combination and add them to current batch let (mut i, mut j) = (i, todo_chars); loop { match &dyn_mat[i][j] { None => panic!("dyn_mat[{}][{}] == None", i, j), Some((chars, ij_prev)) => { println!( "Add {}: {}", examples[i].chars.inter(&chars).to_string(), examples[i].ja ); batch.examples.push(examples[i].clone()); examples.remove(i); batch.chars = batch.chars.union(&chars); match ij_prev { Some((iprev, jprev)) => { assert!(*iprev < i); i = *iprev; j = *jprev; } None => break, } } } } assert_eq!(batch.chars.len(), CHARS_PER_BATCH); println!( "-> batch {:03}: {} with {} examples", batches.len(), batch.chars.to_string(), batch.examples.len() ); batch_count += 1; done = done.union(&batch.chars); sum_len += batch.examples.len(); batches.push(batch); } } fn level_examples<'a>( chars: &Charset, avoid: &Charset, all_examples: &'a [Example], ) -> Vec<&'a Example> { println!("Calculating examples for {}", chars.to_string()); let mut todo = chars.clone(); let mut bad = Charset::default(); let mut examples = vec![]; let cost = |ex: &Example, ex_todo_inter: usize, ex_chars_inter: usize| { ( -(ex.chars.inter_len(&avoid) as i32), ex_todo_inter, ex_chars_inter, -(ex.ja.chars().count() as i32), ex.chars.len() + thread_rng().gen_range(0..5), ) }; let mut all_with_inter = all_examples .par_iter() .map(|ex| (ex, ex.chars.inter_len(&chars))) .map(|(ex, ex_chars_inter)| (ex, ex_chars_inter, ex_chars_inter)) .collect::>(); while !todo.is_empty() { let best = all_with_inter .par_iter() .enumerate() .filter(|(_, (_, ex_todo_inter, _))| *ex_todo_inter > 0) //.filter(|(_, (_, _, ex_tgt_inter))| (1..=8).contains(ex_tgt_inter)) .max_by_key(|(_, (ex, ex_todo_inter, ex_chars_inter))| { cost(*ex, *ex_todo_inter, *ex_chars_inter) }); if let Some((i, (ex, ex_todo_inter, _))) = best { let ex = *ex; assert_eq!(*ex_todo_inter, ex.chars.inter(&todo).len()); examples.push(ex); all_with_inter.remove(i); todo = todo.diff(&ex.chars); bad = bad.union(&ex.chars.inter(&avoid)); all_with_inter .par_iter_mut() .for_each(|(ex2, ex_todo_inter, _)| { if ex2.chars.inter_len(&ex.chars) > 0 { *ex_todo_inter = ex2.chars.inter_len(&todo); } }); } else { break; } } if !todo.is_empty() { println!("MISSING: NO SENTENCES FOR {}", todo.to_string()); } if !bad.is_empty() { println!("USED BAD CHARS: {}", bad.to_string()); } examples } fn simplify_batch(batch: &mut Batch) { let mut char_cnt = HashMap::::new(); for ex in batch.examples.iter() { for ch in batch.chars.inter(&ex.chars).chars() { *char_cnt.entry(*ch).or_default() += 1; } } loop { let i_opt = batch.examples.iter().position(|ex| { batch .chars .inter(&ex.chars) .chars() .iter() .all(|x| char_cnt[x] >= 2) }); if let Some(i) = i_opt { println!( "Removing {} [{}]", batch.examples[i].ja, batch.examples[i].chars.to_string() ); batch.examples.remove(i); } else { break; } } } fn cleanup_batches(all_batches: &mut [Batch], kanji_levels: &[(String, Charset)]) { let mut chars_p1 = Charset::default(); let mut chars_p2 = Charset::default(); let mut done = Charset::default(); for batch in all_batches.iter_mut() { let all_chars = Charset::from_iter( batch .examples .iter() .map(|x| x.chars.chars().iter().copied()) .flatten(), ); let mut levels = kanji_levels .iter() .filter(|(_, chars)| chars.inter_len(&batch.chars) > 0) .map(|(lvl, _)| lvl.to_string()) .collect::>(); while levels.len() > 2 { levels.remove(1); } batch.level = levels.join("/"); done = done.union(&batch.chars); batch.chars_bad = all_chars.diff(&done); batch.chars_p1 = all_chars.inter(&chars_p1); batch.chars_p2 = all_chars.inter(&chars_p2); chars_p2 = chars_p1; chars_p1 = batch.chars.clone(); } } fn add_vocab(all_batches: &mut [Batch], vocab: &[JlptVocab]) { let match_level = |batch: &Batch, level: &str| { let n5 = batch.level.contains("N5"); let n4 = batch.level.contains("N4"); let n3 = batch.level.contains("N3"); let n2 = batch.level.contains("N2"); let n1 = batch.level.contains("N1"); let n0 = batch.level.contains("N0"); match level { "N5" => n5 || n4 || n3 || n2 || n1 || n0, "N4" => n4 || n3 || n2 || n1 || n0, "N3" => n3 || n2 || n1 || n0, "N2" => n2 || n1 || n0, "N1" => n1 || n0, "N0" => n0, _ => panic!("invalid vocab level {}", level), } }; let mut done = Charset::default(); let mut extra_vocab = vec![]; for (i, batch) in all_batches.iter().enumerate() { let done_after = done.union(&batch.chars); let batch_extra_vocab = vocab .iter() .filter(|v| v.chars.inter_len(&batch.chars) > 0) .filter(|v| match_level(batch, &v.level)) .filter(|v| v.chars.diff(&done_after).len() == 0) .filter(|v| { !all_batches[i..std::cmp::min(all_batches.len(), i + 10)] .iter() .any(|b| { b.examples .iter() .any(|ex| ex.ja.contains(&v.kanji) || ex.expl.contains(&v.kanji)) }) }) .cloned() .collect::>(); extra_vocab.push(batch_extra_vocab); println!("---- BATCH #{:03} ----", i); for v in batch.extra_vocab.iter() { println!("{}", v.to_string()); } done = done_after; } for (batch, vocab) in all_batches.iter_mut().zip(extra_vocab.into_iter()) { batch.extra_vocab = vocab; } } fn add_extra_examples(all_batches: &mut [Batch], examples: &[Example]) { let mut chars = Charset::default(); let mut char_seen_count: HashMap = HashMap::new(); for (i, batch) in all_batches.iter_mut().enumerate() { println!("---- BATCH #{:03} ----", i); chars = chars.union(&batch.chars); // Count characters in batch in char_seen_count as a lot for ex in batch.examples.iter() { for c in ex.chars.iter() { *char_seen_count.entry(c).or_default() += 5; } } // Take only examples that: // - contain kanji of this batch // - only contain kanji of this or previous batches // - are not in the batch's main example sentences let candidates = examples .iter() .filter(|x| x.chars.inter_len(&batch.chars) > 0) .filter(|x| x.chars.diff(&chars).len() == 0) .filter(|x| batch.examples.iter().all(|y| y.ja != x.ja)); // Take only one candidate sentence for each possible set of represented kanji let mut cand_by_chars = HashMap::new(); for c in candidates { cand_by_chars.insert(c.chars.to_string(), c.clone()); } let mut candidates = cand_by_chars .into_iter() .map(|(_, ex)| ex) .collect::>(); // Sorte candidates in a deterministic random order candidates.sort_by_key(|ex| fasthash::metro::hash64(ex.ja.as_bytes())); batch.extra_examples.clear(); let mut batch_char_seen_count: HashMap = HashMap::new(); let mut in_batch = Charset::from_iter(batch.examples.iter().map(|x| x.chars.iter()).flatten()); let mut in_batch_extra = Charset::default(); while batch.extra_examples.len() < 40 { let batch_min_seen = batch .chars .iter() .map(|x| batch_char_seen_count.get(&x).copied().unwrap_or(0)) .min() .unwrap(); // Target chars: chars of the batch that have the less examples let c0 = Charset::from_iter(batch.chars.iter().filter(|x| { batch_char_seen_count.get(x).copied().unwrap_or(0) == batch_min_seen })); // Target chars: chars that have been seen less than cnt times let fc = |cnt| { Charset::from_iter( chars .iter() .filter(|x| char_seen_count.get(x).copied().unwrap_or(0) <= cnt), ) }; let c1 = fc(5); let c2 = fc(6); let c3 = fc(7); let c4 = fc(10); let best = candidates .iter() .enumerate() .filter(|(_, ex)| { batch.extra_examples.len() < 20 || ex.chars.diff(&in_batch_extra).len() > 0 }) .map(|(i, ex)| { let weight = ( ex.chars.inter_len(&c0), ex.chars.inter_len(&c1), ex.chars.inter_len(&c2), ex.chars.inter_len(&c3), ex.chars.inter_len(&c4), ex.chars.diff(&in_batch_extra).len(), ); (i, ex, weight) }) .max_by_key(|(_, _, w)| *w); if let Some((i, ex, w)) = best { println!("{:?}\t{} - {}", w, ex.ja, ex.en); batch.extra_examples.push(ex.clone()); in_batch = in_batch.union(&ex.chars); in_batch_extra = in_batch_extra.union(&ex.chars); for c in ex.chars.iter() { *char_seen_count.entry(c).or_default() += 1; if batch.chars.contains(c) { *batch_char_seen_count.entry(c).or_default() += 1; } } candidates.remove(i); } else { break; } } batch .extra_examples .sort_by_key(|ex| fasthash::metro::hash64(ex.ja.as_bytes())); for i in 1..20 { println!( "Seen {:02}: {}", i, char_seen_count.iter().filter(|(_, v)| **v == i).count() ); } println!( "Seen more: {}", char_seen_count.iter().filter(|(_, v)| **v >= 20).count() ); } } // ===================================================================== // FORMATTING TO HTML // ===================================================================== fn format_batch<'a>(dict_idx: &DictIndex<'a>, count: usize, (i, batch): (usize, &Batch)) { format_batch_aux(dict_idx, count, i, batch).expect("format batch"); } fn format_batch_aux<'a>( dict_idx: &DictIndex<'a>, count: usize, i: usize, batch: &Batch, ) -> Result<()> { let mut f = io::BufWriter::new(fs::File::create(format!("public/{:03}.html", i))?); write!( f, r#" Batch #{:03}
"#, i )?; writeln!(f, r#"

index"#)?; for j in 0..count { if j != i { writeln!(f, r#" {:03}"#, j, j)?; } else { writeln!(f, " {:03}", j)?; } } writeln!(f, r#"

"#)?; writeln!(f, "

Level: {}

", batch.level)?; write!(f, r#"

"#)?; let mut ex_prev = Charset::default(); for ex in batch.examples.iter() { let ex_chars = ex.chars.inter(&batch.chars); for c in ex_chars.diff(&ex_prev).chars().iter() { write!( f, r#"{}"#, c, c )?; } ex_prev = ex_prev.union(&ex_chars); } writeln!(f, r#"

"#)?; for ex in batch.examples.iter() { writeln!(f, "
")?; write!(f, r#"

"#)?; for c in ex.ja.chars() { if batch.chars.contains(c) { write!(f, r#"{}"#, c)?; } else if batch.chars_p1.contains(c) { write!(f, r#"{}"#, c)?; } else if batch.chars_p2.contains(c) { write!(f, r#"{}"#, c)?; } else if batch.chars_bad.contains(c) { write!(f, r#"{}"#, c)?; } else { write!(f, "{}", c)?; } } writeln!(f, "

")?; writeln!(f, r#"

{}

"#, ex.en)?; writeln!(f, r#"
Explanation"#)?; let mut expl_batch = Vec::new(); let mut expl_all = Vec::new(); for word in ex.expl.split(|c| c == ' ' || c == '~') { let (keb, reb) = expl_clean_word(word); let wchars = Charset::new(keb); if !wchars.intersects(&ex.chars) { continue; } if let Some(ents) = dict_idx.get(keb) { for ent in ents.iter() { if let Some(s) = dict_str(keb, reb, ent) { if wchars.intersects(&batch.chars) { expl_batch.push(s); } else { expl_all.push(s); } } } } } for be in expl_batch { writeln!(f, r#"

{}

"#, be)?; } writeln!(f, r#"

"#)?; for c in ex.chars.inter(&batch.chars).chars().iter() { writeln!( f, r#"{}"#, c, c )?; } writeln!(f, r#"

"#)?; for be in expl_all { writeln!(f, r#"

{}

"#, be)?; } writeln!(f, r#"
"#)?; } writeln!(f, "
")?; format_vocab( &mut f, &batch .extra_vocab .iter() .filter(|v| batch.level.contains(&v.level)) .collect::>(), "Extra vocabulary (this level)", )?; format_vocab( &mut f, &batch .extra_vocab .iter() .filter(|v| !batch.level.contains(&v.level)) .collect::>(), "Extra vocabulary (previous levels)", )?; writeln!( f, r#"
Extra examples (reading practice)"# )?; for ex in batch.extra_examples.iter() { let mut expl1 = Vec::new(); let mut expl2 = Vec::new(); for word in ex.expl.split(|c| c == ' ' || c == '~') { let (keb, reb) = expl_clean_word(word); let wchars = Charset::new(keb); if !wchars.intersects(&ex.chars) { continue; } if let Some(ents) = dict_idx.get(keb) { for ent in ents.iter() { if let Some(s) = dict_str_short(keb, reb, ent) { if wchars.intersects(&batch.chars) { expl1.push(s); } else { expl2.push(s); } } } } } expl1.extend(expl2.into_iter()); let expl = expl1.join("
"); writeln!( f, r#""#, ex.ja, ex.en, expl )?; } writeln!(f, r#"
  {}  
{}
{}
"#)?; writeln!(f, "
")?; writeln!(f, "

\(≧▽≦)/

")?; write!(f, "
")?; f.flush()?; Ok(()) } fn format_vocab(f: &mut impl Write, vocab: &[&JlptVocab], t: &str) -> Result<()> { if !vocab.is_empty() { writeln!( f, r#"
{}"#, t )?; for v in vocab { writeln!( f, r#""#, v.level, v.kanji, v.en, v.kana )?; } writeln!(f, "
{}  {}  {}{}
")?; } Ok(()) } fn expl_clean_word(w: &str) -> (&str, Option<&str>) { let mut ret = w; for delim in ['(', '{', '['] { if let Some((s, _)) = ret.split_once(delim) { ret = s; } } let p = w .split_once('(') .and_then(|(_, r)| r.split_once(')')) .map(|(p, _)| p); (ret, p) } fn dict_str_short<'a>( qkeb: &str, qreb: Option<&str>, ent: &roxmltree::Node<'a, 'a>, ) -> Option { let r_ele = ent.children().find(|x| x.has_tag_name("r_ele")).unwrap(); let reb = r_ele.children().find(|x| x.has_tag_name("reb")).unwrap(); let reb = reb.text().unwrap().trim(); if qreb.map(|x| x != reb).unwrap_or(false) { return None; } Some(format!( r#"{} 【{}】"#, qkeb, reb )) } fn dict_str<'a>(qkeb: &str, qreb: Option<&str>, ent: &roxmltree::Node<'a, 'a>) -> Option { let mut ret = dict_str_short(qkeb, qreb, ent)?; for sense in ent.children().filter(|x| x.has_tag_name("sense")) { if let Some(s) = sense.children().find(|x| x.has_tag_name("gloss")) { ret.extend(format!(" {};", s.text().unwrap().trim()).chars()); } } if ret.chars().rev().next() == Some(';') { ret.pop(); } Some(ret) } fn format_index(batches: &[Batch], kanji_levels: &[(String, String)]) -> Result<()> { let mut f = io::BufWriter::new(fs::File::create("public/index.html")?); write!( f, r#" List of batches
"# )?; writeln!(f, r#"

About / How-to


"#)?; writeln!(f, "")?; writeln!(f, "")?; for (i, batch) in batches.iter().enumerate() { writeln!( f, r#""#, i, i, batch.level, batch.chars.to_string(), batch.examples.len(), batch.chars_p1.to_string(), batch.chars_p2.to_string(), batch.chars_bad.to_string() )?; } writeln!(f, r#"
NumLevelKanjiExamplesLesson-1Lesson-2Ignore
{:03}{}{}  {}{}{}{}
"#)?; writeln!(f, "
")?; let all_chars = Charset::from_iter( batches .iter() .map(|x| x.chars.chars().iter().copied()) .flatten(), ); writeln!(f, "")?; writeln!( f, r#""# )?; for (lvl, chars) in kanji_levels.iter() { if lvl == "N0+" || lvl.ends_with("-10") { continue; } let chars = Charset::new(chars); let missing = chars.diff(&all_chars); writeln!( f, r#""#, lvl, chars.len(), chars.to_string(), missing.to_string(), missing.len() )?; } writeln!(f, "
LevelCountKanjiMissing kanji
{}{}{}{} ({})
")?; write!(f, "
")?; f.flush()?; Ok(()) } fn format_about() -> Result<()> { let mut f = io::BufWriter::new(fs::File::create("public/about.html")?); write!( f, r#" Datagengo README "# )?; writeln!(f, r#"
"#)?; writeln!( f, r#"

Back to lessons


"# )?; writeln!( f, "{}", markdown::to_html(&fs::read_to_string("README.md")?) )?; writeln!(f, r#"
"#)?; Ok(()) }