use std::collections::HashMap;
use std::fs;
use std::io::{self, BufRead, Write};
use anyhow::Result;
use rand::prelude::*;
use rayon::prelude::*;
use serde::{Deserialize, Serialize};
use structopt::StructOpt;
mod charset;
use charset::Charset;
#[derive(Debug, StructOpt)]
#[structopt(name = "datagengo", about = "Japanese example practice maker")]
struct Opt {
#[structopt(subcommand)]
cmd: Cmd,
}
#[derive(Debug, StructOpt)]
enum Cmd {
ParseKanjidic,
ParseJlptVocab,
New {
#[structopt(default_value = "10")]
count: usize,
#[structopt(long = "truncate")]
truncate: Option<usize>,
},
Simplify,
Cleanup,
AddVocab,
Format,
}
fn main() {
let opt = Opt::from_args();
match opt.cmd {
Cmd::ParseKanjidic => {
let levels = parse_kanjidic().expect("error");
for (level, chars) in levels.iter() {
println!("{}: {}", level, chars.to_string());
}
}
Cmd::ParseJlptVocab => {
let kanji_levels = read_kanji_levels().expect("read_kanji_levels");
let all_kanji = Charset::from_iter(kanji_levels
.iter()
.map(|(_, c)| c.chars())
.flatten());
parse_jlpt_vocab(&all_kanji).expect("error");
}
Cmd::New { truncate, count } => {
let kanji_levels = read_kanji_levels().expect("read_kanji_levels");
let all_kanji = Charset::new(
kanji_levels
.iter()
.map(|(_, x)| x.to_string())
.collect::<Vec<_>>()
.join(""),
);
let kanji_levels = kanji_levels
.into_iter()
.map(|(l, x)| (l, Charset::new(x)))
.collect::<Vec<_>>();
let mut ex = read_examples(&all_kanji).expect("read_examples");
ex.retain(|e| (5..=25).contains(&e.ja.chars().count()));
let mut batches: Vec<Batch> = fs::read("data/batches.json")
.map_err(anyhow::Error::from)
.and_then(|x| Ok(serde_json::from_slice(&x)?))
.unwrap_or_default();
if let Some(t) = truncate {
batches.truncate(t);
}
println!("---- starting after {} batches ----", batches.len());
let target_len = batches.len() + count;
gen_batches(&mut batches, target_len, &kanji_levels, &ex);
fs::write(
"data/batches.json",
serde_json::to_string_pretty(&batches)
.expect("serialize")
.as_bytes(),
)
.expect("save");
}
Cmd::Simplify => {
let mut batches: Vec<Batch> = fs::read("data/batches.json")
.map_err(anyhow::Error::from)
.and_then(|x| Ok(serde_json::from_slice(&x)?))
.unwrap_or_default();
for batch in batches.iter_mut() {
simplify_batch(batch);
}
fs::write(
"data/batches.json",
serde_json::to_string_pretty(&batches)
.expect("serialize")
.as_bytes(),
)
.expect("save");
}
Cmd::Cleanup => {
let mut batches: Vec<Batch> = fs::read("data/batches.json")
.map_err(anyhow::Error::from)
.and_then(|x| Ok(serde_json::from_slice(&x)?))
.unwrap_or_default();
let kanji_levels = read_kanji_levels().expect("read_kanji_levels");
let kanji_levels = kanji_levels
.into_iter()
.map(|(l, x)| (l, Charset::new(x)))
.collect::<Vec<_>>();
cleanup_batches(&mut batches, &kanji_levels);
fs::write(
"data/batches.json",
serde_json::to_string_pretty(&batches)
.expect("serialize")
.as_bytes(),
)
.expect("save");
}
Cmd::AddVocab => {
let mut batches: Vec<Batch> = fs::read("data/batches.json")
.map_err(anyhow::Error::from)
.and_then(|x| Ok(serde_json::from_slice(&x)?))
.unwrap_or_default();
let jlpt_vocab = load_jlpt_vocab().expect("load_jlpt_vocab");
add_vocab(&mut batches, &jlpt_vocab);
fs::write(
"data/batches.json",
serde_json::to_string_pretty(&batches)
.expect("serialize")
.as_bytes(),
)
.expect("save");
}
Cmd::Format => {
let jmdict = fs::read_to_string("data/JMdict_e.xml").expect("read_jmdict");
let jmdict = roxmltree::Document::parse_with_options(
&jmdict,
roxmltree::ParsingOptions {
allow_dtd: true,
..Default::default()
},
)
.expect("parse_jmdict");
let jmdict_idx = index_jmdict(&jmdict);
let batches = fs::read("data/batches.json")
.map_err(anyhow::Error::from)
.and_then(|x| Ok(serde_json::from_slice::<Vec<Batch>>(&x)?))
.expect("read/parse");
fs::create_dir_all("public").expect("mkdir public");
fs::copy("static/style.css", "public/style.css").expect("copy style.css");
batches
.iter()
.enumerate()
.for_each(|x| format_batch(&jmdict_idx, batches.len(), x));
let kanji_levels = read_kanji_levels().expect("read_kanji_levels");
format_index(&batches, &kanji_levels).expect("format_index");
}
}
}
// =====================================================================
// PARSING DATA FILES
// =====================================================================
type DictIndex<'a> = HashMap<&'a str, Vec<roxmltree::Node<'a, 'a>>>;
fn index_jmdict<'a>(dict: &'a roxmltree::Document) -> DictIndex<'a> {
let dict = dict
.root()
.children()
.find(|x| x.has_tag_name("JMdict"))
.unwrap();
let mut ret: DictIndex<'a> = HashMap::new();
for x in dict.children().filter(|x| x.has_tag_name("entry")) {
for r in x.children().filter(|x| x.has_tag_name("k_ele")) {
if let Some(keb) = r.children().find(|x| x.has_tag_name("keb")) {
let txt = keb.text().unwrap().trim();
ret.entry(txt).or_default().push(x);
}
}
}
ret
}
fn parse_kanjidic() -> Result<Vec<(String, Charset)>> {
let file = fs::read_to_string("data/kanjidic2.xml")?;
let xml = roxmltree::Document::parse(&file)?;
let kanjidic = xml.root().first_child().unwrap();
assert!(kanjidic.has_tag_name("kanjidic2"));
let mut levels = HashMap::new();
for x in kanjidic.children() {
if !x.has_tag_name("character") {
continue;
}
let mut literal = None;
let mut jlpt = None;
let mut grade = None;
for y in x.children() {
if y.has_tag_name("literal") {
literal = y.text();
}
if y.has_tag_name("misc") {
for z in y.children() {
if z.has_tag_name("jlpt") {
jlpt = z.text().and_then(|x| str::parse::<i32>(x).ok());
}
if z.has_tag_name("grade") {
grade = z.text().and_then(|x| str::parse::<i32>(x).ok());
}
}
}
}
match grade {
Some(i) if i <= 6 => grade = Some(7),
_ => (),
}
if let Some(lit) = literal {
levels
.entry((jlpt, grade))
.or_insert(String::new())
.extend(lit.chars());
}
}
let mut levels = levels.into_iter().collect::<Vec<_>>();
levels.sort_by_key(|((j, g), _)| match (j, g) {
(Some(j), Some(g)) => (10 - *j) * 20 + *g,
(Some(j), None) => (10 - *j) * 20 + 15,
(None, Some(g)) => 1000 + *g,
(None, None) => 1015,
});
let mut ret = Vec::new();
let mut pc = Charset::default();
for ((j, g), chars) in levels.into_iter() {
let name = match (j, g) {
(Some(j), Some(7)) => format!("N{}a", j),
(Some(j), Some(8)) => format!("N{}b", j),
(Some(j), Some(g)) => format!("N{}-{}", j, g),
(Some(j), None) => format!("N{}+", j),
(None, Some(7)) => format!("N0a"),
(None, Some(8)) => format!("N0b"),
(None, Some(g)) => format!("N0-{}", g),
(None, None) => format!("N0+"),
};
let chars = Charset::new(chars).diff(&pc);
pc = pc.union(&chars);
ret.push((name, chars));
}
Ok(ret)
}
fn read_kanji_levels() -> Result<Vec<(String, String)>> {
Ok(fs::read_to_string("data/kanji_levels.txt")?
.lines()
.filter_map(|l| l.split_once(": "))
.map(|(l, k)| (l.to_string(), k.to_string()))
.collect::<Vec<_>>())
}
fn read_examples(all_kanji: &Charset) -> Result<Vec<Example>> {
let file = fs::File::open("data/examples.utf")?;
let mut ret = Vec::new();
let mut a = "".to_string();
for (i, line) in io::BufReader::new(file).lines().enumerate() {
let line = line?;
if line.starts_with("A:") {
a = line;
} else if line.starts_with("B:") {
let s = a.strip_prefix("A: ");
let t = line.strip_prefix("B: ");
if let (Some(a), Some(b)) = (s, t) {
if let Some((ja, eng)) = a.split_once("\t") {
if let Some((eng, id)) = eng.split_once("#") {
ret.push(Example {
ja: ja.to_string(),
en: eng.to_string(),
expl: b.to_string(),
id: Some(id.to_string()),
chars: Charset::new(ja).inter(all_kanji),
});
} else {
ret.push(Example {
ja: ja.to_string(),
en: eng.to_string(),
expl: b.to_string(),
id: None,
chars: Charset::new(ja).inter(all_kanji),
});
}
}
}
}
if i % 10000 == 0 {
eprintln!("read examples: {}/300 (x1000)", i / 1000);
}
}
Ok(ret)
}
#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)]
struct JlptVocab {
level: String,
chars: Charset,
kanji: String,
kana: String,
en: String,
}
impl JlptVocab {
fn to_string(&self) -> String {
format!("{}\t{}\t{}\t{}\t{}",
self.level,
self.chars.to_string(),
self.kanji,
self.kana,
self.en)
}
}
fn parse_jlpt_vocab(all_kanji: &Charset) -> Result<()> {
let mut vocab = vec![];
vocab.extend(parse_jlpt_vocab_combined("data/n5_vocab.txt", "N4", all_kanji)?);
vocab.extend(parse_jlpt_vocab_split("data/n4_vocab_hiragana.txt", "data/n4_vocab_eng.txt", "N3", all_kanji)?);
vocab.extend(parse_jlpt_vocab_split("data/n3_vocab_hiragana.txt", "data/n3_vocab_eng.txt", "N2a", all_kanji)?);
vocab.extend(parse_jlpt_vocab_split("data/n2_vocab_hiragana.txt", "data/n2_vocab_eng.txt", "N2b", all_kanji)?);
vocab.extend(parse_jlpt_vocab_split("data/n1_vocab_hiragana.txt", "data/n1_vocab_eng.txt", "N1", all_kanji)?);
for v in vocab.iter() {
println!("{}", v.to_string());
}
Ok(())
}
fn parse_jlpt_vocab_combined(file: &str, level: &str, all_kanji: &Charset) -> Result<Vec<JlptVocab>> {
let lines = jlpt_vocab_read_file(file)?;
let mut ret = vec![];
for (kanji, answer) in lines {
let (eng, kana) = match answer.split_once('\n') {
Some((a, b)) => (a, b.trim()),
None => (answer.trim(), ""),
};
for kanji in kanji.split('/') {
ret.push(JlptVocab {
level: level.to_string(),
chars: Charset::new(kanji).inter(all_kanji),
kanji: kanji.to_string(),
kana: kana.to_string(),
en: eng.to_string()
});
}
}
Ok(ret)
}
fn parse_jlpt_vocab_split(kana_file: &str, eng_file: &str, level: &str, all_kanji: &Charset) -> Result<Vec<JlptVocab>> {
let eng_lines = jlpt_vocab_read_file(eng_file)?
.into_iter()
.collect::<HashMap<String, String>>();
let lines = jlpt_vocab_read_file(kana_file)?;
let mut ret = vec![];
for (kanji, kana) in lines {
let eng = eng_lines.get(&kanji).or(eng_lines.get(&kana));
if let Some(eng) = eng {
for kanji in kanji.split('/') {
ret.push(JlptVocab {
level: level.to_string(),
chars: Charset::new(kanji).inter(all_kanji),
kanji: kanji.to_string(),
kana: kana.to_string(),
en: eng.to_string()
});
}
}
}
Ok(ret)
}
fn jlpt_vocab_read_file(file: &str) -> Result<Vec<(String, String)>> {
let re = regex::Regex::new(r#"<span class="\w+">"#)?;
let file = fs::File::open(file)?;
let mut ret = vec![];
for line in io::BufReader::new(file).lines() {
let line = line?.replace("<br>", "\n")
.replace("</span>", "");
let line = re.replace_all(&line, "");
if let Some((a, b)) = line.split_once('|') {
ret.push((a.trim().to_string(), b.trim().to_string()));
}
}
Ok(ret)
}
fn load_jlpt_vocab() -> Result<Vec<JlptVocab>> {
let file = fs::File::open("data/jlpt_vocab.txt")?;
let mut ret = vec![];
for line in io::BufReader::new(file).lines() {
let line = line?;
let line = line.splitn(5, "\t").collect::<Vec<_>>();
if line.len() == 5 {
ret.push(JlptVocab {
level: line[0].to_string(),
chars: Charset::new(line[1]),
kanji: line[2].to_string(),
kana: line[3].to_string(),
en: line[4].to_string(),
});
}
}
Ok(ret)
}
// =====================================================================
// BATCH STRUCTURES AND GENERATION
// =====================================================================
const CHARS_PER_BATCH: usize = 20;
const MAX_NEW_CHARS_PER_EX: usize = 5;
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
struct Example {
ja: String,
en: String,
expl: String,
id: Option<String>,
chars: Charset,
}
#[derive(Debug, Clone, Serialize, Deserialize, Default, PartialEq)]
struct Batch {
level: String,
chars: Charset,
chars_p1: Charset,
chars_p2: Charset,
chars_bad: Charset,
examples: Vec<Example>,
#[serde(default)]
extra_vocab: Vec<JlptVocab>,
}
fn gen_batches(
batches: &mut Vec<Batch>,
target_len: usize,
kanji_levels: &[(String, Charset)],
examples: &[Example],
) {
let mut remainder = None;
while batches.len() < target_len {
let done = Charset::from_iter(
batches
.iter()
.map(|x| x.chars.chars().iter().copied())
.flatten(),
);
let remainder_chars = remainder
.as_ref()
.map(|x: &Batch| x.chars.clone())
.unwrap_or_default();
let remainder_before = remainder.clone();
let len_before = batches.len();
let mut advanced = false;
for (i, (level, level_chars)) in kanji_levels.iter().enumerate() {
let diff = level_chars.diff(&done).diff(&remainder_chars);
if !diff.is_empty() {
let avoid = Charset::from_iter(
kanji_levels
.iter()
.skip(i + 1)
.filter(|(l, _)| !l.ends_with("-9") && !l.ends_with("-10"))
.map(|(_, c)| c.chars().iter().copied())
.flatten(),
);
let level_examples = level_examples(&diff, &avoid, examples);
let level_new_chars = Charset::from_iter(
level_examples
.iter()
.map(|x| x.chars.chars().iter().copied())
.flatten(),
)
.inter(&diff);
println!(
"- {} ({} chars): {} done previously, {} diff, {} ex, {} new chars",
level,
level_chars.len(),
done.len(),
diff.len(),
level_examples.len(),
level_new_chars.len()
);
if !level_examples.is_empty() {
assert!(!level_new_chars.is_empty());
remainder = gen_level(
batches,
level,
&level_new_chars,
&done,
level_examples,
remainder,
);
advanced = true;
break;
}
}
}
if let Some(r) = &remainder {
assert!(r.examples.len() <= 20);
}
if advanced && batches.len() == len_before && remainder == remainder_before {
// restart level with new rng
let last_level = batches.last().unwrap().level.to_string();
println!("RESTARTING LEVEL {}, hopefully new RNG", last_level);
while batches
.last()
.map(|x| x.level == last_level)
.unwrap_or(false)
{
batches.pop();
remainder = None;
}
} else if !advanced {
break;
}
}
if let Some(r) = remainder {
if batches.len() < target_len {
batches.push(r);
}
}
}
fn gen_level(
batches: &mut Vec<Batch>,
level: &str,
new_chars: &Charset,
prev_done: &Charset,
mut examples: Vec<&Example>,
mut remainder: Option<Batch>,
) -> Option<Batch> {
examples.shuffle(&mut thread_rng());
let remainder_chars = remainder.as_ref().map(|x| x.chars.len()).unwrap_or(0);
println!(
"Level {}: {} characters using {} examples, remainder has {} chars and {} examples",
level,
new_chars.len(),
examples.len(),
remainder_chars,
remainder.as_ref().map(|x| x.examples.len()).unwrap_or(0),
);
let avg_len = examples.len() as f32 * CHARS_PER_BATCH as f32 / new_chars.len() as f32;
let mut batch_count = 0;
let mut sum_len = 0;
let mut done = prev_done.union(
remainder
.as_ref()
.map(|x| &x.chars)
.unwrap_or(&Charset::default()),
);
loop {
println!("iter with {} examples", examples.len());
let mut batch = remainder.take().unwrap_or_else(|| Batch {
level: level.to_string(),
..Default::default()
});
let remaining_chars = new_chars.diff(&done);
let todo_chars = CHARS_PER_BATCH - batch.chars.len();
if remaining_chars.len() <= todo_chars {
for ex in examples.iter() {
batch.examples.push((*ex).clone());
batch.chars = batch.chars.union(&ex.chars.diff(&done).inter(&new_chars));
}
if batch.chars.len() == CHARS_PER_BATCH {
println!(
"-> all remaining examples sum up to exaclty {} chars",
CHARS_PER_BATCH
);
batches.push(batch);
return None;
} else if batch.examples.is_empty() {
assert!(batch.chars.is_empty());
println!("-> done");
return None;
} else {
assert!(batch.chars.len() < CHARS_PER_BATCH);
println!(
"-> with all remaining examples, cannot make a full batch, only {} chars",
batch.chars.len()
);
return Some(batch);
}
}
assert!(!examples.is_empty());
println!(
"Trying to add exactly {} characters, using {} examples containing {} new chars",
todo_chars,
examples.len(),
remaining_chars.len()
);
// Compute dynamic algorithm matrix with a bunch of combinations that add `todo_chars`
let mut dyn_mat: Vec<Vec<Option<(Charset, Option<(usize, usize)>)>>> = vec![];
for ex in examples.iter() {
let mut dyn_row = vec![None; todo_chars + 1];
let chars_common = ex.chars.inter(&new_chars).diff(&done);
if chars_common.len() > MAX_NEW_CHARS_PER_EX {
dyn_mat.push(dyn_row);
continue;
}
if chars_common.len() < dyn_row.len() {
dyn_row[chars_common.len()] = Some((chars_common.clone(), None));
}
for (i, dyn_prev) in dyn_mat.iter().enumerate() {
for (j, dpr) in dyn_prev.iter().enumerate() {
if let Some((chars_inter, _prev)) = dpr {
assert_eq!(chars_inter.len(), j);
let new_chars_common = chars_inter.union(&chars_common);
let new_chars_common_len = new_chars_common.len();
if new_chars_common_len > chars_inter.len()
&& new_chars_common_len <= todo_chars
{
dyn_row[new_chars_common_len] = Some((new_chars_common, Some((i, j))));
}
}
}
}
dyn_mat.push(dyn_row);
}
// Find combination that does that with a good number of examples (tgt_len)
let factor = match level {
"N1b" => 1.08,
_ => 1.0,
};
let tgt_len = (avg_len * factor * (batch_count as f32 + 1.)).ceil() as i64
- (sum_len + batch.examples.len()) as i64;
let dyn_mat_cnt = |i| {
let mut cnt = 0;
let mut i: usize = i;
let mut j: usize = todo_chars;
loop {
match &dyn_mat[i][j] {
None => return None,
Some((_, ij_prev)) => {
cnt += 1;
match ij_prev {
Some((iprev, jprev)) => {
i = *iprev;
j = *jprev;
}
None => return Some(cnt),
}
}
}
}
};
let i_opt = (0..dyn_mat.len())
.filter_map(|pos| dyn_mat_cnt(pos).map(|cnt| (pos, cnt)))
.min_by_key(|(_, cnt)| {
let x = *cnt as i64 - tgt_len;
x * x
});
let i = match i_opt {
None => {
println!(
"WARNING: cannot make exactly {} chars, interrupting",
todo_chars
);
return None;
}
Some((pos, _)) => pos,
};
// Take all examples from that combination and add them to current batch
let (mut i, mut j) = (i, todo_chars);
loop {
match &dyn_mat[i][j] {
None => panic!("dyn_mat[{}][{}] == None", i, j),
Some((chars, ij_prev)) => {
println!(
"Add {}: {}",
examples[i].chars.inter(&chars).to_string(),
examples[i].ja
);
batch.examples.push(examples[i].clone());
examples.remove(i);
batch.chars = batch.chars.union(&chars);
match ij_prev {
Some((iprev, jprev)) => {
assert!(*iprev < i);
i = *iprev;
j = *jprev;
}
None => break,
}
}
}
}
assert_eq!(batch.chars.len(), CHARS_PER_BATCH);
println!(
"-> batch {:03}: {} with {} examples",
batches.len(),
batch.chars.to_string(),
batch.examples.len()
);
batch_count += 1;
done = done.union(&batch.chars);
sum_len += batch.examples.len();
batches.push(batch);
}
}
fn level_examples<'a>(
chars: &Charset,
avoid: &Charset,
all_examples: &'a [Example],
) -> Vec<&'a Example> {
println!("Calculating examples for {}", chars.to_string());
let mut todo = chars.clone();
let mut bad = Charset::default();
let mut examples = vec![];
let cost = |ex: &Example, ex_todo_inter: usize, ex_chars_inter: usize| {
(
-(ex.chars.inter_len(&avoid) as i32),
ex_todo_inter,
ex_chars_inter,
-(ex.ja.chars().count() as i32),
ex.chars.len() + thread_rng().gen_range(0..5),
)
};
let mut all_with_inter = all_examples
.par_iter()
.map(|ex| (ex, ex.chars.inter_len(&chars)))
.map(|(ex, ex_chars_inter)| (ex, ex_chars_inter, ex_chars_inter))
.collect::<Vec<_>>();
while !todo.is_empty() {
let best = all_with_inter
.par_iter()
.enumerate()
.filter(|(_, (_, ex_todo_inter, _))| *ex_todo_inter > 0)
//.filter(|(_, (_, _, ex_tgt_inter))| (1..=8).contains(ex_tgt_inter))
.max_by_key(|(_, (ex, ex_todo_inter, ex_chars_inter))| {
cost(*ex, *ex_todo_inter, *ex_chars_inter)
});
if let Some((i, (ex, ex_todo_inter, _))) = best {
let ex = *ex;
assert_eq!(*ex_todo_inter, ex.chars.inter(&todo).len());
examples.push(ex);
all_with_inter.remove(i);
todo = todo.diff(&ex.chars);
bad = bad.union(&ex.chars.inter(&avoid));
all_with_inter
.par_iter_mut()
.for_each(|(ex2, ex_todo_inter, _)| {
if ex2.chars.inter_len(&ex.chars) > 0 {
*ex_todo_inter = ex2.chars.inter_len(&todo);
}
});
} else {
break;
}
}
if !todo.is_empty() {
println!("MISSING: NO SENTENCES FOR {}", todo.to_string());
}
if !bad.is_empty() {
println!("USED BAD CHARS: {}", bad.to_string());
}
examples
}
/*
let (mut target_i, target_level, mut target_chars) = kanji_levels
.iter()
.enumerate()
.map(|(i, (l, c))| (i, l, c.diff(&prev_chars)))
.find(|(_, _, c)| !c.is_empty())
.ok_or(anyhow!("no more batches to make!"))?;
let chars_p1 = previous
.iter()
.rev()
.next()
.map(|b| b.chars.clone())
.unwrap_or(Charset::default());
let chars_p2 = previous
.iter()
.rev()
.skip(1)
.next()
.map(|b| b.chars.clone())
.unwrap_or(Charset::default());
let mut chars_late = Charset::default();
let mut chars_bad = Charset::from_iter(
kanji_levels
.iter()
.skip(target_i + 1)
.map(|(_, c)| c.chars().iter().copied())
.flatten(),
);
let mut chars_bad_avoid = Charset::from_iter(
kanji_levels
.iter()
.skip(target_i + 1)
.filter(|(l, _)| !l.ends_with("-9") && !l.ends_with("-10"))
.map(|(_, c)| c.chars().iter().copied())
.flatten(),
);
let mut batch = Batch {
level: target_level.to_string(),
chars: Charset::default(),
chars_p1: Charset::default(),
chars_p2: Charset::default(),
chars_bad: Charset::default(),
examples: Vec::new(),
};
let mut batch_chars = Charset::default();
eprintln!("----");
eprintln!("Level : {}", batch.level);
eprintln!("Target : {}", target_chars.to_string());
eprintln!("Prev1 : {}", chars_p1.to_string());
eprintln!("Prev2 : {}", chars_p2.to_string());
eprintln!("Bad : {} characters", chars_bad.len());
let batch_len = 20;
let mut stalled = false;
while batch.chars.len() < batch_len && !target_chars.is_empty() {
let need = batch_len - batch.chars.len();
let should_add = need > target_chars.len() && target_chars.len() <= 3;
if target_i + 1 < kanji_levels.len() && (should_add || stalled) {
// upgrade to next level
target_i += 1;
chars_late = chars_late.union(&target_chars);
target_chars = target_chars.union(&kanji_levels[target_i].1.diff(&prev_chars));
chars_bad = chars_bad.diff(&target_chars);
chars_bad_avoid = chars_bad_avoid.diff(&target_chars);
if batch.examples.is_empty() {
batch.level = kanji_levels[target_i].0.to_string();
} else {
batch.level = format!("{} + {}", batch.level, kanji_levels[target_i].0);
}
eprintln!("Level : {}", batch.level);
eprintln!("Target: {}", target_chars.to_string());
eprintln!("Late : {}", chars_late.to_string());
eprintln!("Bad : {} characters", chars_bad.len());
stalled = false;
}
/* this one works well enough
let cost = |ex: &Example, ex_tgt_inter: usize| {
20i32 * ex_tgt_inter as i32
+ 30i32 * ex.chars.inter_len(&chars_late) as i32
+ 6i32 * ex.chars.inter_len(&batch.chars) as i32
+ 4i32 * ex.chars.inter_len(&chars_p1) as i32
+ 3i32 * ex.chars.inter_len(&chars_p2) as i32
- 40i32 * ex.chars.inter_len(&chars_bad) as i32
};
*/
let cost = |ex: &Example, ex_tgt_inter: usize| {
(
-(ex.chars.inter_len(&chars_bad_avoid) as i32),
ex_tgt_inter,
ex.chars.inter_len(&chars_late),
2 * ex.chars.inter_len(&chars_p1) + ex.chars.inter_len(&chars_p2),
-(ex.ja.chars().count() as i32),
)
};
let cand_1 = examples
.par_iter()
.map(|ex| (ex, ex.chars.inter_len(&target_chars)))
.filter(|(_, ex_tgt_inter)| {
(1..=4).contains(ex_tgt_inter) && *ex_tgt_inter + batch.chars.len() <= batch_len
})
.max_by_key(|(ex, ex_tgt_inter)| cost(ex, *ex_tgt_inter));
let cand = cand_1.or_else(|| {
examples
.par_iter()
.map(|ex| (ex, ex.chars.inter_len(&target_chars)))
.filter(|(_, ex_tgt_inter)| *ex_tgt_inter > 0)
.max_by_key(|(ex, ex_tgt_inter)| cost(ex, *ex_tgt_inter))
});
if let Some((ex, _)) = cand {
eprintln!(
"* add {} (rep: {}, p1: {}, p2: {}, bad: {}) {}",
ex.chars.inter(&target_chars).to_string(),
ex.chars.inter(&batch.chars).to_string(),
ex.chars.inter(&chars_p1).to_string(),
ex.chars.inter(&chars_p2).to_string(),
ex.chars.inter(&chars_bad).to_string(),
ex.ja
);
batch.chars = batch.chars.union(&ex.chars.inter(&target_chars));
target_chars = target_chars.diff(&ex.chars);
chars_late = chars_late.diff(&ex.chars);
batch.examples.push(ex.clone());
batch_chars = batch_chars.union(&ex.chars);
stalled = false;
} else {
if stalled {
eprintln!(
"could not find suitable sentence, stopping batch now (need {})",
need
);
break;
}
stalled = true;
}
}
batch.chars_p1 = chars_p1.inter(&batch_chars);
batch.chars_p2 = chars_p2.inter(&batch_chars);
batch.chars_bad = chars_bad.inter(&batch_chars);
Ok(batch)
*/
fn simplify_batch(batch: &mut Batch) {
let mut char_cnt = HashMap::<char, usize>::new();
for ex in batch.examples.iter() {
for ch in batch.chars.inter(&ex.chars).chars() {
*char_cnt.entry(*ch).or_default() += 1;
}
}
loop {
let i_opt = batch.examples.iter().position(|ex| {
batch
.chars
.inter(&ex.chars)
.chars()
.iter()
.all(|x| char_cnt[x] >= 2)
});
if let Some(i) = i_opt {
println!(
"Removing {} [{}]",
batch.examples[i].ja,
batch.examples[i].chars.to_string()
);
batch.examples.remove(i);
} else {
break;
}
}
}
fn cleanup_batches(all_batches: &mut [Batch], kanji_levels: &[(String, Charset)]) {
let mut chars_p1 = Charset::default();
let mut chars_p2 = Charset::default();
let mut done = Charset::default();
for batch in all_batches.iter_mut() {
let all_chars = Charset::from_iter(
batch
.examples
.iter()
.map(|x| x.chars.chars().iter().copied())
.flatten(),
);
batch.level = kanji_levels
.iter()
.filter(|(_, chars)| chars.inter_len(&batch.chars) > 0)
.map(|(lvl, _)| lvl.to_string())
.collect::<Vec<_>>()
.join("/");
done = done.union(&batch.chars);
batch.chars_bad = all_chars.diff(&done);
batch.chars_p1 = all_chars.inter(&chars_p1);
batch.chars_p2 = all_chars.inter(&chars_p2);
chars_p2 = chars_p1;
chars_p1 = batch.chars.clone();
}
}
fn add_vocab(all_batches: &mut [Batch], vocab: &[JlptVocab]) {
let match_level = |batch: &Batch, level: &str| {
let n4 = batch.level.contains("N4");
let n3 = batch.level.contains("N3");
let n2 = batch.level.contains("N2");
let n1 = batch.level.contains("N1");
let n0 = batch.level.contains("N0");
match level {
"N4" => n4 || n3 || n2 || n1 || n0,
"N3" => n3 || n2 || n1 || n0,
"N2" | "N2a" | "N2b" => n2 || n1 || n0,
"N1" => n1 || n0,
"N0" => n0,
_ => panic!("invalid vocab level {}", level),
}
};
let mut done = Charset::default();
let mut extra_vocab = vec![];
for (i, batch) in all_batches.iter().enumerate() {
let done_after = done.union(&batch.chars);
let batch_extra_vocab = vocab
.iter()
.filter(|v| v.chars.inter_len(&batch.chars) > 0)
.filter(|v| match_level(batch, &v.level))
.filter(|v| v.chars.diff(&done_after).len() == 0)
.filter(|v| !all_batches[i..std::cmp::min(all_batches.len(), i+10)].iter().any(|b| b.examples.iter().any(|ex| ex.ja.contains(&v.kanji) || ex.expl.contains(&v.kanji))))
.cloned()
.collect::<Vec<_>>();
extra_vocab.push(batch_extra_vocab);
println!("---- BATCH #{:03} ----", i);
for v in batch.extra_vocab.iter() {
println!("{}", v.to_string());
}
done = done_after;
}
for (batch, vocab) in all_batches.iter_mut().zip(extra_vocab.into_iter()) {
batch.extra_vocab = vocab;
}
}
// =====================================================================
// FORMATTING TO HTML
// =====================================================================
fn format_batch<'a>(dict_idx: &DictIndex<'a>, count: usize, (i, batch): (usize, &Batch)) {
format_batch_aux(dict_idx, count, i, batch).expect("format batch");
}
fn format_batch_aux<'a>(
dict_idx: &DictIndex<'a>,
count: usize,
i: usize,
batch: &Batch,
) -> Result<()> {
let mut f = io::BufWriter::new(fs::File::create(format!("public/{:03}.html", i))?);
write!(
f,
r#"<!DOCTYPE html>
<html>
<head>
<meta charset=\"UTF-8\" />
<title>Batch #{:03}</title>
<link rel="stylesheet" type="text/css" href="style.css" />
</head>
<body>"#,
i
)?;
writeln!(f, r#"<p><a href="index.html">index</a>"#)?;
for j in 0..count {
if j != i {
writeln!(f, r#" <a href="{:03}.html">{:03}</a>"#, j, j)?;
} else {
writeln!(f, " {:03}", j)?;
}
}
writeln!(f, r#"</p>"#)?;
writeln!(f, "<p>Level: {}</p>", batch.level)?;
writeln!(
f,
r#"<p class="ja chars">【{}】</p>"#,
batch.chars.to_string()
)?;
for ex in batch.examples.iter() {
writeln!(f, "<hr />")?;
write!(f, r#"<p class="ja">"#)?;
for c in ex.ja.chars() {
if batch.chars.contains(c) {
write!(f, r#"<span class="char_cur">{}</span>"#, c)?;
} else if batch.chars_p1.contains(c) {
write!(f, r#"<span class="char_p1">{}</span>"#, c)?;
} else if batch.chars_p2.contains(c) {
write!(f, r#"<span class="char_p2">{}</span>"#, c)?;
} else if batch.chars_bad.contains(c) {
write!(f, r#"<span class="char_bad">{}</span>"#, c)?;
} else {
write!(f, "{}", c)?;
}
}
writeln!(f, "</p>")?;
writeln!(f, r#"<p class="en">{}</p>"#, ex.en)?;
writeln!(f, r#"<details><summary>Explanation</summary>"#)?;
let mut expl_batch = Vec::new();
let mut expl_all = Vec::new();
for word in ex.expl.split(|c| c == ' ' || c == '~') {
let (keb, reb) = expl_clean_word(word);
let wchars = Charset::new(keb);
if !wchars.intersects(&ex.chars) {
continue;
}
if let Some(ents) = dict_idx.get(keb) {
for ent in ents.iter() {
if let Some(s) = dict_str(keb, reb, ent) {
if wchars.intersects(&batch.chars) {
expl_batch.push(s);
} else {
expl_all.push(s);
}
}
}
}
}
for be in expl_batch {
writeln!(f, r#"<p>{}</p>"#, be)?;
}
writeln!(f, r#"<p class="chars">"#)?;
for c in ex.chars.inter(&batch.chars).chars().iter() {
writeln!(
f,
r#"<a href="https://jisho.org/search/{}%20%23kanji">{}</a>"#,
c, c
)?;
}
writeln!(f, r#"</p>"#)?;
for be in expl_all {
writeln!(f, r#"<p>{}</p>"#, be)?;
}
writeln!(f, r#"</details>"#)?;
}
writeln!(f, "<hr />")?;
writeln!(f, r#"<details><summary>Extra vocabulary (this level)</summary>"#)?;
for v in batch.extra_vocab.iter() {
if batch.level.contains(&v.level) {
writeln!(f, r#"<p>({}) {} [{}] {}</p>"#, v.level, v.kanji, v.kana, v.en)?;
}
}
writeln!(f, r#"</details>"#)?;
if !batch.level.contains("N4") {
writeln!(f, r#"<details><summary>Extra vocabulary (previous levels)</summary>"#)?;
for v in batch.extra_vocab.iter() {
if !batch.level.contains(&v.level) {
writeln!(f, r#"<p>({}) {} [{}] {}</p>"#, v.level, v.kanji, v.kana, v.en)?;
}
}
writeln!(f, r#"</details>"#)?;
}
writeln!(f, "<hr />")?;
writeln!(f, "<p>\(≧▽≦)/</p>")?;
write!(f, "</body></html>")?;
f.flush()?;
Ok(())
}
fn expl_clean_word(w: &str) -> (&str, Option<&str>) {
let mut ret = w;
for delim in ['(', '{', '['] {
if let Some((s, _)) = ret.split_once(delim) {
ret = s;
}
}
let p = w
.split_once('(')
.and_then(|(_, r)| r.split_once(')'))
.map(|(p, _)| p);
(ret, p)
}
fn dict_str<'a>(qkeb: &str, qreb: Option<&str>, ent: &roxmltree::Node<'a, 'a>) -> Option<String> {
let r_ele = ent.children().find(|x| x.has_tag_name("r_ele")).unwrap();
let reb = r_ele.children().find(|x| x.has_tag_name("reb")).unwrap();
let reb = reb.text().unwrap().trim();
if qreb.map(|x| x != reb).unwrap_or(false) {
return None;
}
let mut ret = format!("{} [{}]", qkeb, reb);
for sense in ent.children().filter(|x| x.has_tag_name("sense")) {
if let Some(s) = sense.children().find(|x| x.has_tag_name("gloss")) {
ret.extend(format!(" {};", s.text().unwrap().trim()).chars());
}
}
if ret.chars().rev().next() == Some(';') {
ret.pop();
}
Some(ret)
}
fn format_index(batches: &[Batch], kanji_levels: &[(String, String)]) -> Result<()> {
let mut f = io::BufWriter::new(fs::File::create("public/index.html")?);
write!(
f,
r#"<!DOCTYPE html>
<html>
<head>
<meta charset=\"UTF-8\" />
<title>List of batches</title>
<link rel="stylesheet" type="text/css" href="style.css" />
</head>
<body>"#
)?;
writeln!(f, "<table>")?;
writeln!(f, "<tr><th>Num</th><th>Level</th><th>Chars</th><th>Examples</th><th>B-1</th><th>B-2</th><th>Ignore</th></tr>")?;
for (i, batch) in batches.iter().enumerate() {
writeln!(
f,
r#"<tr><td><a href="{:03}.html">{:03}</a></td><td>{}</td><td>{}</td><td> {}</td><td>{}</td><td>{}</td><td>{}</td></tr>"#,
i,
i,
batch.level,
batch.chars.to_string(),
batch.examples.len(),
batch.chars_p1.to_string(),
batch.chars_p2.to_string(),
batch.chars_bad.to_string()
)?;
}
writeln!(f, r#"</table>"#)?;
writeln!(f, "<hr />")?;
let all_chars = Charset::from_iter(
batches
.iter()
.map(|x| x.chars.chars().iter().copied())
.flatten(),
);
writeln!(f, "<table>")?;
writeln!(
f,
"<tr><th>Level</th><th>Count</th><th>Chars</th><th>Missing chars</th></tr>"
)?;
for (lvl, chars) in kanji_levels.iter() {
if lvl == "N0+" || lvl == "N0-9" || lvl.ends_with("-10") {
continue;
}
let chars = Charset::new(chars);
let missing = chars.diff(&all_chars);
writeln!(
f,
"<tr><td>{}</td><td>{}</td><td>{}</td><td>{} ({})</td></tr>",
lvl,
chars.len(),
chars.to_string(),
missing.to_string(),
missing.len()
)?;
}
writeln!(f, "</table>")?;
write!(f, "</body></html>")?;
f.flush()?;
Ok(())
}