#[macro_use]
extern crate log;
use std::collections::HashMap;
use std::fs;
use std::io;
use std::net::SocketAddr;
//use anyhow::Result;
use rand::prelude::*;
use rayon::prelude::*;
use serde::{Deserialize, Serialize};
use structopt::StructOpt;
mod charset;
mod datafiles;
mod example;
mod format;
mod server;
use charset::Charset;
use datafiles::*;
use format::*;
// Global allocator for the whole program: the system allocator wrapped in `cap::Cap`,
// constructed with `usize::max_value()` as its second argument.
// NOTE(review): presumably that argument is the allocation limit, i.e. there is no
// effective cap unless it is lowered elsewhere — confirm against the `cap` crate docs.
#[global_allocator]
static ALLOCATOR: cap::Cap<std::alloc::System> =
cap::Cap::new(std::alloc::System, usize::max_value());
// Top-level command-line options. The only content is the subcommand to run.
// Plain `//` comments are used here on purpose: structopt turns `///` doc comments
// into help text, which would change the program's output.
#[derive(Debug, StructOpt)]
#[structopt(name = "datagengo", about = "Japanese example practice maker")]
struct Opt {
#[structopt(subcommand)]
// The subcommand selected by the user; dispatched on in `main`.
cmd: Cmd,
}
// Subcommands of the tool. The descriptions below summarize what `main` does for each
// variant. Plain `//` comments are used on purpose: structopt turns `///` doc comments
// into help text, which would change the program's output.
#[derive(Debug, StructOpt)]
enum Cmd {
// Calls `parse_kanjidic()` and prints each level with its characters.
ParseKanjidic,
// Reads the kanji levels and calls `parse_jlpt_vocab` with the set of all their characters.
ParseJlptVocab,
// Parses `data/JMdict_e.xml` and writes the resulting index to `data/jmdict_idx.json`.
IndexJmdict,
// Generates `count` additional batches and saves them to `data/batches.json`.
New {
// Number of batches to add on top of the existing (possibly truncated) ones.
#[structopt(default_value = "10")]
count: usize,
// When given, the existing batch list is first truncated to this many batches.
#[structopt(long = "truncate")]
truncate: Option<usize>,
},
// Runs `simplify_batch` on every saved batch and saves the result.
Simplify,
// Runs `cleanup_batches` on the saved batches and saves the result.
Cleanup,
// Runs `add_vocab` on the saved batches with the loaded JLPT vocabulary and saves the result.
AddVocab,
// Renders the saved batches, the index and the about page; output goes under `public/`
// (at least the stylesheet copy does — the `format_*` helpers are defined elsewhere).
Format,
// Runs `server::server_main` bound to `bind_addr`.
Server {
// Socket address passed to `server::server_main`.
#[structopt(default_value = "127.0.0.1:8080")]
bind_addr: SocketAddr,
},
}
#[async_std::main]
async fn main() {
if std::env::var("RUST_LOG").is_err() {
std::env::set_var("RUST_LOG", "datagengo=info")
}
pretty_env_logger::init();
let opt = Opt::from_args();
match opt.cmd {
Cmd::ParseKanjidic => {
let levels = parse_kanjidic().expect("error");
for (level, chars) in levels.iter() {
println!("{}: {}", level, chars.to_string());
}
}
Cmd::ParseJlptVocab => {
let kanji_levels = read_kanji_levels().expect("read_kanji_levels");
let all_kanji =
Charset::from_iter(kanji_levels.iter().map(|(_, c)| c.chars()).flatten());
parse_jlpt_vocab(&all_kanji).expect("error");
}
Cmd::IndexJmdict => {
let jmdict = fs::read_to_string("data/JMdict_e.xml").expect("read_jmdict");
let jmdict = roxmltree::Document::parse_with_options(
&jmdict,
roxmltree::ParsingOptions {
allow_dtd: true,
..Default::default()
},
)
.expect("parse_jmdict");
let jmdict_idx = index_jmdict(&jmdict);
fs::write(
"data/jmdict_idx.json",
serde_json::to_string_pretty(&jmdict_idx)
.expect("to_json")
.as_bytes(),
)
.expect("write");
}
Cmd::New { truncate, count } => {
let kanji_levels = read_kanji_levels().expect("read_kanji_levels");
let all_kanji = Charset::new(
kanji_levels
.iter()
.map(|(_, x)| x.to_string())
.collect::<Vec<_>>()
.join(""),
);
let kanji_levels = kanji_levels
.into_iter()
.map(|(l, x)| (l, Charset::new(x)))
.collect::<Vec<_>>();
let mut ex = read_examples(&all_kanji).expect("read_examples");
ex.retain(|e| (5..=25).contains(&e.ja.chars().count()));
let mut batches = read_batches().unwrap_or_default();
if let Some(t) = truncate {
batches.truncate(t);
}
println!("---- starting after {} batches ----", batches.len());
let target_len = batches.len() + count;
gen_batches(&mut batches, target_len, &kanji_levels, &ex);
save_batches(batches).expect("save_batches");
}
Cmd::Simplify => {
let mut batches = read_batches().expect("read_batches");
for batch in batches.iter_mut() {
simplify_batch(batch);
}
save_batches(batches).expect("save_batches");
}
Cmd::Cleanup => {
let mut batches = read_batches().expect("read_batches");
let kanji_levels = read_kanji_levels().expect("read_kanji_levels");
let kanji_levels = kanji_levels
.into_iter()
.map(|(l, x)| (l, Charset::new(x)))
.collect::<Vec<_>>();
cleanup_batches(&mut batches, &kanji_levels);
save_batches(batches).expect("save_batches");
}
Cmd::AddVocab => {
let mut batches = read_batches().expect("read_batches");
let jlpt_vocab = load_jlpt_vocab().expect("load_jlpt_vocab");
add_vocab(&mut batches, &jlpt_vocab);
save_batches(batches).expect("save_batches");
}
Cmd::Format => {
let jmdict_idx = read_jmdict_idx().expect("read jmdict_idx.json");
let batches = read_batches().expect("read/parse");
fs::create_dir_all("public").expect("mkdir public");
fs::copy("static/style.css", "public/style.css").expect("copy style.css");
batches
.iter()
.enumerate()
//.skip(25)
//.take(1)
.for_each(|x| format_batch(&jmdict_idx, batches.len(), x));
let kanji_levels = read_kanji_levels().expect("read_kanji_levels");
format_index(&batches, &kanji_levels).expect("format_index");
format_about().expect("format_about");
}
Cmd::Server { bind_addr } => {
server::server_main(bind_addr)
.await
.expect("error in server");
}
}
}
// ----
/// Loads the list of batches stored as JSON in `data/batches.json`.
///
/// # Errors
/// Fails when the file cannot be read or its content does not deserialize into
/// a `Vec<Batch>`.
pub fn read_batches() -> anyhow::Result<Vec<Batch>> {
    let raw = fs::read("data/batches.json")?;
    let batches: Vec<Batch> = serde_json::from_slice(&raw)?;
    Ok(batches)
}
/// Writes `batches` as pretty-printed JSON to `data/batches.json`, replacing the file.
///
/// # Errors
/// Fails when serialization or the file write fails.
fn save_batches(batches: Vec<Batch>) -> anyhow::Result<()> {
    let json = serde_json::to_string_pretty(&batches)?;
    fs::write("data/batches.json", json.as_bytes())?;
    Ok(())
}
// =====================================================================
// BATCH STRUCTURES AND GENERATION
// =====================================================================
/// Number of new characters a complete batch introduces; `gen_level` asserts this exact
/// count before pushing a batch it filled.
const CHARS_PER_BATCH: usize = 20;
/// Upper bound on the new characters a single example may bring in: `gen_level` leaves
/// out of its combination search any example that would add more than this many.
const MAX_NEW_CHARS_PER_EX: usize = 5;
/// One study batch: a set of newly introduced characters plus the example sentences
/// that illustrate them. This is the element type of the JSON array in `data/batches.json`.
#[derive(Debug, Clone, Serialize, Deserialize, Default, PartialEq)]
pub struct Batch {
/// Level label. Set by `gen_level` to the level being generated; `cleanup_batches`
/// rewrites it as the matching level names joined with "/".
pub level: String,
/// Characters newly introduced by this batch.
pub chars: Charset,
/// Characters used by this batch's examples that were introduced by the previous batch
/// (filled in by `cleanup_batches`).
pub chars_p1: Charset,
/// Characters used by this batch's examples that were introduced two batches earlier
/// (filled in by `cleanup_batches`).
pub chars_p2: Charset,
/// Characters used by this batch's examples that no batch up to and including this one
/// has introduced (filled in by `cleanup_batches`).
pub chars_bad: Charset,
/// Example sentences selected for this batch.
pub examples: Vec<Example>,
/// Additional vocabulary attached by `add_vocab`; defaults to empty when absent from the JSON.
#[serde(default)]
pub extra_vocab: Vec<JlptVocab>,
}
/// Appends batches to `batches` until it holds `target_len` of them or no further
/// progress is possible.
///
/// Each round picks the first level of `kanji_levels` that still has characters neither
/// covered by existing batches nor held in the pending partial batch, selects examples
/// for those characters with `level_examples`, and lets `gen_level` pack them into
/// batches. A partial batch returned by `gen_level` is carried into the next round and,
/// if still pending at the end, pushed as a final (short) batch when there is room.
///
/// When a round changes nothing, all trailing batches sharing the last batch's level are
/// dropped so that the level is regenerated with a fresh shuffle.
///
/// # Panics
/// Panics if examples were found but contain none of the wanted characters, if the
/// pending partial batch holds more than 20 examples, or if a restart is needed while
/// `batches` is empty.
fn gen_batches(
    batches: &mut Vec<Batch>,
    target_len: usize,
    kanji_levels: &[(String, Charset)],
    examples: &[Example],
) {
    let mut remainder: Option<Batch> = None;

    while batches.len() < target_len {
        // Everything introduced so far, plus what the pending partial batch already holds.
        let done = Charset::from_iter(batches.iter().flat_map(|b| b.chars.chars().iter().copied()));
        let remainder_chars = match &remainder {
            Some(pending) => pending.chars.clone(),
            None => Charset::default(),
        };

        // Snapshot used below to detect a round that made no progress.
        let remainder_before = remainder.clone();
        let len_before = batches.len();
        let mut advanced = false;

        for (idx, (level, level_chars)) in kanji_levels.iter().enumerate() {
            let diff = level_chars.diff(&done).diff(&remainder_chars);
            if diff.is_empty() {
                continue;
            }

            // Characters of later levels should be avoided, except for the "-9"/"-10" levels.
            let avoid = Charset::from_iter(
                kanji_levels[idx + 1..]
                    .iter()
                    .filter(|(l, _)| !l.ends_with("-9") && !l.ends_with("-10"))
                    .flat_map(|(_, c)| c.chars().iter().copied()),
            );
            let candidates = level_examples(&diff, &avoid, examples);
            let level_new_chars = Charset::from_iter(
                candidates
                    .iter()
                    .flat_map(|ex| ex.chars.chars().iter().copied()),
            )
            .inter(&diff);
            println!(
                "- {} ({} chars): {} done previously, {} diff, {} ex, {} new chars",
                level,
                level_chars.len(),
                done.len(),
                diff.len(),
                candidates.len(),
                level_new_chars.len()
            );

            if candidates.is_empty() {
                // Nothing usable for this level: try the next one.
                continue;
            }
            assert!(!level_new_chars.is_empty());
            remainder = gen_level(batches, level, &level_new_chars, &done, candidates, remainder);
            advanced = true;
            break;
        }

        if let Some(r) = &remainder {
            assert!(r.examples.len() <= 20);
        }

        let unchanged = batches.len() == len_before && remainder == remainder_before;
        if advanced && unchanged {
            // restart level with new rng
            let last_level = batches.last().unwrap().level.to_string();
            println!("RESTARTING LEVEL {}, hopefully new RNG", last_level);
            while batches.last().map_or(false, |b| b.level == last_level) {
                batches.pop();
            }
            // The last batch always matched its own level, so at least one batch was
            // popped and the pending partial batch is discarded along with it.
            remainder = None;
        } else if !advanced {
            break;
        }
    }

    match remainder {
        Some(pending) if batches.len() < target_len => batches.push(pending),
        _ => {}
    }
}
/// Packs the examples of one level into batches that each introduce exactly
/// `CHARS_PER_BATCH` new characters, pushing every completed batch onto `batches`.
///
/// * `batches` - output list; completed batches are appended.
/// * `level` - label given to batches created here.
/// * `new_chars` - characters this level is expected to introduce.
/// * `prev_done` - characters already introduced by earlier batches.
/// * `examples` - candidate examples; shuffled first, then removed as they are placed.
/// * `remainder` - partially filled batch carried over from a previous call; when present
///   it is the starting point of the first batch built here.
///
/// Returns `Some(batch)` with a partial batch when the leftover examples cannot fill a
/// whole batch. Returns `None` when everything fitted exactly, when nothing was left to
/// place, or when no combination of examples adds exactly the missing number of
/// characters (in that last case the batch under construction is dropped).
///
/// Panics on violated internal invariants (the `assert!`s below).
fn gen_level(
batches: &mut Vec<Batch>,
level: &str,
new_chars: &Charset,
prev_done: &Charset,
mut examples: Vec<&Example>,
mut remainder: Option<Batch>,
) -> Option<Batch> {
// Randomize the order so that a restart of the level can yield a different packing.
examples.shuffle(&mut thread_rng());
let remainder_chars = remainder.as_ref().map(|x| x.chars.len()).unwrap_or(0);
println!(
"Level {}: {} characters using {} examples, remainder has {} chars and {} examples",
level,
new_chars.len(),
examples.len(),
remainder_chars,
remainder.as_ref().map(|x| x.examples.len()).unwrap_or(0),
);
// Average number of examples per full batch if examples were spread evenly over the
// level's characters.
let avg_len = examples.len() as f32 * CHARS_PER_BATCH as f32 / new_chars.len() as f32;
// Number of batches completed in this call and total examples placed in them.
let mut batch_count = 0;
let mut sum_len = 0;
// Characters considered covered: earlier batches plus the carried-over partial batch.
let mut done = prev_done.union(
remainder
.as_ref()
.map(|x| &x.chars)
.unwrap_or(&Charset::default()),
);
// One iteration per batch.
loop {
println!("iter with {} examples", examples.len());
// Continue the carried-over batch on the first iteration, start fresh afterwards.
let mut batch = remainder.take().unwrap_or_else(|| Batch {
level: level.to_string(),
..Default::default()
});
let remaining_chars = new_chars.diff(&done);
let todo_chars = CHARS_PER_BATCH - batch.chars.len();
// Final stretch: the characters still missing fit in the current batch, so take every
// remaining example and finish.
if remaining_chars.len() <= todo_chars {
for ex in examples.iter() {
batch.examples.push((*ex).clone());
batch.chars = batch.chars.union(&ex.chars.diff(&done).inter(&new_chars));
}
if batch.chars.len() == CHARS_PER_BATCH {
println!(
"-> all remaining examples sum up to exaclty {} chars",
CHARS_PER_BATCH
);
batches.push(batch);
return None;
} else if batch.examples.is_empty() {
assert!(batch.chars.is_empty());
println!("-> done");
return None;
} else {
assert!(batch.chars.len() < CHARS_PER_BATCH);
println!(
"-> with all remaining examples, cannot make a full batch, only {} chars",
batch.chars.len()
);
// Hand the partial batch back to the caller.
return Some(batch);
}
}
assert!(!examples.is_empty());
println!(
"Trying to add exactly {} characters, using {} examples containing {} new chars",
todo_chars,
examples.len(),
remaining_chars.len()
);
// Compute dynamic algorithm matrix with a bunch of combinations that add `todo_chars`
// `dyn_mat[i][j]` is `Some((chars, prev))` when some chain of examples ending with
// example `i` adds exactly the `j` new characters in `chars`; `prev` is the cell of the
// preceding example in that chain, or `None` when example `i` stands alone.
let mut dyn_mat: Vec<Vec<Option<(Charset, Option<(usize, usize)>)>>> = vec![];
for ex in examples.iter() {
let mut dyn_row = vec![None; todo_chars + 1];
// New characters this example would contribute.
let chars_common = ex.chars.inter(&new_chars).diff(&done);
// Examples bringing too many new characters at once get an empty row (never selected).
if chars_common.len() > MAX_NEW_CHARS_PER_EX {
dyn_mat.push(dyn_row);
continue;
}
// The example on its own.
if chars_common.len() < dyn_row.len() {
dyn_row[chars_common.len()] = Some((chars_common.clone(), None));
}
// Extend every chain found in earlier rows with this example; when several chains reach
// the same count, the one found last overwrites the others.
for (i, dyn_prev) in dyn_mat.iter().enumerate() {
for (j, dpr) in dyn_prev.iter().enumerate() {
if let Some((chars_inter, _prev)) = dpr {
assert_eq!(chars_inter.len(), j);
let new_chars_common = chars_inter.union(&chars_common);
let new_chars_common_len = new_chars_common.len();
// Only keep extensions that add at least one character and do not overshoot.
if new_chars_common_len > chars_inter.len()
&& new_chars_common_len <= todo_chars
{
dyn_row[new_chars_common_len] = Some((new_chars_common, Some((i, j))));
}
}
}
}
dyn_mat.push(dyn_row);
}
// Find combination that does that with a good number of examples (tgt_len)
// `tgt_len` is how many examples to add now so that the running total of examples
// follows `avg_len` per batch; it can be negative.
let tgt_len = (avg_len * (batch_count as f32 + 1.)).ceil() as i64
- (sum_len + batch.examples.len()) as i64;
// Number of examples in the chain ending at row `i` that adds exactly `todo_chars`
// characters, or `None` when that row has no such chain.
let dyn_mat_cnt = |i| {
let mut cnt = 0;
let mut i: usize = i;
let mut j: usize = todo_chars;
loop {
match &dyn_mat[i][j] {
None => return None,
Some((_, ij_prev)) => {
cnt += 1;
match ij_prev {
Some((iprev, jprev)) => {
i = *iprev;
j = *jprev;
}
None => return Some(cnt),
}
}
}
}
};
// Pick the chain whose example count is closest to `tgt_len` (squared distance).
let i_opt = (0..dyn_mat.len())
.filter_map(|pos| dyn_mat_cnt(pos).map(|cnt| (pos, cnt)))
.min_by_key(|(_, cnt)| {
let x = *cnt as i64 - tgt_len;
x * x
});
let i = match i_opt {
None => {
println!(
"WARNING: cannot make exactly {} chars, interrupting",
todo_chars
);
// Give up on this level; `batch` (including any carried-over content) is dropped.
return None;
}
Some((pos, _)) => pos,
};
// Take all examples from that combination and add them to current batch
let (mut i, mut j) = (i, todo_chars);
loop {
match &dyn_mat[i][j] {
None => panic!("dyn_mat[{}][{}] == None", i, j),
Some((chars, ij_prev)) => {
println!(
"Add {}: {}",
examples[i].chars.inter(&chars).to_string(),
examples[i].ja
);
batch.examples.push(examples[i].clone());
// Removing index `i` is safe while walking the chain: every remaining link points to a
// strictly smaller index (asserted below), which the removal does not shift.
examples.remove(i);
batch.chars = batch.chars.union(&chars);
match ij_prev {
Some((iprev, jprev)) => {
assert!(*iprev < i);
i = *iprev;
j = *jprev;
}
None => break,
}
}
}
}
assert_eq!(batch.chars.len(), CHARS_PER_BATCH);
println!(
"-> batch {:03}: {} with {} examples",
batches.len(),
batch.chars.to_string(),
batch.examples.len()
);
// Record the finished batch and update the running statistics.
batch_count += 1;
done = done.union(&batch.chars);
sum_len += batch.examples.len();
batches.push(batch);
}
}
/// Greedily selects examples from `all_examples` until every character of `chars` is
/// covered, or no remaining example covers any missing character.
///
/// At each step the example with the highest score is taken. The score compares, in
/// order: fewer characters from `avoid`, more still-missing characters, more characters
/// of `chars` overall, a shorter sentence, and finally the example's character count
/// plus a small random term.
///
/// Characters left uncovered and characters of `avoid` that ended up being used are
/// reported on standard output. Returns the selected examples in selection order.
fn level_examples<'a>(
    chars: &Charset,
    avoid: &Charset,
    all_examples: &'a [Example],
) -> Vec<&'a Example> {
    println!("Calculating examples for {}", chars.to_string());

    let mut todo = chars.clone();
    let mut bad = Charset::default();
    let mut examples = vec![];

    // Ranking key; larger is better.
    let cost = |candidate: &Example, missing_hits: usize, wanted_hits: usize| {
        (
            -(candidate.chars.inter_len(&avoid) as i32),
            missing_hits,
            wanted_hits,
            -(candidate.ja.chars().count() as i32),
            candidate.chars.len() + thread_rng().gen_range(0..5),
        )
    };

    // Each entry: (example, wanted characters still missing, wanted characters in total).
    // Both counts start out equal because nothing has been covered yet.
    let mut all_with_inter = all_examples
        .par_iter()
        .map(|candidate| {
            let wanted_hits = candidate.chars.inter_len(&chars);
            (candidate, wanted_hits, wanted_hits)
        })
        .collect::<Vec<_>>();

    while !todo.is_empty() {
        let best = all_with_inter
            .par_iter()
            .enumerate()
            .filter(|(_, (_, missing_hits, _))| *missing_hits > 0)
            .max_by_key(|(_, (candidate, missing_hits, wanted_hits))| {
                cost(*candidate, *missing_hits, *wanted_hits)
            });

        let (pos, picked) = match best {
            // No example covers any of the characters still missing.
            None => break,
            Some((pos, (candidate, missing_hits, _))) => {
                let candidate = *candidate;
                assert_eq!(*missing_hits, candidate.chars.inter(&todo).len());
                (pos, candidate)
            }
        };

        examples.push(picked);
        all_with_inter.remove(pos);
        todo = todo.diff(&picked.chars);
        bad = bad.union(&picked.chars.inter(&avoid));

        // Only entries sharing a character with the picked example can have changed.
        all_with_inter
            .par_iter_mut()
            .for_each(|(other, missing_hits, _)| {
                if other.chars.inter_len(&picked.chars) > 0 {
                    *missing_hits = other.chars.inter_len(&todo);
                }
            });
    }

    if !todo.is_empty() {
        println!("MISSING: NO SENTENCES FOR {}", todo.to_string());
    }
    if !bad.is_empty() {
        println!("USED BAD CHARS: {}", bad.to_string());
    }
    examples
}
/// Removes redundant examples from `batch`.
///
/// An example is redundant when every batch character it contains is also carried by at
/// least one other example still in the batch. Examples are checked in order, and each
/// removal is logged to standard output.
///
/// The per-character usage counts are decremented whenever an example is removed, so a
/// character that starts out covered never ends up with no example at all. (Without the
/// decrement, two examples that were the only carriers of the same character would both
/// be judged redundant and both be removed.)
///
/// An example containing none of the batch's characters is always removed.
fn simplify_batch(batch: &mut Batch) {
    // How many of the current examples contain each batch character.
    let mut char_cnt = HashMap::<char, usize>::new();
    for ex in batch.examples.iter() {
        for ch in batch.chars.inter(&ex.chars).chars() {
            *char_cnt.entry(*ch).or_default() += 1;
        }
    }

    // Counts only ever decrease, so an example kept once can never become removable
    // later: a single in-order pass gives the same result as rescanning from the start
    // after each removal.
    let batch_chars = &batch.chars;
    batch.examples.retain(|ex| {
        let covered = batch_chars.inter(&ex.chars);
        let redundant = covered.chars().iter().all(|ch| char_cnt[ch] >= 2);
        if redundant {
            println!("Removing {} [{}]", ex.ja, ex.chars.to_string());
            // This example no longer counts as a carrier of its characters.
            for ch in covered.chars().iter() {
                if let Some(cnt) = char_cnt.get_mut(ch) {
                    *cnt -= 1;
                }
            }
        }
        !redundant
    });
}
/// Recomputes the derived fields of every batch, in order.
///
/// For each batch:
/// * `level` becomes the names of the levels (from `kanji_levels`) that share a character
///   with the batch's own characters, joined with "/"; when more than two levels match,
///   only the first and the last are kept.
/// * `chars_bad` becomes the characters used by the examples that no batch up to and
///   including this one has introduced.
/// * `chars_p1` / `chars_p2` become the characters used by the examples that were
///   introduced by the previous batch / the batch before that.
fn cleanup_batches(all_batches: &mut [Batch], kanji_levels: &[(String, Charset)]) {
    let mut prev1 = Charset::default();
    let mut prev2 = Charset::default();
    let mut introduced = Charset::default();

    for batch in all_batches {
        // Every character appearing in this batch's examples.
        let used = Charset::from_iter(
            batch
                .examples
                .iter()
                .flat_map(|ex| ex.chars.chars().iter().copied()),
        );

        let mut level_names: Vec<String> = kanji_levels
            .iter()
            .filter(|(_, chars)| chars.inter_len(&batch.chars) > 0)
            .map(|(name, _)| name.to_string())
            .collect();
        // Keep only the two extremes when the batch spans more than two levels.
        if level_names.len() > 2 {
            let last = level_names.len() - 1;
            level_names.drain(1..last);
        }
        batch.level = level_names.join("/");

        introduced = introduced.union(&batch.chars);
        batch.chars_bad = used.diff(&introduced);
        batch.chars_p1 = used.inter(&prev1);
        batch.chars_p2 = used.inter(&prev2);

        // Shift the window: this batch becomes "previous", the old previous becomes "two back".
        prev2 = std::mem::replace(&mut prev1, batch.chars.clone());
    }
}
/// Recomputes `extra_vocab` for every batch from the JLPT vocabulary list.
///
/// A word is attached to a batch when all of the following hold:
/// * it contains at least one of the batch's own characters;
/// * its level is compatible with the batch's level label (see `match_level` below);
/// * every character of the word has been introduced by this batch or an earlier one;
/// * its `kanji` text appears in no example (`ja` or `expl`) of this batch or the nine
///   batches that follow it.
///
/// The vocabulary selected for each batch is logged to standard output. The log shows
/// the newly computed list; previously it printed the batch's stale `extra_vocab`.
///
/// # Panics
/// Panics if a vocabulary entry has a level other than "N0" to "N5".
fn add_vocab(all_batches: &mut [Batch], vocab: &[JlptVocab]) {
    // A word of level Nk matches a batch whose label mentions Nk or any lower-numbered
    // level (N0 being the lowest number).
    let match_level = |batch: &Batch, level: &str| {
        let n5 = batch.level.contains("N5");
        let n4 = batch.level.contains("N4");
        let n3 = batch.level.contains("N3");
        let n2 = batch.level.contains("N2");
        let n1 = batch.level.contains("N1");
        let n0 = batch.level.contains("N0");
        match level {
            "N5" => n5 || n4 || n3 || n2 || n1 || n0,
            "N4" => n4 || n3 || n2 || n1 || n0,
            "N3" => n3 || n2 || n1 || n0,
            "N2" => n2 || n1 || n0,
            "N1" => n1 || n0,
            "N0" => n0,
            _ => panic!("invalid vocab level {}", level),
        }
    };

    let mut done = Charset::default();
    // Collected separately because the batches are still borrowed immutably while the
    // look-ahead window below is being read.
    let mut extra_vocab = vec![];
    for (i, batch) in all_batches.iter().enumerate() {
        let done_after = done.union(&batch.chars);
        // This batch and up to nine following ones.
        let window = &all_batches[i..std::cmp::min(all_batches.len(), i + 10)];
        let batch_extra_vocab = vocab
            .iter()
            .filter(|v| v.chars.inter_len(&batch.chars) > 0)
            .filter(|v| match_level(batch, &v.level))
            .filter(|v| v.chars.diff(&done_after).is_empty())
            .filter(|v| {
                !window.iter().any(|b| {
                    b.examples
                        .iter()
                        .any(|ex| ex.ja.contains(&v.kanji) || ex.expl.contains(&v.kanji))
                })
            })
            .cloned()
            .collect::<Vec<_>>();

        println!("---- BATCH #{:03} ----", i);
        for v in batch_extra_vocab.iter() {
            println!("{}", v.to_string());
        }
        extra_vocab.push(batch_extra_vocab);
        done = done_after;
    }

    for (batch, vocab) in all_batches.iter_mut().zip(extra_vocab.into_iter()) {
        batch.extra_vocab = vocab;
    }
}