use std::collections::HashMap;
use std::fs;
use std::cmp::Ordering;
use std::io::{self, BufRead, Write};
use anyhow::{anyhow, Result};
use rayon::prelude::*;
use serde::{Serialize, Deserialize};
use structopt::StructOpt;
#[derive(Debug, StructOpt)]
#[structopt(name = "datagengo", about = "Japanese example practice maker")]
struct Opt {
#[structopt(subcommand)]
cmd: Cmd,
}
#[derive(Debug, StructOpt)]
enum Cmd {
ParseKanjidic,
New {
#[structopt(default_value = "10")]
count: usize,
},
Format,
}
fn main() {
let opt = Opt::from_args();
match opt.cmd {
Cmd::ParseKanjidic => {
let levels = parse_kanjidic().expect("error");
for (jlpt, grade, chars) in levels.iter() {
println!("{}.{}: {}", jlpt, grade, chars);
}
}
Cmd::New{ count } => {
let kanji_levels = read_kanji_levels().expect("read_kanji_levels");
let all_kanji = Charset::new(kanji_levels.iter()
.map(|(_, x)| x.to_string())
.collect::<Vec<_>>()
.join(""));
let kanji_levels = kanji_levels.into_iter()
.map(|(l, x)| (l, Charset::new(x)))
.collect::<Vec<_>>();
let mut ex = read_examples(&all_kanji).expect("read_examples");
ex.retain(|e| (5..=25).contains(&e.ja.chars().count()));
let mut batches: Vec<Batch> = fs::read("data/batches.json")
.map_err(anyhow::Error::from)
.and_then(|x| Ok(serde_json::from_slice(&x)?))
.unwrap_or_default();
println!("---- starting after {} batches ----", batches.len());
for _ in 0..count {
let batch = gen_batch(&batches, &kanji_levels, &ex).expect("gen_batch");
if batch.examples.is_empty() {
break;
}
batches.push(batch);
}
fs::write("data/batches.json", serde_json::to_string_pretty(&batches).expect("serialize").as_bytes()).expect("save");
}
Cmd::Format => {
let batches = fs::read("data/batches.json")
.map_err(anyhow::Error::from)
.and_then(|x| Ok(serde_json::from_slice::<Vec<Batch>>(&x)?))
.expect("read/parse");
batches.par_iter()
.enumerate()
.for_each(|x| format_batch(batches.len(), x));
let kanji_levels = read_kanji_levels().expect("read_kanji_levels");
format_index(&batches, &kanji_levels).expect("format_index");
}
}
}
fn parse_kanjidic() -> Result<Vec<(i32, i32, String)>> {
let file = fs::read_to_string("data/kanjidic2.xml")?;
let xml = roxmltree::Document::parse(&file)?;
let kanjidic = xml.root().first_child().unwrap();
assert!(kanjidic.has_tag_name("kanjidic2"));
let mut levels = HashMap::new();
for x in kanjidic.children() {
if !x.has_tag_name("character") {
continue;
}
let mut literal = None;
let mut jlpt = None;
let mut grade = None;
for y in x.children() {
if y.has_tag_name("literal") {
literal = y.text();
}
if y.has_tag_name("misc") {
for z in y.children() {
if z.has_tag_name("grade") {
grade = z.text().and_then(|x| str::parse::<i32>(x).ok());
}
if z.has_tag_name("jlpt") {
jlpt = z.text().and_then(|x| str::parse::<i32>(x).ok());
}
}
}
}
if jlpt.is_none() && grade.is_none() {
continue;
}
let level = (jlpt.unwrap_or(0), grade.unwrap_or(0));
if let Some(lit) = literal {
levels.entry(level).or_insert(String::new()).extend(lit.chars());
}
}
let mut levels = levels.into_iter().map(|((j, g), c)| (j, g, c)).collect::<Vec<_>>();
levels.sort_by_key(|(j, g, _)| (-*j, *g));
Ok(levels)
}
fn read_kanji_levels() -> Result<Vec<(String, String)>> {
Ok(fs::read_to_string("data/kanji_levels.txt")?
.lines()
.filter_map(|l| l.split_once(": "))
.map(|(l, k)| (l.to_string(), k.to_string()))
.collect::<Vec<_>>())
}
fn read_examples(all_kanji: &Charset) -> Result<Vec<Example>> {
let file = fs::File::open("data/examples.utf")?;
let mut ret = Vec::new();
let mut a = "".to_string();
for (i, line) in io::BufReader::new(file).lines().enumerate() {
let line = line?;
if line.starts_with("A:") {
a = line;
} else if line.starts_with("B:") {
let s = a.strip_prefix("A: ");
let t = line.strip_prefix("B: ");
if let (Some(a), Some(b)) = (s, t) {
if let Some((ja, eng)) = a.split_once("\t") {
if let Some((eng, id)) = eng.split_once("#") {
ret.push(Example {
ja: ja.to_string(),
en: eng.to_string(),
expl: b.to_string(),
id: Some(id.to_string()),
chars: Charset::new(ja).inter(all_kanji),
});
} else {
ret.push(Example {
ja: ja.to_string(),
en: eng.to_string(),
expl: b.to_string(),
id: None,
chars: Charset::new(ja).inter(all_kanji),
});
}
}
}
}
if i % 10000 == 0 {
eprintln!("read examples: {}/300", i/1000);
}
}
Ok(ret)
}
fn gen_batch(previous: &[Batch], kanji_levels: &[(String, Charset)], examples: &[Example]) -> Result<Batch> {
let prev_chars = Charset::from_iter(previous.iter()
.map(|x| x.chars.chars().iter().copied())
.flatten());
let (mut target_i, target_level, mut target_chars) = kanji_levels.iter().enumerate()
.map(|(i, (l, c))| (i, l, c.diff(&prev_chars)))
.find(|(_, _, c)| !c.is_empty())
.ok_or(anyhow!("no more batches to make!"))?;
let chars_p1 = previous.iter().rev().next()
.map(|b| b.chars.clone()).unwrap_or(Charset::default());
let chars_p2 = previous.iter().rev().skip(1).next()
.map(|b| b.chars.clone()).unwrap_or(Charset::default());
let mut chars_late = Charset::default();
let mut chars_bad = Charset::from_iter(kanji_levels.iter().skip(target_i+1)
.map(|(_, c)| c.chars().iter().copied())
.flatten());
let mut batch = Batch {
level: target_level.to_string(),
chars: Charset::default(),
chars_p1: Charset::default(),
chars_p2: Charset::default(),
chars_bad: Charset::default(),
examples: Vec::new(),
};
let mut batch_chars = Charset::default();
eprintln!("----");
eprintln!("Level : {}", batch.level);
eprintln!("Target : {}", target_chars.to_string());
eprintln!("Prev1 : {}", chars_p1.to_string());
eprintln!("Prev2 : {}", chars_p2.to_string());
eprintln!("Bad : {} characters", chars_bad.len());
let batch_len = 20;
let mut stalled = false;
while batch.chars.len() < batch_len && !target_chars.is_empty() {
let need = batch_len - batch.chars.len();
let should_add = need > target_chars.len() && target_chars.len() <= 3;
if target_i + 1 < kanji_levels.len() && (should_add || stalled) {
// upgrade to next level
target_i += 1;
chars_late = chars_late.union(&target_chars);
target_chars = target_chars.union(&kanji_levels[target_i].1.diff(&prev_chars));
chars_bad = chars_bad.diff(&target_chars);
if batch.examples.is_empty() {
batch.level = kanji_levels[target_i].0.to_string();
} else {
batch.level = format!("{} + {}", batch.level, kanji_levels[target_i].0);
}
eprintln!("Level : {}", batch.level);
eprintln!("Target: {}", target_chars.to_string());
eprintln!("Late : {}", chars_late.to_string());
eprintln!("Bad : {} characters", chars_bad.len());
stalled = false;
}
let cost = |ex: &Example, ex_tgt_inter: usize| {
20i32 * ex_tgt_inter as i32
+ 30i32 * ex.chars.inter_len(&chars_late) as i32
+ 6i32 * ex.chars.inter_len(&batch.chars) as i32
+ 4i32 * ex.chars.inter_len(&chars_p1) as i32
+ 3i32 * ex.chars.inter_len(&chars_p2) as i32
- 40i32 * ex.chars.inter_len(&chars_bad) as i32
};
let cand_1 = examples.par_iter()
.map(|ex| (ex, ex.chars.inter_len(&target_chars)))
.filter(|(_, ex_tgt_inter)| (1..=4).contains(ex_tgt_inter) && *ex_tgt_inter + batch.chars.len() <= batch_len)
.max_by_key(|(ex, ex_tgt_inter)| cost(ex, *ex_tgt_inter));
let cand = cand_1.or_else(|| {
examples.par_iter()
.map(|ex| (ex, ex.chars.inter_len(&target_chars)))
.filter(|(_, ex_tgt_inter)| *ex_tgt_inter > 0)
.max_by_key(|(ex, ex_tgt_inter)| cost(ex, *ex_tgt_inter))
});
if let Some((ex, _)) = cand {
eprintln!("* add {} (rep: {}, p1: {}, p2: {}, bad: {}) {}",
ex.chars.inter(&target_chars).to_string(),
ex.chars.inter(&batch.chars).to_string(),
ex.chars.inter(&chars_p1).to_string(),
ex.chars.inter(&chars_p2).to_string(),
ex.chars.inter(&chars_bad).to_string(),
ex.ja);
batch.chars = batch.chars.union(&ex.chars.inter(&target_chars));
target_chars = target_chars.diff(&ex.chars);
chars_late = chars_late.diff(&ex.chars);
batch.examples.push(ex.clone());
batch_chars = batch_chars.union(&ex.chars);
stalled = false;
} else {
if stalled {
eprintln!("could not find suitable sentence, stopping batch now (need {})", need);
break;
}
stalled = true;
}
}
batch.chars_p1 = chars_p1.inter(&batch_chars);
batch.chars_p2 = chars_p2.inter(&batch_chars);
batch.chars_bad = chars_bad.inter(&batch_chars);
Ok(batch)
}
fn format_batch(count: usize, (i, batch): (usize, &Batch)) {
format_batch_aux(count, i, batch).expect("format batch");
}
fn format_batch_aux(count: usize, i: usize, batch: &Batch) -> Result<()> {
let mut f = io::BufWriter::new(fs::File::create(format!("html/{:03}.html", i))?);
write!(f, r#"<!DOCTYPE html>
<html>
<head>
<meta charset=\"UTF-8\" />
<title>Batch #{:03}</title>
<link rel="stylesheet" type="text/css" href="style.css" />
</head>
<body>"#, i)?;
writeln!(f, r#"<p><a href="index.html">index</a>"#)?;
for j in 0..count {
if j != i {
writeln!(f, r#" <a href="{:03}.html">{:03}</a>"#, j, j)?;
} else {
writeln!(f, " {:03}", j)?;
}
}
writeln!(f, r#"</p>"#)?;
writeln!(f, "<p>Level: {}</p>", batch.level)?;
writeln!(f, "<p>Characters: {}</p>", batch.chars.to_string())?;
for ex in batch.examples.iter() {
writeln!(f, "<hr />")?;
write!(f, r#"<p class="ja">"#)?;
for c in ex.ja.chars() {
if batch.chars.contains(c) {
write!(f, r#"<span class="char_cur">{}</span>"#, c)?;
} else if batch.chars_p1.contains(c) {
write!(f, r#"<span class="char_p1">{}</span>"#, c)?;
} else if batch.chars_p2.contains(c) {
write!(f, r#"<span class="char_p2">{}</span>"#, c)?;
} else if batch.chars_bad.contains(c) {
write!(f, r#"<span class="char_bad">{}</span>"#, c)?;
} else {
write!(f, "{}", c)?;
}
}
writeln!(f, "</p>")?;
writeln!(f, r#"<p style="text-align: center; font-size: 1.2em">{}</p>"#, ex.en)?;
}
write!(f, "</body></html>")?;
f.flush()?;
Ok(())
}
fn format_index(batches: &[Batch], kanji_levels: &[(String, String)]) -> Result<()> {
let mut f = io::BufWriter::new(fs::File::create("html/index.html")?);
write!(f, r#"<!DOCTYPE html>
<html>
<head>
<meta charset=\"UTF-8\" />
<title>List of batches</title>
<link rel="stylesheet" type="text/css" href="style.css" />
</head>
<body>"#)?;
writeln!(f, "<table>")?;
writeln!(f, "<tr><th>Num</th><th>Level</th><th>Chars</th><th>Examples</th><th>Review</th><th>Ignore</th></tr>")?;
for (i, batch) in batches.iter().enumerate() {
writeln!(f, r#"<tr><td><a href="{:03}.html">{:03}</a></td><td>{}</td><td>{}</td><td> {}</td><td>{}</td><td>{}</td></tr>"#,
i, i,
batch.level,
batch.chars.to_string(),
batch.examples.len(),
batch.chars_p1.union(&batch.chars_p2).to_string(),
batch.chars_bad.to_string())?;
}
writeln!(f, r#"</table>"#)?;
writeln!(f, "<hr />")?;
let all_chars = Charset::from_iter(batches.iter()
.map(|x| x.chars.chars().iter().copied())
.flatten());
writeln!(f, "<table>")?;
writeln!(f, "<tr><th>Level</th><th>Count</th><th>Chars</th><th>Missing chars</th></tr>")?;
for (lvl, chars) in kanji_levels.iter() {
let chars = Charset::new(chars);
let missing = chars.diff(&all_chars);
writeln!(f, "<tr><td>{}</td><td>{}</td><td>{}</td><td>{} ({})</td></tr>",
lvl,
chars.len(), chars.to_string(),
missing.to_string(), missing.len())?;
}
writeln!(f, "</table>")?;
write!(f, "</body></html>")?;
f.flush()?;
Ok(())
}
#[derive(Debug, Clone, Serialize, Deserialize)]
struct Example {
ja: String,
en: String,
expl: String,
id: Option<String>,
chars: Charset,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
struct Batch {
level: String,
chars: Charset,
chars_p1: Charset,
chars_p2: Charset,
chars_bad: Charset,
examples: Vec<Example>,
}
#[derive(Debug, Eq, PartialEq, Hash, Clone, Serialize, Deserialize, Default)]
struct Charset(Vec<char>);
impl Charset {
fn new<S: AsRef<str>>(s: S) -> Self {
let mut chars = s.as_ref().chars().collect::<Vec<_>>();
chars.sort();
chars.dedup();
Self(chars)
}
fn from_iter<S: IntoIterator<Item=char>>(s: S) -> Self {
let mut chars = s.into_iter().collect::<Vec<_>>();
chars.sort();
chars.dedup();
Self(chars)
}
fn intersects(&self, other: &Self) -> bool {
let mut it1 = self.0.iter().peekable();
let mut it2 = other.0.iter().peekable();
while let (Some(c1), Some(c2)) = (it1.peek(), it2.peek()) {
match c1.cmp(c2) {
Ordering::Equal => return true,
Ordering::Less => it1.next(),
Ordering::Greater => it2.next(),
};
}
false
}
fn inter_len(&self, other: &Self) -> usize {
let mut it1 = self.0.iter().peekable();
let mut it2 = other.0.iter().peekable();
let mut ret = 0;
while let (Some(c1), Some(c2)) = (it1.peek(), it2.peek()) {
match c1.cmp(c2) {
Ordering::Equal => {
ret += 1;
it1.next();
it2.next();
}
Ordering::Less => {
it1.next();
}
Ordering::Greater => {
it2.next();
}
};
}
ret
}
fn inter(&self, other: &Self) -> Charset {
let mut it1 = self.0.iter().peekable();
let mut it2 = other.0.iter().peekable();
let mut ret = Vec::new();
while let (Some(c1), Some(c2)) = (it1.peek(), it2.peek()) {
match c1.cmp(c2) {
Ordering::Equal => {
ret.push(**c1);
it1.next();
it2.next();
}
Ordering::Less => {
it1.next();
}
Ordering::Greater => {
it2.next();
}
};
}
Self(ret)
}
fn union(&self, other: &Self) -> Charset {
let mut it1 = self.0.iter().peekable();
let mut it2 = other.0.iter().peekable();
let mut ret = Vec::new();
while let (Some(c1), Some(c2)) = (it1.peek(), it2.peek()) {
match c1.cmp(c2) {
Ordering::Equal => {
ret.push(**c1);
it1.next();
it2.next();
}
Ordering::Less => {
ret.push(**c1);
it1.next();
}
Ordering::Greater => {
ret.push(**c2);
it2.next();
}
};
}
while let Some(c) = it1.peek() {
ret.push(**c);
it1.next();
}
while let Some(c) = it2.peek() {
ret.push(**c);
it2.next();
}
Self(ret)
}
fn diff(&self, other: &Self) -> Charset {
let mut it1 = self.0.iter().peekable();
let mut it2 = other.0.iter().peekable();
let mut ret = Vec::new();
while let (Some(c1), Some(c2)) = (it1.peek(), it2.peek()) {
match c1.cmp(c2) {
Ordering::Equal => {
it1.next();
it2.next();
}
Ordering::Less => {
ret.push(**c1);
it1.next();
}
Ordering::Greater => {
it2.next();
}
};
}
while let Some(c) = it1.peek() {
ret.push(**c);
it1.next();
}
Self(ret)
}
fn len(&self) -> usize {
self.0.len()
}
fn is_empty(&self) -> bool {
self.0.is_empty()
}
fn chars(&self) -> &[char] {
&self.0
}
fn contains(&self, c: char) -> bool {
self.0.binary_search(&c).is_ok()
}
fn to_string(&self) -> String {
self.0.iter().collect::<String>()
}
}
#[cfg(test)]
mod test {
use super::*;
#[test]
fn test_charset() {
let c1 = Charset::new("azerty");
let c2 = Charset::new("uiopqsqdf");
let c3 = Charset::new("hello, world");
assert!(!c1.intersects(&c2));
assert!(c1.intersects(&c3));
assert!(c2.intersects(&c3));
assert_eq!(c1.inter_len(&c2), 0);
assert_eq!(c1.inter_len(&c3), 2);
assert_eq!(c2.inter_len(&c3), 2);
assert_eq!(c1.inter(&c2), Charset::new(""));
assert_eq!(c1.inter(&c3), Charset::new("er"));
assert_eq!(c2.inter(&c3), Charset::new("od"));
assert_eq!(c1.union(&c2), Charset::new("azertyuiopqsdf"));
assert_eq!(c1.union(&c3), Charset::new("azertyhello, world"));
assert_eq!(c2.union(&c3), Charset::new("uiopqsdfhello, world"));
assert_eq!(c1.diff(&c2), Charset::new("azerty"));
assert_eq!(c1.diff(&c3), Charset::new("azty"));
assert_eq!(c2.diff(&c3), Charset::new("uipqsf"));
assert_eq!(c2.diff(&c1), Charset::new("uiopqsdf"));
assert_eq!(c3.diff(&c1), Charset::new("hllo, wold"));
assert_eq!(c3.diff(&c2), Charset::new("hell, wrl"));
}
}