aboutsummaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorAlex Auvolat <alex@adnab.me>2023-07-21 19:32:24 +0200
committerAlex Auvolat <alex@adnab.me>2023-07-21 19:32:24 +0200
commitbea3b164d3da23d6cf1e41d7f054b03bdf858346 (patch)
tree73bb03eb4bd68126c6a51800544d4f120d53ebfc /src
parent41ab1f6eb77b0c3fe24577f2daec901cedb0ad60 (diff)
downloaddatagengo-bea3b164d3da23d6cf1e41d7f054b03bdf858346.tar.gz
datagengo-bea3b164d3da23d6cf1e41d7f054b03bdf858346.zip
pretty good batches!
Diffstat (limited to 'src')
-rw-r--r--src/main.rs109
1 files changed, 90 insertions, 19 deletions
diff --git a/src/main.rs b/src/main.rs
index 6c67680..60c1ded 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -18,7 +18,10 @@ struct Opt {
#[derive(Debug, StructOpt)]
enum Cmd {
ParseKanjidic,
- New,
+ New {
+ #[structopt(default_value = "10")]
+ count: usize,
+ },
Format,
}
@@ -32,7 +35,7 @@ fn main() {
println!("{}.{}: {}", jlpt, grade, chars);
}
}
- Cmd::New => {
+ Cmd::New{ count } => {
let kanji_levels = read_kanji_levels().expect("read_kanji_levels");
let all_kanji = Charset::new(kanji_levels.iter()
.map(|(_, x)| x.to_string())
@@ -48,8 +51,11 @@ fn main() {
.and_then(|x| Ok(serde_json::from_slice(&x)?))
.unwrap_or_default();
println!("---- starting after {} batches ----", batches.len());
- for _ in 0..10 {
+ for _ in 0..count {
let batch = gen_batch(&batches, &kanji_levels, &ex).expect("gen_batch");
+ if batch.examples.is_empty() {
+ break;
+ }
batches.push(batch);
}
fs::write("data/batches.json", serde_json::to_string_pretty(&batches).expect("serialize").as_bytes()).expect("save");
@@ -62,6 +68,9 @@ fn main() {
batches.par_iter()
.enumerate()
.for_each(|x| format_batch(batches.len(), x));
+
+ let kanji_levels = read_kanji_levels().expect("read_kanji_levels");
+ format_index(&batches, &kanji_levels).expect("format_index");
}
}
}
@@ -177,7 +186,7 @@ fn gen_batch(previous: &[Batch], kanji_levels: &[(String, Charset)], examples: &
let chars_p2 = previous.iter().rev().skip(1).next()
.map(|b| b.chars.clone()).unwrap_or(Charset::default());
- let mut chars_missing = Charset::default();
+ let mut chars_late = Charset::default();
let mut chars_bad = Charset::from_iter(kanji_levels.iter().skip(target_i+1)
.map(|(_, c)| c.chars().iter().copied())
@@ -201,13 +210,15 @@ fn gen_batch(previous: &[Batch], kanji_levels: &[(String, Charset)], examples: &
eprintln!("Bad : {} characters", chars_bad.len());
let batch_len = 20;
+ let mut stalled = false;
while batch.chars.len() < batch_len && !target_chars.is_empty() {
let need = batch_len - batch.chars.len();
- if need >= 2 && target_chars.len() <= 1 && target_i + 1 < kanji_levels.len() {
+ let should_add = need > target_chars.len() && target_chars.len() <= 3;
+ if target_i + 1 < kanji_levels.len() && (should_add || stalled) {
// upgrade to next level
target_i += 1;
- chars_missing = chars_missing.union(&target_chars);
- target_chars = target_chars.union(&kanji_levels[target_i].1);
+ chars_late = chars_late.union(&target_chars);
+ target_chars = target_chars.union(&kanji_levels[target_i].1.diff(&prev_chars));
chars_bad = chars_bad.diff(&target_chars);
if batch.examples.is_empty() {
batch.level = kanji_levels[target_i].0.to_string();
@@ -216,19 +227,29 @@ fn gen_batch(previous: &[Batch], kanji_levels: &[(String, Charset)], examples: &
}
eprintln!("Level : {}", batch.level);
eprintln!("Target: {}", target_chars.to_string());
- eprintln!("Missing: {}", chars_missing.to_string());
+ eprintln!("Late : {}", chars_late.to_string());
eprintln!("Bad : {} characters", chars_bad.len());
+ stalled = false;
}
- if let Some((ex, _)) = examples.par_iter()
- .map(|ex| (ex, ex.chars.inter_len(&target_chars)))
- .filter(|(_, ex_tgt_inter)| (1..=4).contains(ex_tgt_inter) && *ex_tgt_inter + batch.chars.len() <= batch_len)
- .max_by_key(|(ex, ex_tgt_inter)|
- 20i32 * *ex_tgt_inter as i32
- + 30i32 * ex.chars.inter_len(&chars_missing) as i32
+ let cost = |ex: &Example, ex_tgt_inter: usize| {
+ 20i32 * ex_tgt_inter as i32
+ + 30i32 * ex.chars.inter_len(&chars_late) as i32
+ 6i32 * ex.chars.inter_len(&batch.chars) as i32
+ 4i32 * ex.chars.inter_len(&chars_p1) as i32
+ 3i32 * ex.chars.inter_len(&chars_p2) as i32
- - 40i32 * ex.chars.inter_len(&chars_bad) as i32) {
+ - 40i32 * ex.chars.inter_len(&chars_bad) as i32
+ };
+ let cand_1 = examples.par_iter()
+ .map(|ex| (ex, ex.chars.inter_len(&target_chars)))
+ .filter(|(_, ex_tgt_inter)| (1..=4).contains(ex_tgt_inter) && *ex_tgt_inter + batch.chars.len() <= batch_len)
+ .max_by_key(|(ex, ex_tgt_inter)| cost(ex, *ex_tgt_inter));
+ let cand = cand_1.or_else(|| {
+ examples.par_iter()
+ .map(|ex| (ex, ex.chars.inter_len(&target_chars)))
+ .filter(|(_, ex_tgt_inter)| *ex_tgt_inter > 0)
+ .max_by_key(|(ex, ex_tgt_inter)| cost(ex, *ex_tgt_inter))
+ });
+ if let Some((ex, _)) = cand {
eprintln!("* add {} (rep: {}, p1: {}, p2: {}, bad: {}) {}",
ex.chars.inter(&target_chars).to_string(),
ex.chars.inter(&batch.chars).to_string(),
@@ -238,12 +259,16 @@ fn gen_batch(previous: &[Batch], kanji_levels: &[(String, Charset)], examples: &
ex.ja);
batch.chars = batch.chars.union(&ex.chars.inter(&target_chars));
target_chars = target_chars.diff(&ex.chars);
- chars_missing = chars_missing.diff(&ex.chars);
+ chars_late = chars_late.diff(&ex.chars);
batch.examples.push(ex.clone());
batch_chars = batch_chars.union(&ex.chars);
+ stalled = false;
} else {
- eprintln!("could not find suitable sentence, stopping batch now (missing {})", need);
- break;
+ if stalled {
+ eprintln!("could not find suitable sentence, stopping batch now (need {})", need);
+ break;
+ }
+ stalled = true;
}
}
@@ -269,7 +294,7 @@ fn format_batch_aux(count: usize, i: usize, batch: &Batch) -> Result<()> {
</head>
<body>"#, i)?;
- writeln!(f, "<p>")?;
+ writeln!(f, r#"<p><a href="index.html">index</a>"#)?;
for j in 0..count {
if j != i {
writeln!(f, r#" <a href="{:03}.html">{:03}</a>"#, j, j)?;
@@ -306,6 +331,52 @@ fn format_batch_aux(count: usize, i: usize, batch: &Batch) -> Result<()> {
Ok(())
}
+/// Writes `html/index.html`, an overview page for all generated batches.
+///
+/// The page contains two tables:
+/// - one row per batch: a link to the batch page (`NNN.html`), its level, its
+///   characters, the number of examples, the union of `chars_p1` and `chars_p2`
+///   (column "Review") and `chars_bad` (column "Ignore");
+/// - one row per entry of `kanji_levels`: the level name, the number of
+///   characters in that level, the characters themselves, and the characters
+///   that do not appear in any batch's `chars`, followed by their count.
+///
+/// # Errors
+///
+/// Returns an error if the file cannot be created or if any write (including
+/// the final flush) fails.
+fn format_index(batches: &[Batch], kanji_levels: &[(String, String)]) -> Result<()> {
+    let mut f = io::BufWriter::new(fs::File::create("html/index.html")?);
+    // This is a raw string: quotes must not be backslash-escaped, otherwise the
+    // backslashes are emitted verbatim and the charset declaration is invalid.
+    write!(f, r#"<!DOCTYPE html>
+    <html>
+    <head>
+    <meta charset="UTF-8" />
+    <title>List of batches</title>
+    <link rel="stylesheet" type="text/css" href="style.css" />
+    </head>
+    <body>"#)?;
+
+    // First table: one row per batch, in generation order.
+    writeln!(f, "<table>")?;
+    writeln!(f, "<tr><th>Num</th><th>Level</th><th>Chars</th><th>Examples</th><th>Review</th><th>Ignore</th></tr>")?;
+    for (i, batch) in batches.iter().enumerate() {
+        writeln!(f, r#"<tr><td><a href="{:03}.html">{:03}</a></td><td>{}</td><td>{}</td><td>&nbsp;&nbsp;{}</td><td>{}</td><td>{}</td></tr>"#,
+            i, i,
+            batch.level,
+            batch.chars.to_string(),
+            batch.examples.len(),
+            batch.chars_p1.union(&batch.chars_p2).to_string(),
+            batch.chars_bad.to_string())?;
+    }
+    writeln!(f, r#"</table>"#)?;
+
+    writeln!(f, "<hr />")?;
+
+    // Every character introduced by at least one batch; used below to work out
+    // what is still missing from each level.
+    let all_chars = Charset::from_iter(batches.iter()
+        .flat_map(|x| x.chars.chars().iter().copied()));
+
+    // Second table: coverage of each kanji level by the batches.
+    writeln!(f, "<table>")?;
+    writeln!(f, "<tr><th>Level</th><th>Count</th><th>Chars</th><th>Missing chars</th></tr>")?;
+    for (lvl, chars) in kanji_levels.iter() {
+        let chars = Charset::new(chars);
+        let missing = chars.diff(&all_chars);
+        writeln!(f, "<tr><td>{}</td><td>{}</td><td>{}</td><td>{} ({})</td></tr>",
+            lvl,
+            chars.len(), chars.to_string(),
+            missing.to_string(), missing.len())?;
+    }
+    writeln!(f, "</table>")?;
+
+    write!(f, "</body></html>")?;
+    // Flush explicitly: dropping the BufWriter would silently discard a write error.
+    f.flush()?;
+    Ok(())
+}
+
#[derive(Debug, Clone, Serialize, Deserialize)]
struct Example {
ja: String,