diff options
author | Alex Auvolat <alex@adnab.me> | 2023-07-21 19:32:24 +0200 |
---|---|---|
committer | Alex Auvolat <alex@adnab.me> | 2023-07-21 19:32:24 +0200 |
commit | bea3b164d3da23d6cf1e41d7f054b03bdf858346 (patch) | |
tree | 73bb03eb4bd68126c6a51800544d4f120d53ebfc /src | |
parent | 41ab1f6eb77b0c3fe24577f2daec901cedb0ad60 (diff) | |
download | datagengo-bea3b164d3da23d6cf1e41d7f054b03bdf858346.tar.gz datagengo-bea3b164d3da23d6cf1e41d7f054b03bdf858346.zip |
pretty good batches!
Diffstat (limited to 'src')
-rw-r--r-- | src/main.rs | 109 |
1 files changed, 90 insertions, 19 deletions
diff --git a/src/main.rs b/src/main.rs index 6c67680..60c1ded 100644 --- a/src/main.rs +++ b/src/main.rs @@ -18,7 +18,10 @@ struct Opt { #[derive(Debug, StructOpt)] enum Cmd { ParseKanjidic, - New, + New { + #[structopt(default_value = "10")] + count: usize, + }, Format, } @@ -32,7 +35,7 @@ fn main() { println!("{}.{}: {}", jlpt, grade, chars); } } - Cmd::New => { + Cmd::New{ count } => { let kanji_levels = read_kanji_levels().expect("read_kanji_levels"); let all_kanji = Charset::new(kanji_levels.iter() .map(|(_, x)| x.to_string()) @@ -48,8 +51,11 @@ fn main() { .and_then(|x| Ok(serde_json::from_slice(&x)?)) .unwrap_or_default(); println!("---- starting after {} batches ----", batches.len()); - for _ in 0..10 { + for _ in 0..count { let batch = gen_batch(&batches, &kanji_levels, &ex).expect("gen_batch"); + if batch.examples.is_empty() { + break; + } batches.push(batch); } fs::write("data/batches.json", serde_json::to_string_pretty(&batches).expect("serialize").as_bytes()).expect("save"); @@ -62,6 +68,9 @@ fn main() { batches.par_iter() .enumerate() .for_each(|x| format_batch(batches.len(), x)); + + let kanji_levels = read_kanji_levels().expect("read_kanji_levels"); + format_index(&batches, &kanji_levels).expect("format_index"); } } } @@ -177,7 +186,7 @@ fn gen_batch(previous: &[Batch], kanji_levels: &[(String, Charset)], examples: & let chars_p2 = previous.iter().rev().skip(1).next() .map(|b| b.chars.clone()).unwrap_or(Charset::default()); - let mut chars_missing = Charset::default(); + let mut chars_late = Charset::default(); let mut chars_bad = Charset::from_iter(kanji_levels.iter().skip(target_i+1) .map(|(_, c)| c.chars().iter().copied()) @@ -201,13 +210,15 @@ fn gen_batch(previous: &[Batch], kanji_levels: &[(String, Charset)], examples: & eprintln!("Bad : {} characters", chars_bad.len()); let batch_len = 20; + let mut stalled = false; while batch.chars.len() < batch_len && !target_chars.is_empty() { let need = batch_len - batch.chars.len(); - if need >= 2 && target_chars.len() <= 1 && target_i + 1 < kanji_levels.len() { + let should_add = need > target_chars.len() && target_chars.len() <= 3; + if target_i + 1 < kanji_levels.len() && (should_add || stalled) { // upgrade to next level target_i += 1; - chars_missing = chars_missing.union(&target_chars); - target_chars = target_chars.union(&kanji_levels[target_i].1); + chars_late = chars_late.union(&target_chars); + target_chars = target_chars.union(&kanji_levels[target_i].1.diff(&prev_chars)); chars_bad = chars_bad.diff(&target_chars); if batch.examples.is_empty() { batch.level = kanji_levels[target_i].0.to_string(); @@ -216,19 +227,29 @@ fn gen_batch(previous: &[Batch], kanji_levels: &[(String, Charset)], examples: & } eprintln!("Level : {}", batch.level); eprintln!("Target: {}", target_chars.to_string()); - eprintln!("Missing: {}", chars_missing.to_string()); + eprintln!("Late : {}", chars_late.to_string()); eprintln!("Bad : {} characters", chars_bad.len()); + stalled = false; } - if let Some((ex, _)) = examples.par_iter() - .map(|ex| (ex, ex.chars.inter_len(&target_chars))) - .filter(|(_, ex_tgt_inter)| (1..=4).contains(ex_tgt_inter) && *ex_tgt_inter + batch.chars.len() <= batch_len) - .max_by_key(|(ex, ex_tgt_inter)| - 20i32 * *ex_tgt_inter as i32 - + 30i32 * ex.chars.inter_len(&chars_missing) as i32 + let cost = |ex: &Example, ex_tgt_inter: usize| { + 20i32 * ex_tgt_inter as i32 + + 30i32 * ex.chars.inter_len(&chars_late) as i32 + 6i32 * ex.chars.inter_len(&batch.chars) as i32 + 4i32 * ex.chars.inter_len(&chars_p1) as i32 + 3i32 * ex.chars.inter_len(&chars_p2) as i32 - - 40i32 * ex.chars.inter_len(&chars_bad) as i32) { + - 40i32 * ex.chars.inter_len(&chars_bad) as i32 + }; + let cand_1 = examples.par_iter() + .map(|ex| (ex, ex.chars.inter_len(&target_chars))) + .filter(|(_, ex_tgt_inter)| (1..=4).contains(ex_tgt_inter) && *ex_tgt_inter + batch.chars.len() <= batch_len) + .max_by_key(|(ex, ex_tgt_inter)| cost(ex, *ex_tgt_inter)); + let cand = cand_1.or_else(|| { + examples.par_iter() + .map(|ex| (ex, ex.chars.inter_len(&target_chars))) + .filter(|(_, ex_tgt_inter)| *ex_tgt_inter > 0) + .max_by_key(|(ex, ex_tgt_inter)| cost(ex, *ex_tgt_inter)) + }); + if let Some((ex, _)) = cand { eprintln!("* add {} (rep: {}, p1: {}, p2: {}, bad: {}) {}", ex.chars.inter(&target_chars).to_string(), ex.chars.inter(&batch.chars).to_string(), @@ -238,12 +259,16 @@ fn gen_batch(previous: &[Batch], kanji_levels: &[(String, Charset)], examples: & ex.ja); batch.chars = batch.chars.union(&ex.chars.inter(&target_chars)); target_chars = target_chars.diff(&ex.chars); - chars_missing = chars_missing.diff(&ex.chars); + chars_late = chars_late.diff(&ex.chars); batch.examples.push(ex.clone()); batch_chars = batch_chars.union(&ex.chars); + stalled = false; } else { - eprintln!("could not find suitable sentence, stopping batch now (missing {})", need); - break; + if stalled { + eprintln!("could not find suitable sentence, stopping batch now (need {})", need); + break; + } + stalled = true; } } @@ -269,7 +294,7 @@ fn format_batch_aux(count: usize, i: usize, batch: &Batch) -> Result<()> { </head> <body>"#, i)?; - writeln!(f, "<p>")?; + writeln!(f, r#"<p><a href="index.html">index</a>"#)?; for j in 0..count { if j != i { writeln!(f, r#" <a href="{:03}.html">{:03}</a>"#, j, j)?; @@ -306,6 +331,52 @@ fn format_batch_aux(count: usize, i: usize, batch: &Batch) -> Result<()> { Ok(()) } +fn format_index(batches: &[Batch], kanji_levels: &[(String, String)]) -> Result<()> { + let mut f = io::BufWriter::new(fs::File::create("html/index.html")?); + write!(f, r#"<!DOCTYPE html> + <html> + <head> + <meta charset=\"UTF-8\" /> + <title>List of batches</title> + <link rel="stylesheet" type="text/css" href="style.css" /> + </head> + <body>"#)?; + + writeln!(f, "<table>")?; + writeln!(f, "<tr><th>Num</th><th>Level</th><th>Chars</th><th>Examples</th><th>Review</th><th>Ignore</th></tr>")?; + for (i, batch) in batches.iter().enumerate() { + writeln!(f, r#"<tr><td><a href="{:03}.html">{:03}</a></td><td>{}</td><td>{}</td><td> {}</td><td>{}</td><td>{}</td></tr>"#, + i, i, + batch.level, + batch.chars.to_string(), + batch.examples.len(), + batch.chars_p1.union(&batch.chars_p2).to_string(), + batch.chars_bad.to_string())?; + } + writeln!(f, r#"</table>"#)?; + + writeln!(f, "<hr />")?; + + let all_chars = Charset::from_iter(batches.iter() + .map(|x| x.chars.chars().iter().copied()) + .flatten()); + writeln!(f, "<table>")?; + writeln!(f, "<tr><th>Level</th><th>Count</th><th>Chars</th><th>Missing chars</th></tr>")?; + for (lvl, chars) in kanji_levels.iter() { + let chars = Charset::new(chars); + let missing = chars.diff(&all_chars); + writeln!(f, "<tr><td>{}</td><td>{}</td><td>{}</td><td>{} ({})</td></tr>", + lvl, + chars.len(), chars.to_string(), + missing.to_string(), missing.len())?; + } + writeln!(f, "</table>")?; + + write!(f, "</body></html>")?; + f.flush()?; + Ok(()) +} + #[derive(Debug, Clone, Serialize, Deserialize)] struct Example { ja: String, |