diff options
author | Alex Auvolat <alex@adnab.me> | 2023-09-25 11:52:53 +0200 |
---|---|---|
committer | Alex Auvolat <alex@adnab.me> | 2023-09-25 11:52:53 +0200 |
commit | 38f5510ad3f372e97e57be738f9c99d887db0037 (patch) | |
tree | f067cf119996695082eeb42b8770fd12a0828ffb /src | |
parent | ecb42da464e00c9084c23a1ecf96e7be1b97dd25 (diff) | |
download | datagengo-38f5510ad3f372e97e57be738f9c99d887db0037.tar.gz datagengo-38f5510ad3f372e97e57be738f9c99d887db0037.zip |
new algorith seems to be working well
Diffstat (limited to 'src')
-rw-r--r-- | src/main.rs | 336 |
1 files changed, 211 insertions, 125 deletions
diff --git a/src/main.rs b/src/main.rs index a2c9488..335add8 100644 --- a/src/main.rs +++ b/src/main.rs @@ -2,8 +2,8 @@ use std::collections::HashMap; use std::fs; use std::io::{self, BufRead, Write}; +use anyhow::Result; use rand::prelude::*; -use anyhow::{anyhow, Result}; use rayon::prelude::*; use serde::{Deserialize, Serialize}; use structopt::StructOpt; @@ -28,10 +28,7 @@ enum Cmd { truncate: Option<usize>, }, Simplify, - Rebalance { - #[structopt(default_value = "5")] - start: usize, - }, + Cleanup, Format, } @@ -94,15 +91,17 @@ fn main() { ) .expect("save"); } - Cmd::Rebalance { start } => { + Cmd::Cleanup => { let mut batches: Vec<Batch> = fs::read("data/batches.json") .map_err(anyhow::Error::from) .and_then(|x| Ok(serde_json::from_slice(&x)?)) .unwrap_or_default(); let kanji_levels = read_kanji_levels().expect("read_kanji_levels"); - for (level, _) in kanji_levels.iter() { - rebalance_level(level, &mut batches[start..]); - } + let kanji_levels = kanji_levels + .into_iter() + .map(|(l, x)| (l, Charset::new(x))) + .collect::<Vec<_>>(); + cleanup_batches(&mut batches, &kanji_levels); fs::write( "data/batches.json", serde_json::to_string_pretty(&batches) @@ -293,7 +292,10 @@ fn read_examples(all_kanji: &Charset) -> Result<Vec<Example>> { // BATCH STRUCTURES AND GENERATION // ===================================================================== -#[derive(Debug, Clone, Serialize, Deserialize)] +const CHARS_PER_BATCH: usize = 20; +const MAX_NEW_CHARS_PER_EX: usize = 5; + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] struct Example { ja: String, en: String, @@ -302,7 +304,7 @@ struct Example { chars: Charset, } -#[derive(Debug, Clone, Serialize, Deserialize)] +#[derive(Debug, Clone, Serialize, Deserialize, Default, PartialEq)] struct Batch { level: String, chars: Charset, @@ -324,14 +326,19 @@ fn gen_batches( batches .iter() .map(|x| x.chars.chars().iter().copied()) - .flatten() - .chain(remainder.as_ref().map(|x: &Batch| x.chars.chars()) - .unwrap_or_default().iter().copied()), + .flatten(), ); + let remainder_chars = remainder + .as_ref() + .map(|x: &Batch| x.chars.clone()) + .unwrap_or_default(); + + let remainder_before = remainder.clone(); + let len_before = batches.len(); let mut advanced = false; - for (i, (level, chars)) in kanji_levels.iter().enumerate() { - let diff = chars.diff(&done); + for (i, (level, level_chars)) in kanji_levels.iter().enumerate() { + let diff = level_chars.diff(&done).diff(&remainder_chars); if !diff.is_empty() { let avoid = Charset::from_iter( kanji_levels @@ -343,15 +350,54 @@ fn gen_batches( ); let level_examples = level_examples(&diff, &avoid, examples); + let level_new_chars = Charset::from_iter( + level_examples + .iter() + .map(|x| x.chars.chars().iter().copied()) + .flatten(), + ) + .inter(&diff); + println!( + "- {} ({} chars): {} done previously, {} diff, {} ex, {} new chars", + level, + level_chars.len(), + done.len(), + diff.len(), + level_examples.len(), + level_new_chars.len() + ); if !level_examples.is_empty() { - remainder = gen_level(batches, level, &diff, &done, &avoid, level_examples, remainder); + assert!(!level_new_chars.is_empty()); + remainder = gen_level( + batches, + level, + &level_new_chars, + &done, + level_examples, + remainder, + ); advanced = true; break; } } } + if let Some(r) = &remainder { + assert!(r.examples.len() <= 20); + } - if !advanced { + if advanced && batches.len() == len_before && remainder == remainder_before { + // restart level with new rng + let last_level = batches.last().unwrap().level.to_string(); + println!("RESTARTING LEVEL {}, hopefully new RNG", last_level); + while batches + .last() + .map(|x| x.level == last_level) + .unwrap_or(false) + { + batches.pop(); + remainder = None; + } + } else if !advanced { break; } } @@ -365,45 +411,82 @@ fn gen_batches( fn gen_level( batches: &mut Vec<Batch>, level: &str, - chars: &Charset, - done: &Charset, - avoid: &Charset, + new_chars: &Charset, + prev_done: &Charset, mut examples: Vec<&Example>, mut remainder: Option<Batch>, ) -> Option<Batch> { examples.shuffle(&mut thread_rng()); + let remainder_chars = remainder.as_ref().map(|x| x.chars.len()).unwrap_or(0); println!( - "Level {}: {} characters using {} examples", + "Level {}: {} characters using {} examples, remainder has {} chars and {} examples", level, - chars.len(), - examples.len() + new_chars.len(), + examples.len(), + remainder_chars, + remainder.as_ref().map(|x| x.examples.len()).unwrap_or(0), ); - let avg_len = examples.len() as f32 * 20. / chars.len() as f32; + let avg_len = examples.len() as f32 * CHARS_PER_BATCH as f32 / new_chars.len() as f32; + let mut batch_count = 0; + let mut sum_len = 0; - let mut done = remainder.as_ref() - .map(|x| x.chars.clone()) - .unwrap_or_default() - .union(&done); + let mut done = prev_done.union( + remainder + .as_ref() + .map(|x| &x.chars) + .unwrap_or(&Charset::default()), + ); - 'outer: while !examples.is_empty() { + loop { println!("iter with {} examples", examples.len()); let mut batch = remainder.take().unwrap_or_else(|| Batch { level: level.to_string(), - chars: Charset::default(), - chars_p1: Charset::default(), - chars_p2: Charset::default(), - chars_bad: Charset::default(), - examples: vec![], + ..Default::default() }); - let n_chars = 20 - batch.chars.len(); + let remaining_chars = new_chars.diff(&done); + let todo_chars = CHARS_PER_BATCH - batch.chars.len(); + if remaining_chars.len() <= todo_chars { + for ex in examples.iter() { + batch.examples.push((*ex).clone()); + batch.chars = batch.chars.union(&ex.chars.diff(&done).inter(&new_chars)); + } + if batch.chars.len() == CHARS_PER_BATCH { + println!( + "-> all remaining examples sum up to exaclty {} chars", + CHARS_PER_BATCH + ); + batches.push(batch); + return None; + } else if batch.examples.is_empty() { + assert!(batch.chars.is_empty()); + println!("-> done"); + return None; + } else { + assert!(batch.chars.len() < CHARS_PER_BATCH); + println!( + "-> with all remaining examples, cannot make a full batch, only {} chars", + batch.chars.len() + ); + return Some(batch); + } + } + assert!(!examples.is_empty()); + println!( + "Trying to add exactly {} characters, using {} examples containing {} new chars", + todo_chars, + examples.len(), + remaining_chars.len() + ); + + // Compute dynamic algorithm matrix with a bunch of combinations that add `todo_chars` let mut dyn_mat: Vec<Vec<Option<(Charset, Option<(usize, usize)>)>>> = vec![]; for ex in examples.iter() { - let mut dyn_row = vec![None; n_chars + 1]; - let chars_common = ex.chars.inter(&chars).diff(&done); - if chars_common.len() > 6 { + let mut dyn_row = vec![None; todo_chars + 1]; + let chars_common = ex.chars.inter(&new_chars).diff(&done); + if chars_common.len() > MAX_NEW_CHARS_PER_EX { dyn_mat.push(dyn_row); continue; } @@ -416,7 +499,9 @@ fn gen_level( assert_eq!(chars_inter.len(), j); let new_chars_common = chars_inter.union(&chars_common); let new_chars_common_len = new_chars_common.len(); - if new_chars_common_len > chars_inter.len() && new_chars_common_len <= n_chars { + if new_chars_common_len > chars_inter.len() + && new_chars_common_len <= todo_chars + { dyn_row[new_chars_common_len] = Some((new_chars_common, Some((i, j)))); } } @@ -424,13 +509,21 @@ fn gen_level( } dyn_mat.push(dyn_row); } + + // Find combination that does that with a good number of examples (tgt_len) + let factor = match level { + "N1b" => 1.08, + _ => 1.0, + }; + let tgt_len = (avg_len * factor * (batch_count as f32 + 1.)).ceil() as i64 + - (sum_len + batch.examples.len()) as i64; let dyn_mat_cnt = |i| { - let mut i: usize = i; let mut cnt = 0; - let mut j: usize = n_chars; + let mut i: usize = i; + let mut j: usize = todo_chars; loop { match &dyn_mat[i][j] { - None => return 1000000, + None => return None, Some((_, ij_prev)) => { cnt += 1; match ij_prev { @@ -438,22 +531,40 @@ fn gen_level( i = *iprev; j = *jprev; } - None => return cnt, + None => return Some(cnt), } } } } }; - let i = (0..dyn_mat.len()).min_by_key(|i| { - let x = dyn_mat_cnt(*i) as i64 - (avg_len as i64 + 2); - x * x - }).unwrap(); - let (mut i, mut j) = (i, n_chars); + let i_opt = (0..dyn_mat.len()) + .filter_map(|pos| dyn_mat_cnt(pos).map(|cnt| (pos, cnt))) + .min_by_key(|(_, cnt)| { + let x = *cnt as i64 - tgt_len; + x * x + }); + let i = match i_opt { + None => { + println!( + "WARNING: cannot make exactly {} chars, interrupting", + todo_chars + ); + return None; + } + Some((pos, _)) => pos, + }; + + // Take all examples from that combination and add them to current batch + let (mut i, mut j) = (i, todo_chars); loop { match &dyn_mat[i][j] { - None => break 'outer, + None => panic!("dyn_mat[{}][{}] == None", i, j), Some((chars, ij_prev)) => { - println!("Add {}: {}", examples[i].chars.inter(&chars).to_string(), examples[i].ja); + println!( + "Add {}: {}", + examples[i].chars.inter(&chars).to_string(), + examples[i].ja + ); batch.examples.push(examples[i].clone()); examples.remove(i); batch.chars = batch.chars.union(&chars); @@ -468,56 +579,19 @@ fn gen_level( } } } - assert_eq!(batch.chars.len(), 20); - - batch.chars_p1 = batches - .iter() - .rev() - .next() - .map(|b| b.chars.clone()) - .unwrap_or(Charset::default()) - .inter(&batch.chars); + assert_eq!(batch.chars.len(), CHARS_PER_BATCH); - batch.chars_p2 = batches - .iter() - .rev() - .skip(1) - .next() - .map(|b| b.chars.clone()) - .unwrap_or(Charset::default()) - .inter(&batch.chars); - - batch.chars_bad = batch.chars.inter(&avoid); + println!( + "-> batch {:03}: {} with {} examples", + batches.len(), + batch.chars.to_string(), + batch.examples.len() + ); + batch_count += 1; done = done.union(&batch.chars); - - println!("-> batch {:03}: {} with {} examples", batches.len(), batch.chars.to_string(), batch.examples.len()); + sum_len += batch.examples.len(); batches.push(batch); } - - if !examples.is_empty() { - let n_remaining = chars.diff(&done).len(); - let ex_chars = Charset::from_iter( - examples.iter() - .map(|x| x.chars.chars().iter().copied()) - .flatten()).diff(&done); - println!("still {} examples, {} chars in ex, {} total", examples.len(), ex_chars.len(), n_remaining); - assert!(ex_chars.len() <= n_remaining); - if ex_chars.len() > 20 { - return None; - } - let remainder = Batch { - level: level.to_string(), - chars: ex_chars, - chars_p1: Charset::default(), - chars_p2: Charset::default(), - chars_bad: Charset::default(), - examples: examples.iter().map(|x| (**x).clone()).collect(), - }; - println!("remainer batch: {:?}", remainder); - Some(remainder) - } else { - None - } } fn level_examples<'a>( @@ -537,7 +611,7 @@ fn level_examples<'a>( ex_todo_inter, ex_chars_inter, -(ex.ja.chars().count() as i32), - (thread_rng().gen::<usize>()) + ex.chars.len() + thread_rng().gen_range(0..5), ) }; let mut all_with_inter = all_examples @@ -547,11 +621,14 @@ fn level_examples<'a>( .collect::<Vec<_>>(); while !todo.is_empty() { - let best = all_with_inter.par_iter().enumerate() - .filter(|(_, (_, ex_todo_inter, ex_tgt_inter))| (1..=6).contains(ex_todo_inter)) - .max_by_key( - |(_, (ex, ex_todo_inter, ex_chars_inter))| cost(*ex, *ex_todo_inter, *ex_chars_inter), - ); + let best = all_with_inter + .par_iter() + .enumerate() + .filter(|(_, (_, ex_todo_inter, _))| *ex_todo_inter > 0) + //.filter(|(_, (_, _, ex_tgt_inter))| (1..=8).contains(ex_tgt_inter)) + .max_by_key(|(_, (ex, ex_todo_inter, ex_chars_inter))| { + cost(*ex, *ex_todo_inter, *ex_chars_inter) + }); if let Some((i, (ex, ex_todo_inter, _))) = best { let ex = *ex; assert_eq!(*ex_todo_inter, ex.chars.inter(&todo).len()); @@ -759,26 +836,34 @@ fn simplify_batch(batch: &mut Batch) { } } -fn rebalance_level(level: &str, batches: &mut [Batch]) { - let mut i_batch = vec![]; - let mut n_ex = 0; - for (i, b) in batches.iter().enumerate() { - if b.level == level { - i_batch.push(i); - n_ex += b.examples.len(); - } - } - if i_batch.len() < 2 { - return; +fn cleanup_batches(all_batches: &mut [Batch], kanji_levels: &[(String, Charset)]) { + let mut chars_p1 = Charset::default(); + let mut chars_p2 = Charset::default(); + let mut done = Charset::default(); + + for batch in all_batches.iter_mut() { + let all_chars = Charset::from_iter( + batch + .examples + .iter() + .map(|x| x.chars.chars().iter().copied()) + .flatten(), + ); + + batch.level = kanji_levels + .iter() + .filter(|(_, chars)| chars.inter_len(&batch.chars) > 0) + .map(|(lvl, _)| lvl.to_string()) + .collect::<Vec<_>>() + .join("/"); + done = done.union(&batch.chars); + batch.chars_bad = all_chars.diff(&done); + batch.chars_p1 = all_chars.inter(&chars_p1); + batch.chars_p2 = all_chars.inter(&chars_p2); + + chars_p2 = chars_p1; + chars_p1 = batch.chars.clone(); } - println!( - "Level {}: {} batches, {} examples, avg {:.2}", - level, - i_batch.len(), - n_ex, - n_ex as f32 / i_batch.len() as f32 - ); - //todo!() } // ===================================================================== @@ -941,17 +1026,18 @@ fn format_index(batches: &[Batch], kanji_levels: &[(String, String)]) -> Result< )?; writeln!(f, "<table>")?; - writeln!(f, "<tr><th>Num</th><th>Level</th><th>Chars</th><th>Examples</th><th>Review</th><th>Ignore</th></tr>")?; + writeln!(f, "<tr><th>Num</th><th>Level</th><th>Chars</th><th>Examples</th><th>B-1</th><th>B-2</th><th>Ignore</th></tr>")?; for (i, batch) in batches.iter().enumerate() { writeln!( f, - r#"<tr><td><a href="{:03}.html">{:03}</a></td><td>{}</td><td>{}</td><td> {}</td><td>{}</td><td>{}</td></tr>"#, + r#"<tr><td><a href="{:03}.html">{:03}</a></td><td>{}</td><td>{}</td><td> {}</td><td>{}</td><td>{}</td><td>{}</td></tr>"#, i, i, batch.level, batch.chars.to_string(), batch.examples.len(), - batch.chars_p1.union(&batch.chars_p2).to_string(), + batch.chars_p1.to_string(), + batch.chars_p2.to_string(), batch.chars_bad.to_string() )?; } |