diff options
-rw-r--r-- | Cargo.lock | 54 | ||||
-rw-r--r-- | Cargo.toml | 2 | ||||
-rw-r--r-- | src/main.rs | 523 |
3 files changed, 430 insertions, 149 deletions
@@ -109,6 +109,7 @@ name = "datagengo" version = "0.1.0" dependencies = [ "anyhow", + "rand", "rayon", "roxmltree", "serde", @@ -123,6 +124,17 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7fcaabb2fef8c910e7f4c7ce9f67a1283a1715879a7c230ca9d6d1ae31f16d91" [[package]] +name = "getrandom" +version = "0.2.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "be4136b2a15dd319360be1c07d9933517ccf0be8f16bf62a3bee4f0d618df427" +dependencies = [ + "cfg-if", + "libc", + "wasi", +] + +[[package]] name = "heck" version = "0.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" @@ -184,6 +196,12 @@ dependencies = [ ] [[package]] +name = "ppv-lite86" +version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de" + +[[package]] name = "proc-macro-error" version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" @@ -226,6 +244,36 @@ dependencies = [ ] [[package]] +name = "rand" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" +dependencies = [ + "libc", + "rand_chacha", + "rand_core", +] + +[[package]] +name = "rand_chacha" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" +dependencies = [ + "ppv-lite86", + "rand_core", +] + +[[package]] +name = "rand_core" +version = "0.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" +dependencies = [ + "getrandom", +] + +[[package]] name = "rayon" version = "1.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" @@ -391,6 +439,12 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f" [[package]] +name = "wasi" +version = "0.11.0+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" + +[[package]] name = "winapi" version = "0.3.9" source = "registry+https://github.com/rust-lang/crates.io-index" @@ -12,4 +12,4 @@ serde_json = "1.0" structopt = "0.3" rayon = "1.7" roxmltree = "0.18" - +rand = "0.8" diff --git a/src/main.rs b/src/main.rs index 126f4de..8e2ada4 100644 --- a/src/main.rs +++ b/src/main.rs @@ -2,6 +2,7 @@ use std::collections::HashMap; use std::fs; use std::io::{self, BufRead, Write}; +use rand::prelude::*; use anyhow::{anyhow, Result}; use rayon::prelude::*; use serde::{Deserialize, Serialize}; @@ -67,13 +68,8 @@ fn main() { batches.truncate(t); } println!("---- starting after {} batches ----", batches.len()); - for _ in 0..count { - let batch = gen_batch(&batches, &kanji_levels, &ex).expect("gen_batch"); - if batch.examples.is_empty() { - break; - } - batches.push(batch); - } + let target_len = batches.len() + count; + gen_batches(&mut batches, target_len, &kanji_levels, &ex); fs::write( "data/batches.json", serde_json::to_string_pretty(&batches) @@ -286,7 +282,7 @@ fn read_examples(all_kanji: &Charset) -> Result<Vec<Example>> { } } if i % 10000 == 0 { - eprintln!("read examples: {}/300", i / 1000); + eprintln!("read examples: {}/300 (x1000)", i / 1000); } } @@ -316,165 +312,396 @@ struct Batch { examples: Vec<Example>, } -fn gen_batch( - previous: &[Batch], +fn gen_batches( + batches: &mut Vec<Batch>, + target_len: usize, kanji_levels: &[(String, Charset)], examples: &[Example], -) -> Result<Batch> { - let prev_chars = Charset::from_iter( - previous - .iter() - .map(|x| x.chars.chars().iter().copied()) - .flatten(), - ); +) { + let mut remainder = None; + while batches.len() < target_len { + let done = Charset::from_iter( + batches + .iter() + .map(|x| x.chars.chars().iter().copied()) + .flatten() + .chain(remainder.as_ref().map(|x: &Batch| x.chars.chars()) + .unwrap_or_default().iter().copied()), + ); + + let mut advanced = false; + for (i, (level, chars)) in kanji_levels.iter().enumerate() { + let diff = chars.diff(&done); + if !diff.is_empty() { + let avoid = Charset::from_iter( + kanji_levels + .iter() + .skip(i + 1) + .filter(|(l, _)| !l.ends_with("-9") && !l.ends_with("-10")) + .map(|(_, c)| c.chars().iter().copied()) + .flatten(), + ); - let (mut target_i, target_level, mut target_chars) = kanji_levels - .iter() - .enumerate() - .map(|(i, (l, c))| (i, l, c.diff(&prev_chars))) - .find(|(_, _, c)| !c.is_empty()) - .ok_or(anyhow!("no more batches to make!"))?; + let level_examples = level_examples(&diff, &avoid, examples); + if !level_examples.is_empty() { + remainder = gen_level(batches, level, &diff, &done, &avoid, level_examples, remainder); + advanced = true; + break; + } + } + } - let chars_p1 = previous - .iter() - .rev() - .next() - .map(|b| b.chars.clone()) - .unwrap_or(Charset::default()); + if !advanced { + break; + } + } + if let Some(r) = remainder { + if batches.len() < target_len { + batches.push(r); + } + } +} - let chars_p2 = previous - .iter() - .rev() - .skip(1) - .next() - .map(|b| b.chars.clone()) - .unwrap_or(Charset::default()); +fn gen_level( + batches: &mut Vec<Batch>, + level: &str, + chars: &Charset, + done: &Charset, + avoid: &Charset, + mut examples: Vec<&Example>, + mut remainder: Option<Batch>, +) -> Option<Batch> { + examples.shuffle(&mut thread_rng()); - let mut chars_late = Charset::default(); + println!( + "Level {}: {} characters using {} examples", + level, + chars.len(), + examples.len() + ); - let mut chars_bad = Charset::from_iter( - kanji_levels + let mut done = remainder.as_ref() + .map(|x| x.chars.clone()) + .unwrap_or_default() + .union(&done); + + 'outer: while !examples.is_empty() { + println!("iter with {} examples", examples.len()); + let mut batch = remainder.take().unwrap_or_else(|| Batch { + level: level.to_string(), + chars: Charset::default(), + chars_p1: Charset::default(), + chars_p2: Charset::default(), + chars_bad: Charset::default(), + examples: vec![], + }); + let n_chars = 20 - batch.chars.len(); + + let mut dyn_mat: Vec<Vec<Option<(Charset, Option<(usize, usize)>)>>> = vec![]; + for ex in examples.iter() { + let mut dyn_row = vec![None; n_chars + 1]; + let chars_common = ex.chars.inter(&chars).diff(&done); + if chars_common.len() < dyn_row.len() { + dyn_row[chars_common.len()] = Some((chars_common.clone(), None)); + } + for (i, dyn_prev) in dyn_mat.iter().enumerate() { + for (j, dpr) in dyn_prev.iter().enumerate() { + if let Some((chars_inter, _prev)) = dpr { + assert_eq!(chars_inter.len(), j); + let new_chars_common = chars_inter.union(&chars_common); + let new_chars_common_len = new_chars_common.len(); + if new_chars_common_len > chars_inter.len() && new_chars_common_len <= n_chars { + dyn_row[new_chars_common_len] = Some((new_chars_common, Some((i, j)))); + } + } + } + } + let stop = dyn_row[n_chars].is_some(); + dyn_mat.push(dyn_row); + if stop { + break; + } + } + let (mut i, mut j) = (dyn_mat.len() - 1, n_chars); + loop { + match &dyn_mat[i][j] { + None => break 'outer, + Some((chars, ij_prev)) => { + println!("Add {}: {}", examples[i].chars.inter(&chars).to_string(), examples[i].ja); + batch.examples.push(examples[i].clone()); + examples.remove(i); + batch.chars = batch.chars.union(&chars); + match ij_prev { + Some((iprev, jprev)) => { + assert!(*iprev < i); + i = *iprev; + j = *jprev; + } + None => break, + } + } + } + } + assert_eq!(batch.chars.len(), 20); + + batch.chars_p1 = batches .iter() - .skip(target_i + 1) - .map(|(_, c)| c.chars().iter().copied()) - .flatten(), - ); - let mut chars_bad_avoid = Charset::from_iter( - kanji_levels + .rev() + .next() + .map(|b| b.chars.clone()) + .unwrap_or(Charset::default()) + .inter(&batch.chars); + + batch.chars_p2 = batches .iter() - .skip(target_i + 1) - .filter(|(l, _)| !l.ends_with("-9") && !l.ends_with("-10")) - .map(|(_, c)| c.chars().iter().copied()) - .flatten(), - ); + .rev() + .skip(1) + .next() + .map(|b| b.chars.clone()) + .unwrap_or(Charset::default()) + .inter(&batch.chars); + + batch.chars_bad = batch.chars.inter(&avoid); + done = done.union(&batch.chars); + + println!("-> batch {:03}: {} with {} examples", batches.len(), batch.chars.to_string(), batch.examples.len()); + batches.push(batch); + } - let mut batch = Batch { - level: target_level.to_string(), - chars: Charset::default(), - chars_p1: Charset::default(), - chars_p2: Charset::default(), - chars_bad: Charset::default(), - examples: Vec::new(), - }; - let mut batch_chars = Charset::default(); - - eprintln!("----"); - eprintln!("Level : {}", batch.level); - eprintln!("Target : {}", target_chars.to_string()); - eprintln!("Prev1 : {}", chars_p1.to_string()); - eprintln!("Prev2 : {}", chars_p2.to_string()); - eprintln!("Bad : {} characters", chars_bad.len()); - - let batch_len = 20; - let mut stalled = false; - while batch.chars.len() < batch_len && !target_chars.is_empty() { - let need = batch_len - batch.chars.len(); - let should_add = need > target_chars.len() && target_chars.len() <= 3; - if target_i + 1 < kanji_levels.len() && (should_add || stalled) { - // upgrade to next level - target_i += 1; - chars_late = chars_late.union(&target_chars); - target_chars = target_chars.union(&kanji_levels[target_i].1.diff(&prev_chars)); - chars_bad = chars_bad.diff(&target_chars); - chars_bad_avoid = chars_bad_avoid.diff(&target_chars); - if batch.examples.is_empty() { - batch.level = kanji_levels[target_i].0.to_string(); - } else { - batch.level = format!("{} + {}", batch.level, kanji_levels[target_i].0); - } - eprintln!("Level : {}", batch.level); - eprintln!("Target: {}", target_chars.to_string()); - eprintln!("Late : {}", chars_late.to_string()); - eprintln!("Bad : {} characters", chars_bad.len()); - stalled = false; - } - /* this one works well enough - let cost = |ex: &Example, ex_tgt_inter: usize| { - 20i32 * ex_tgt_inter as i32 - + 30i32 * ex.chars.inter_len(&chars_late) as i32 - + 6i32 * ex.chars.inter_len(&batch.chars) as i32 - + 4i32 * ex.chars.inter_len(&chars_p1) as i32 - + 3i32 * ex.chars.inter_len(&chars_p2) as i32 - - 40i32 * ex.chars.inter_len(&chars_bad) as i32 - }; - */ - let cost = |ex: &Example, ex_tgt_inter: usize| { - ( - -(ex.chars.inter_len(&chars_bad_avoid) as i32), - ex_tgt_inter, - ex.chars.inter_len(&chars_late), - 2 * ex.chars.inter_len(&chars_p1) + ex.chars.inter_len(&chars_p2), - -(ex.ja.chars().count() as i32), - ) + if !examples.is_empty() { + let n_remaining = chars.diff(&done).len(); + println!("still {} examples, {} chars", examples.len(), n_remaining); + if n_remaining > 20 { + return None; + } + let ex_chars = Charset::from_iter( + examples.iter() + .map(|x| x.chars.chars().iter().copied()) + .flatten()).diff(&done); + assert!(ex_chars.len() <= n_remaining); + let remainder = Batch { + level: level.to_string(), + chars: ex_chars, + chars_p1: Charset::default(), + chars_p2: Charset::default(), + chars_bad: Charset::default(), + examples: examples.iter().map(|x| (**x).clone()).collect(), }; - let cand_1 = examples + println!("remainer batch: {:?}", remainder); + Some(remainder) + } else { + None + } +} + +fn level_examples<'a>( + chars: &Charset, + avoid: &Charset, + all_examples: &'a [Example], +) -> Vec<&'a Example> { + println!("Calculating examples for {}", chars.to_string()); + + let mut todo = chars.clone(); + let mut bad = Charset::default(); + let mut examples = vec![]; + + let cost = |ex: &Example, ex_todo_inter: usize, ex_chars_inter: usize| { + ( + -(ex.chars.inter_len(&avoid) as i32), + ex_todo_inter, + ex_chars_inter, + -(ex.ja.chars().count() as i32), + (thread_rng().gen::<usize>()) + ) + }; + let mut all_with_inter = all_examples + .par_iter() + .map(|ex| (ex, ex.chars.inter_len(&chars))) + .map(|(ex, ex_chars_inter)| (ex, ex_chars_inter, ex_chars_inter)) + .collect::<Vec<_>>(); + + while !todo.is_empty() { + let best = all_with_inter.par_iter().enumerate() + .filter(|(_, (_, ex_todo_inter, _))| (1..=4).contains(ex_todo_inter)) + .max_by_key( + |(_, (ex, ex_todo_inter, ex_chars_inter))| cost(*ex, *ex_todo_inter, *ex_chars_inter), + ); + if let Some((i, (ex, ex_todo_inter, _))) = best { + let ex = *ex; + assert_eq!(*ex_todo_inter, ex.chars.inter(&todo).len()); + examples.push(ex); + all_with_inter.remove(i); + todo = todo.diff(&ex.chars); + bad = bad.union(&ex.chars.inter(&avoid)); + all_with_inter + .par_iter_mut() + .for_each(|(ex2, ex_todo_inter, _)| { + if ex2.chars.inter_len(&ex.chars) > 0 { + *ex_todo_inter = ex2.chars.inter_len(&todo); + } + }); + } else { + break; + } + } + if !todo.is_empty() { + println!("MISSING: NO SENTENCES FOR {}", todo.to_string()); + } + if !bad.is_empty() { + println!("USED BAD CHARS: {}", bad.to_string()); + } + examples +} + +/* + +let (mut target_i, target_level, mut target_chars) = kanji_levels + .iter() + .enumerate() + .map(|(i, (l, c))| (i, l, c.diff(&prev_chars))) + .find(|(_, _, c)| !c.is_empty()) + .ok_or(anyhow!("no more batches to make!"))?; + +let chars_p1 = previous + .iter() + .rev() + .next() + .map(|b| b.chars.clone()) + .unwrap_or(Charset::default()); + +let chars_p2 = previous + .iter() + .rev() + .skip(1) + .next() + .map(|b| b.chars.clone()) + .unwrap_or(Charset::default()); + +let mut chars_late = Charset::default(); + +let mut chars_bad = Charset::from_iter( + kanji_levels + .iter() + .skip(target_i + 1) + .map(|(_, c)| c.chars().iter().copied()) + .flatten(), +); +let mut chars_bad_avoid = Charset::from_iter( + kanji_levels + .iter() + .skip(target_i + 1) + .filter(|(l, _)| !l.ends_with("-9") && !l.ends_with("-10")) + .map(|(_, c)| c.chars().iter().copied()) + .flatten(), +); + +let mut batch = Batch { + level: target_level.to_string(), + chars: Charset::default(), + chars_p1: Charset::default(), + chars_p2: Charset::default(), + chars_bad: Charset::default(), + examples: Vec::new(), +}; +let mut batch_chars = Charset::default(); + +eprintln!("----"); +eprintln!("Level : {}", batch.level); +eprintln!("Target : {}", target_chars.to_string()); +eprintln!("Prev1 : {}", chars_p1.to_string()); +eprintln!("Prev2 : {}", chars_p2.to_string()); +eprintln!("Bad : {} characters", chars_bad.len()); + +let batch_len = 20; +let mut stalled = false; +while batch.chars.len() < batch_len && !target_chars.is_empty() { + let need = batch_len - batch.chars.len(); + let should_add = need > target_chars.len() && target_chars.len() <= 3; + if target_i + 1 < kanji_levels.len() && (should_add || stalled) { + // upgrade to next level + target_i += 1; + chars_late = chars_late.union(&target_chars); + target_chars = target_chars.union(&kanji_levels[target_i].1.diff(&prev_chars)); + chars_bad = chars_bad.diff(&target_chars); + chars_bad_avoid = chars_bad_avoid.diff(&target_chars); + if batch.examples.is_empty() { + batch.level = kanji_levels[target_i].0.to_string(); + } else { + batch.level = format!("{} + {}", batch.level, kanji_levels[target_i].0); + } + eprintln!("Level : {}", batch.level); + eprintln!("Target: {}", target_chars.to_string()); + eprintln!("Late : {}", chars_late.to_string()); + eprintln!("Bad : {} characters", chars_bad.len()); + stalled = false; + } + /* this one works well enough + let cost = |ex: &Example, ex_tgt_inter: usize| { + 20i32 * ex_tgt_inter as i32 + + 30i32 * ex.chars.inter_len(&chars_late) as i32 + + 6i32 * ex.chars.inter_len(&batch.chars) as i32 + + 4i32 * ex.chars.inter_len(&chars_p1) as i32 + + 3i32 * ex.chars.inter_len(&chars_p2) as i32 + - 40i32 * ex.chars.inter_len(&chars_bad) as i32 + }; + */ + let cost = |ex: &Example, ex_tgt_inter: usize| { + ( + -(ex.chars.inter_len(&chars_bad_avoid) as i32), + ex_tgt_inter, + ex.chars.inter_len(&chars_late), + 2 * ex.chars.inter_len(&chars_p1) + ex.chars.inter_len(&chars_p2), + -(ex.ja.chars().count() as i32), + ) + }; + let cand_1 = examples + .par_iter() + .map(|ex| (ex, ex.chars.inter_len(&target_chars))) + .filter(|(_, ex_tgt_inter)| { + (1..=4).contains(ex_tgt_inter) && *ex_tgt_inter + batch.chars.len() <= batch_len + }) + .max_by_key(|(ex, ex_tgt_inter)| cost(ex, *ex_tgt_inter)); + let cand = cand_1.or_else(|| { + examples .par_iter() .map(|ex| (ex, ex.chars.inter_len(&target_chars))) - .filter(|(_, ex_tgt_inter)| { - (1..=4).contains(ex_tgt_inter) && *ex_tgt_inter + batch.chars.len() <= batch_len - }) - .max_by_key(|(ex, ex_tgt_inter)| cost(ex, *ex_tgt_inter)); - let cand = cand_1.or_else(|| { - examples - .par_iter() - .map(|ex| (ex, ex.chars.inter_len(&target_chars))) - .filter(|(_, ex_tgt_inter)| *ex_tgt_inter > 0) - .max_by_key(|(ex, ex_tgt_inter)| cost(ex, *ex_tgt_inter)) - }); - if let Some((ex, _)) = cand { + .filter(|(_, ex_tgt_inter)| *ex_tgt_inter > 0) + .max_by_key(|(ex, ex_tgt_inter)| cost(ex, *ex_tgt_inter)) + }); + if let Some((ex, _)) = cand { + eprintln!( + "* add {} (rep: {}, p1: {}, p2: {}, bad: {}) {}", + ex.chars.inter(&target_chars).to_string(), + ex.chars.inter(&batch.chars).to_string(), + ex.chars.inter(&chars_p1).to_string(), + ex.chars.inter(&chars_p2).to_string(), + ex.chars.inter(&chars_bad).to_string(), + ex.ja + ); + batch.chars = batch.chars.union(&ex.chars.inter(&target_chars)); + target_chars = target_chars.diff(&ex.chars); + chars_late = chars_late.diff(&ex.chars); + batch.examples.push(ex.clone()); + batch_chars = batch_chars.union(&ex.chars); + stalled = false; + } else { + if stalled { eprintln!( - "* add {} (rep: {}, p1: {}, p2: {}, bad: {}) {}", - ex.chars.inter(&target_chars).to_string(), - ex.chars.inter(&batch.chars).to_string(), - ex.chars.inter(&chars_p1).to_string(), - ex.chars.inter(&chars_p2).to_string(), - ex.chars.inter(&chars_bad).to_string(), - ex.ja + "could not find suitable sentence, stopping batch now (need {})", + need ); - batch.chars = batch.chars.union(&ex.chars.inter(&target_chars)); - target_chars = target_chars.diff(&ex.chars); - chars_late = chars_late.diff(&ex.chars); - batch.examples.push(ex.clone()); - batch_chars = batch_chars.union(&ex.chars); - stalled = false; - } else { - if stalled { - eprintln!( - "could not find suitable sentence, stopping batch now (need {})", - need - ); - break; - } - stalled = true; + break; } + stalled = true; } +} - batch.chars_p1 = chars_p1.inter(&batch_chars); - batch.chars_p2 = chars_p2.inter(&batch_chars); - batch.chars_bad = chars_bad.inter(&batch_chars); +batch.chars_p1 = chars_p1.inter(&batch_chars); +batch.chars_p2 = chars_p2.inter(&batch_chars); +batch.chars_bad = chars_bad.inter(&batch_chars); - Ok(batch) -} +Ok(batch) + */ fn simplify_batch(batch: &mut Batch) { let mut char_cnt = HashMap::<char, usize>::new(); |