aboutsummaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorAlex Auvolat <alex@adnab.me>2023-09-24 21:45:32 +0200
committerAlex Auvolat <alex@adnab.me>2023-09-24 21:45:32 +0200
commitfcc8cdf1afcd18029114117fff26635a461db138 (patch)
tree4f7d952712eff2e234f90d63b5c4942ad709c804 /src
parente0f80b10d7874a8317ea0ae4a621fc3556eac491 (diff)
downloaddatagengo-fcc8cdf1afcd18029114117fff26635a461db138.tar.gz
datagengo-fcc8cdf1afcd18029114117fff26635a461db138.zip
First try of a new dynamic algo?? dunno if it will work, we are stuck at N1
Diffstat (limited to 'src')
-rw-r--r--src/main.rs523
1 files changed, 375 insertions, 148 deletions
diff --git a/src/main.rs b/src/main.rs
index 126f4de..8e2ada4 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -2,6 +2,7 @@ use std::collections::HashMap;
use std::fs;
use std::io::{self, BufRead, Write};
+use rand::prelude::*;
use anyhow::{anyhow, Result};
use rayon::prelude::*;
use serde::{Deserialize, Serialize};
@@ -67,13 +68,8 @@ fn main() {
batches.truncate(t);
}
println!("---- starting after {} batches ----", batches.len());
- for _ in 0..count {
- let batch = gen_batch(&batches, &kanji_levels, &ex).expect("gen_batch");
- if batch.examples.is_empty() {
- break;
- }
- batches.push(batch);
- }
+ let target_len = batches.len() + count;
+ gen_batches(&mut batches, target_len, &kanji_levels, &ex);
fs::write(
"data/batches.json",
serde_json::to_string_pretty(&batches)
@@ -286,7 +282,7 @@ fn read_examples(all_kanji: &Charset) -> Result<Vec<Example>> {
}
}
if i % 10000 == 0 {
- eprintln!("read examples: {}/300", i / 1000);
+ eprintln!("read examples: {}/300 (x1000)", i / 1000);
}
}
@@ -316,165 +312,396 @@ struct Batch {
examples: Vec<Example>,
}
-fn gen_batch(
- previous: &[Batch],
+fn gen_batches(
+ batches: &mut Vec<Batch>,
+ target_len: usize,
kanji_levels: &[(String, Charset)],
examples: &[Example],
-) -> Result<Batch> {
- let prev_chars = Charset::from_iter(
- previous
- .iter()
- .map(|x| x.chars.chars().iter().copied())
- .flatten(),
- );
+) {
+ let mut remainder = None;
+ while batches.len() < target_len {
+ let done = Charset::from_iter(
+ batches
+ .iter()
+ .map(|x| x.chars.chars().iter().copied())
+ .flatten()
+ .chain(remainder.as_ref().map(|x: &Batch| x.chars.chars())
+ .unwrap_or_default().iter().copied()),
+ );
+
+ let mut advanced = false;
+ for (i, (level, chars)) in kanji_levels.iter().enumerate() {
+ let diff = chars.diff(&done);
+ if !diff.is_empty() {
+ let avoid = Charset::from_iter(
+ kanji_levels
+ .iter()
+ .skip(i + 1)
+ .filter(|(l, _)| !l.ends_with("-9") && !l.ends_with("-10"))
+ .map(|(_, c)| c.chars().iter().copied())
+ .flatten(),
+ );
- let (mut target_i, target_level, mut target_chars) = kanji_levels
- .iter()
- .enumerate()
- .map(|(i, (l, c))| (i, l, c.diff(&prev_chars)))
- .find(|(_, _, c)| !c.is_empty())
- .ok_or(anyhow!("no more batches to make!"))?;
+ let level_examples = level_examples(&diff, &avoid, examples);
+ if !level_examples.is_empty() {
+ remainder = gen_level(batches, level, &diff, &done, &avoid, level_examples, remainder);
+ advanced = true;
+ break;
+ }
+ }
+ }
- let chars_p1 = previous
- .iter()
- .rev()
- .next()
- .map(|b| b.chars.clone())
- .unwrap_or(Charset::default());
+ if !advanced {
+ break;
+ }
+ }
+ if let Some(r) = remainder {
+ if batches.len() < target_len {
+ batches.push(r);
+ }
+ }
+}
- let chars_p2 = previous
- .iter()
- .rev()
- .skip(1)
- .next()
- .map(|b| b.chars.clone())
- .unwrap_or(Charset::default());
+fn gen_level(
+ batches: &mut Vec<Batch>,
+ level: &str,
+ chars: &Charset,
+ done: &Charset,
+ avoid: &Charset,
+ mut examples: Vec<&Example>,
+ mut remainder: Option<Batch>,
+) -> Option<Batch> {
+ examples.shuffle(&mut thread_rng());
- let mut chars_late = Charset::default();
+ println!(
+ "Level {}: {} characters using {} examples",
+ level,
+ chars.len(),
+ examples.len()
+ );
- let mut chars_bad = Charset::from_iter(
- kanji_levels
+ let mut done = remainder.as_ref()
+ .map(|x| x.chars.clone())
+ .unwrap_or_default()
+ .union(&done);
+
+ 'outer: while !examples.is_empty() {
+ println!("iter with {} examples", examples.len());
+ let mut batch = remainder.take().unwrap_or_else(|| Batch {
+ level: level.to_string(),
+ chars: Charset::default(),
+ chars_p1: Charset::default(),
+ chars_p2: Charset::default(),
+ chars_bad: Charset::default(),
+ examples: vec![],
+ });
+ let n_chars = 20 - batch.chars.len();
+
+ let mut dyn_mat: Vec<Vec<Option<(Charset, Option<(usize, usize)>)>>> = vec![];
+ for ex in examples.iter() {
+ let mut dyn_row = vec![None; n_chars + 1];
+ let chars_common = ex.chars.inter(&chars).diff(&done);
+ if chars_common.len() < dyn_row.len() {
+ dyn_row[chars_common.len()] = Some((chars_common.clone(), None));
+ }
+ for (i, dyn_prev) in dyn_mat.iter().enumerate() {
+ for (j, dpr) in dyn_prev.iter().enumerate() {
+ if let Some((chars_inter, _prev)) = dpr {
+ assert_eq!(chars_inter.len(), j);
+ let new_chars_common = chars_inter.union(&chars_common);
+ let new_chars_common_len = new_chars_common.len();
+ if new_chars_common_len > chars_inter.len() && new_chars_common_len <= n_chars {
+ dyn_row[new_chars_common_len] = Some((new_chars_common, Some((i, j))));
+ }
+ }
+ }
+ }
+ let stop = dyn_row[n_chars].is_some();
+ dyn_mat.push(dyn_row);
+ if stop {
+ break;
+ }
+ }
+ let (mut i, mut j) = (dyn_mat.len() - 1, n_chars);
+ loop {
+ match &dyn_mat[i][j] {
+ None => break 'outer,
+ Some((chars, ij_prev)) => {
+ println!("Add {}: {}", examples[i].chars.inter(&chars).to_string(), examples[i].ja);
+ batch.examples.push(examples[i].clone());
+ examples.remove(i);
+ batch.chars = batch.chars.union(&chars);
+ match ij_prev {
+ Some((iprev, jprev)) => {
+ assert!(*iprev < i);
+ i = *iprev;
+ j = *jprev;
+ }
+ None => break,
+ }
+ }
+ }
+ }
+ assert_eq!(batch.chars.len(), 20);
+
+ batch.chars_p1 = batches
.iter()
- .skip(target_i + 1)
- .map(|(_, c)| c.chars().iter().copied())
- .flatten(),
- );
- let mut chars_bad_avoid = Charset::from_iter(
- kanji_levels
+ .rev()
+ .next()
+ .map(|b| b.chars.clone())
+ .unwrap_or(Charset::default())
+ .inter(&batch.chars);
+
+ batch.chars_p2 = batches
.iter()
- .skip(target_i + 1)
- .filter(|(l, _)| !l.ends_with("-9") && !l.ends_with("-10"))
- .map(|(_, c)| c.chars().iter().copied())
- .flatten(),
- );
+ .rev()
+ .skip(1)
+ .next()
+ .map(|b| b.chars.clone())
+ .unwrap_or(Charset::default())
+ .inter(&batch.chars);
+
+ batch.chars_bad = batch.chars.inter(&avoid);
+ done = done.union(&batch.chars);
+
+ println!("-> batch {:03}: {} with {} examples", batches.len(), batch.chars.to_string(), batch.examples.len());
+ batches.push(batch);
+ }
- let mut batch = Batch {
- level: target_level.to_string(),
- chars: Charset::default(),
- chars_p1: Charset::default(),
- chars_p2: Charset::default(),
- chars_bad: Charset::default(),
- examples: Vec::new(),
- };
- let mut batch_chars = Charset::default();
-
- eprintln!("----");
- eprintln!("Level : {}", batch.level);
- eprintln!("Target : {}", target_chars.to_string());
- eprintln!("Prev1 : {}", chars_p1.to_string());
- eprintln!("Prev2 : {}", chars_p2.to_string());
- eprintln!("Bad : {} characters", chars_bad.len());
-
- let batch_len = 20;
- let mut stalled = false;
- while batch.chars.len() < batch_len && !target_chars.is_empty() {
- let need = batch_len - batch.chars.len();
- let should_add = need > target_chars.len() && target_chars.len() <= 3;
- if target_i + 1 < kanji_levels.len() && (should_add || stalled) {
- // upgrade to next level
- target_i += 1;
- chars_late = chars_late.union(&target_chars);
- target_chars = target_chars.union(&kanji_levels[target_i].1.diff(&prev_chars));
- chars_bad = chars_bad.diff(&target_chars);
- chars_bad_avoid = chars_bad_avoid.diff(&target_chars);
- if batch.examples.is_empty() {
- batch.level = kanji_levels[target_i].0.to_string();
- } else {
- batch.level = format!("{} + {}", batch.level, kanji_levels[target_i].0);
- }
- eprintln!("Level : {}", batch.level);
- eprintln!("Target: {}", target_chars.to_string());
- eprintln!("Late : {}", chars_late.to_string());
- eprintln!("Bad : {} characters", chars_bad.len());
- stalled = false;
- }
- /* this one works well enough
- let cost = |ex: &Example, ex_tgt_inter: usize| {
- 20i32 * ex_tgt_inter as i32
- + 30i32 * ex.chars.inter_len(&chars_late) as i32
- + 6i32 * ex.chars.inter_len(&batch.chars) as i32
- + 4i32 * ex.chars.inter_len(&chars_p1) as i32
- + 3i32 * ex.chars.inter_len(&chars_p2) as i32
- - 40i32 * ex.chars.inter_len(&chars_bad) as i32
- };
- */
- let cost = |ex: &Example, ex_tgt_inter: usize| {
- (
- -(ex.chars.inter_len(&chars_bad_avoid) as i32),
- ex_tgt_inter,
- ex.chars.inter_len(&chars_late),
- 2 * ex.chars.inter_len(&chars_p1) + ex.chars.inter_len(&chars_p2),
- -(ex.ja.chars().count() as i32),
- )
+ if !examples.is_empty() {
+ let n_remaining = chars.diff(&done).len();
+ println!("still {} examples, {} chars", examples.len(), n_remaining);
+ if n_remaining > 20 {
+ return None;
+ }
+ let ex_chars = Charset::from_iter(
+ examples.iter()
+ .map(|x| x.chars.chars().iter().copied())
+ .flatten()).diff(&done);
+ assert!(ex_chars.len() <= n_remaining);
+ let remainder = Batch {
+ level: level.to_string(),
+ chars: ex_chars,
+ chars_p1: Charset::default(),
+ chars_p2: Charset::default(),
+ chars_bad: Charset::default(),
+ examples: examples.iter().map(|x| (**x).clone()).collect(),
};
- let cand_1 = examples
+ println!("remainer batch: {:?}", remainder);
+ Some(remainder)
+ } else {
+ None
+ }
+}
+
+fn level_examples<'a>(
+ chars: &Charset,
+ avoid: &Charset,
+ all_examples: &'a [Example],
+) -> Vec<&'a Example> {
+ println!("Calculating examples for {}", chars.to_string());
+
+ let mut todo = chars.clone();
+ let mut bad = Charset::default();
+ let mut examples = vec![];
+
+ let cost = |ex: &Example, ex_todo_inter: usize, ex_chars_inter: usize| {
+ (
+ -(ex.chars.inter_len(&avoid) as i32),
+ ex_todo_inter,
+ ex_chars_inter,
+ -(ex.ja.chars().count() as i32),
+ (thread_rng().gen::<usize>())
+ )
+ };
+ let mut all_with_inter = all_examples
+ .par_iter()
+ .map(|ex| (ex, ex.chars.inter_len(&chars)))
+ .map(|(ex, ex_chars_inter)| (ex, ex_chars_inter, ex_chars_inter))
+ .collect::<Vec<_>>();
+
+ while !todo.is_empty() {
+ let best = all_with_inter.par_iter().enumerate()
+ .filter(|(_, (_, ex_todo_inter, _))| (1..=4).contains(ex_todo_inter))
+ .max_by_key(
+ |(_, (ex, ex_todo_inter, ex_chars_inter))| cost(*ex, *ex_todo_inter, *ex_chars_inter),
+ );
+ if let Some((i, (ex, ex_todo_inter, _))) = best {
+ let ex = *ex;
+ assert_eq!(*ex_todo_inter, ex.chars.inter(&todo).len());
+ examples.push(ex);
+ all_with_inter.remove(i);
+ todo = todo.diff(&ex.chars);
+ bad = bad.union(&ex.chars.inter(&avoid));
+ all_with_inter
+ .par_iter_mut()
+ .for_each(|(ex2, ex_todo_inter, _)| {
+ if ex2.chars.inter_len(&ex.chars) > 0 {
+ *ex_todo_inter = ex2.chars.inter_len(&todo);
+ }
+ });
+ } else {
+ break;
+ }
+ }
+ if !todo.is_empty() {
+ println!("MISSING: NO SENTENCES FOR {}", todo.to_string());
+ }
+ if !bad.is_empty() {
+ println!("USED BAD CHARS: {}", bad.to_string());
+ }
+ examples
+}
+
+/*
+
+let (mut target_i, target_level, mut target_chars) = kanji_levels
+ .iter()
+ .enumerate()
+ .map(|(i, (l, c))| (i, l, c.diff(&prev_chars)))
+ .find(|(_, _, c)| !c.is_empty())
+ .ok_or(anyhow!("no more batches to make!"))?;
+
+let chars_p1 = previous
+ .iter()
+ .rev()
+ .next()
+ .map(|b| b.chars.clone())
+ .unwrap_or(Charset::default());
+
+let chars_p2 = previous
+ .iter()
+ .rev()
+ .skip(1)
+ .next()
+ .map(|b| b.chars.clone())
+ .unwrap_or(Charset::default());
+
+let mut chars_late = Charset::default();
+
+let mut chars_bad = Charset::from_iter(
+ kanji_levels
+ .iter()
+ .skip(target_i + 1)
+ .map(|(_, c)| c.chars().iter().copied())
+ .flatten(),
+);
+let mut chars_bad_avoid = Charset::from_iter(
+ kanji_levels
+ .iter()
+ .skip(target_i + 1)
+ .filter(|(l, _)| !l.ends_with("-9") && !l.ends_with("-10"))
+ .map(|(_, c)| c.chars().iter().copied())
+ .flatten(),
+);
+
+let mut batch = Batch {
+ level: target_level.to_string(),
+ chars: Charset::default(),
+ chars_p1: Charset::default(),
+ chars_p2: Charset::default(),
+ chars_bad: Charset::default(),
+ examples: Vec::new(),
+};
+let mut batch_chars = Charset::default();
+
+eprintln!("----");
+eprintln!("Level : {}", batch.level);
+eprintln!("Target : {}", target_chars.to_string());
+eprintln!("Prev1 : {}", chars_p1.to_string());
+eprintln!("Prev2 : {}", chars_p2.to_string());
+eprintln!("Bad : {} characters", chars_bad.len());
+
+let batch_len = 20;
+let mut stalled = false;
+while batch.chars.len() < batch_len && !target_chars.is_empty() {
+ let need = batch_len - batch.chars.len();
+ let should_add = need > target_chars.len() && target_chars.len() <= 3;
+ if target_i + 1 < kanji_levels.len() && (should_add || stalled) {
+ // upgrade to next level
+ target_i += 1;
+ chars_late = chars_late.union(&target_chars);
+ target_chars = target_chars.union(&kanji_levels[target_i].1.diff(&prev_chars));
+ chars_bad = chars_bad.diff(&target_chars);
+ chars_bad_avoid = chars_bad_avoid.diff(&target_chars);
+ if batch.examples.is_empty() {
+ batch.level = kanji_levels[target_i].0.to_string();
+ } else {
+ batch.level = format!("{} + {}", batch.level, kanji_levels[target_i].0);
+ }
+ eprintln!("Level : {}", batch.level);
+ eprintln!("Target: {}", target_chars.to_string());
+ eprintln!("Late : {}", chars_late.to_string());
+ eprintln!("Bad : {} characters", chars_bad.len());
+ stalled = false;
+ }
+ /* this one works well enough
+ let cost = |ex: &Example, ex_tgt_inter: usize| {
+ 20i32 * ex_tgt_inter as i32
+ + 30i32 * ex.chars.inter_len(&chars_late) as i32
+ + 6i32 * ex.chars.inter_len(&batch.chars) as i32
+ + 4i32 * ex.chars.inter_len(&chars_p1) as i32
+ + 3i32 * ex.chars.inter_len(&chars_p2) as i32
+ - 40i32 * ex.chars.inter_len(&chars_bad) as i32
+ };
+ */
+ let cost = |ex: &Example, ex_tgt_inter: usize| {
+ (
+ -(ex.chars.inter_len(&chars_bad_avoid) as i32),
+ ex_tgt_inter,
+ ex.chars.inter_len(&chars_late),
+ 2 * ex.chars.inter_len(&chars_p1) + ex.chars.inter_len(&chars_p2),
+ -(ex.ja.chars().count() as i32),
+ )
+ };
+ let cand_1 = examples
+ .par_iter()
+ .map(|ex| (ex, ex.chars.inter_len(&target_chars)))
+ .filter(|(_, ex_tgt_inter)| {
+ (1..=4).contains(ex_tgt_inter) && *ex_tgt_inter + batch.chars.len() <= batch_len
+ })
+ .max_by_key(|(ex, ex_tgt_inter)| cost(ex, *ex_tgt_inter));
+ let cand = cand_1.or_else(|| {
+ examples
.par_iter()
.map(|ex| (ex, ex.chars.inter_len(&target_chars)))
- .filter(|(_, ex_tgt_inter)| {
- (1..=4).contains(ex_tgt_inter) && *ex_tgt_inter + batch.chars.len() <= batch_len
- })
- .max_by_key(|(ex, ex_tgt_inter)| cost(ex, *ex_tgt_inter));
- let cand = cand_1.or_else(|| {
- examples
- .par_iter()
- .map(|ex| (ex, ex.chars.inter_len(&target_chars)))
- .filter(|(_, ex_tgt_inter)| *ex_tgt_inter > 0)
- .max_by_key(|(ex, ex_tgt_inter)| cost(ex, *ex_tgt_inter))
- });
- if let Some((ex, _)) = cand {
+ .filter(|(_, ex_tgt_inter)| *ex_tgt_inter > 0)
+ .max_by_key(|(ex, ex_tgt_inter)| cost(ex, *ex_tgt_inter))
+ });
+ if let Some((ex, _)) = cand {
+ eprintln!(
+ "* add {} (rep: {}, p1: {}, p2: {}, bad: {}) {}",
+ ex.chars.inter(&target_chars).to_string(),
+ ex.chars.inter(&batch.chars).to_string(),
+ ex.chars.inter(&chars_p1).to_string(),
+ ex.chars.inter(&chars_p2).to_string(),
+ ex.chars.inter(&chars_bad).to_string(),
+ ex.ja
+ );
+ batch.chars = batch.chars.union(&ex.chars.inter(&target_chars));
+ target_chars = target_chars.diff(&ex.chars);
+ chars_late = chars_late.diff(&ex.chars);
+ batch.examples.push(ex.clone());
+ batch_chars = batch_chars.union(&ex.chars);
+ stalled = false;
+ } else {
+ if stalled {
eprintln!(
- "* add {} (rep: {}, p1: {}, p2: {}, bad: {}) {}",
- ex.chars.inter(&target_chars).to_string(),
- ex.chars.inter(&batch.chars).to_string(),
- ex.chars.inter(&chars_p1).to_string(),
- ex.chars.inter(&chars_p2).to_string(),
- ex.chars.inter(&chars_bad).to_string(),
- ex.ja
+ "could not find suitable sentence, stopping batch now (need {})",
+ need
);
- batch.chars = batch.chars.union(&ex.chars.inter(&target_chars));
- target_chars = target_chars.diff(&ex.chars);
- chars_late = chars_late.diff(&ex.chars);
- batch.examples.push(ex.clone());
- batch_chars = batch_chars.union(&ex.chars);
- stalled = false;
- } else {
- if stalled {
- eprintln!(
- "could not find suitable sentence, stopping batch now (need {})",
- need
- );
- break;
- }
- stalled = true;
+ break;
}
+ stalled = true;
}
+}
- batch.chars_p1 = chars_p1.inter(&batch_chars);
- batch.chars_p2 = chars_p2.inter(&batch_chars);
- batch.chars_bad = chars_bad.inter(&batch_chars);
+batch.chars_p1 = chars_p1.inter(&batch_chars);
+batch.chars_p2 = chars_p2.inter(&batch_chars);
+batch.chars_bad = chars_bad.inter(&batch_chars);
- Ok(batch)
-}
+Ok(batch)
+ */
fn simplify_batch(batch: &mut Batch) {
let mut char_cnt = HashMap::<char, usize>::new();