aboutsummaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorAlex Auvolat <alex@adnab.me>2023-09-25 11:52:53 +0200
committerAlex Auvolat <alex@adnab.me>2023-09-25 11:52:53 +0200
commit38f5510ad3f372e97e57be738f9c99d887db0037 (patch)
treef067cf119996695082eeb42b8770fd12a0828ffb /src
parentecb42da464e00c9084c23a1ecf96e7be1b97dd25 (diff)
downloaddatagengo-38f5510ad3f372e97e57be738f9c99d887db0037.tar.gz
datagengo-38f5510ad3f372e97e57be738f9c99d887db0037.zip
new algorith seems to be working well
Diffstat (limited to 'src')
-rw-r--r--src/main.rs336
1 files changed, 211 insertions, 125 deletions
diff --git a/src/main.rs b/src/main.rs
index a2c9488..335add8 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -2,8 +2,8 @@ use std::collections::HashMap;
use std::fs;
use std::io::{self, BufRead, Write};
+use anyhow::Result;
use rand::prelude::*;
-use anyhow::{anyhow, Result};
use rayon::prelude::*;
use serde::{Deserialize, Serialize};
use structopt::StructOpt;
@@ -28,10 +28,7 @@ enum Cmd {
truncate: Option<usize>,
},
Simplify,
- Rebalance {
- #[structopt(default_value = "5")]
- start: usize,
- },
+ Cleanup,
Format,
}
@@ -94,15 +91,17 @@ fn main() {
)
.expect("save");
}
- Cmd::Rebalance { start } => {
+ Cmd::Cleanup => {
let mut batches: Vec<Batch> = fs::read("data/batches.json")
.map_err(anyhow::Error::from)
.and_then(|x| Ok(serde_json::from_slice(&x)?))
.unwrap_or_default();
let kanji_levels = read_kanji_levels().expect("read_kanji_levels");
- for (level, _) in kanji_levels.iter() {
- rebalance_level(level, &mut batches[start..]);
- }
+ let kanji_levels = kanji_levels
+ .into_iter()
+ .map(|(l, x)| (l, Charset::new(x)))
+ .collect::<Vec<_>>();
+ cleanup_batches(&mut batches, &kanji_levels);
fs::write(
"data/batches.json",
serde_json::to_string_pretty(&batches)
@@ -293,7 +292,10 @@ fn read_examples(all_kanji: &Charset) -> Result<Vec<Example>> {
// BATCH STRUCTURES AND GENERATION
// =====================================================================
-#[derive(Debug, Clone, Serialize, Deserialize)]
+const CHARS_PER_BATCH: usize = 20;
+const MAX_NEW_CHARS_PER_EX: usize = 5;
+
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
struct Example {
ja: String,
en: String,
@@ -302,7 +304,7 @@ struct Example {
chars: Charset,
}
-#[derive(Debug, Clone, Serialize, Deserialize)]
+#[derive(Debug, Clone, Serialize, Deserialize, Default, PartialEq)]
struct Batch {
level: String,
chars: Charset,
@@ -324,14 +326,19 @@ fn gen_batches(
batches
.iter()
.map(|x| x.chars.chars().iter().copied())
- .flatten()
- .chain(remainder.as_ref().map(|x: &Batch| x.chars.chars())
- .unwrap_or_default().iter().copied()),
+ .flatten(),
);
+ let remainder_chars = remainder
+ .as_ref()
+ .map(|x: &Batch| x.chars.clone())
+ .unwrap_or_default();
+
+ let remainder_before = remainder.clone();
+ let len_before = batches.len();
let mut advanced = false;
- for (i, (level, chars)) in kanji_levels.iter().enumerate() {
- let diff = chars.diff(&done);
+ for (i, (level, level_chars)) in kanji_levels.iter().enumerate() {
+ let diff = level_chars.diff(&done).diff(&remainder_chars);
if !diff.is_empty() {
let avoid = Charset::from_iter(
kanji_levels
@@ -343,15 +350,54 @@ fn gen_batches(
);
let level_examples = level_examples(&diff, &avoid, examples);
+ let level_new_chars = Charset::from_iter(
+ level_examples
+ .iter()
+ .map(|x| x.chars.chars().iter().copied())
+ .flatten(),
+ )
+ .inter(&diff);
+ println!(
+ "- {} ({} chars): {} done previously, {} diff, {} ex, {} new chars",
+ level,
+ level_chars.len(),
+ done.len(),
+ diff.len(),
+ level_examples.len(),
+ level_new_chars.len()
+ );
if !level_examples.is_empty() {
- remainder = gen_level(batches, level, &diff, &done, &avoid, level_examples, remainder);
+ assert!(!level_new_chars.is_empty());
+ remainder = gen_level(
+ batches,
+ level,
+ &level_new_chars,
+ &done,
+ level_examples,
+ remainder,
+ );
advanced = true;
break;
}
}
}
+ if let Some(r) = &remainder {
+ assert!(r.examples.len() <= 20);
+ }
- if !advanced {
+ if advanced && batches.len() == len_before && remainder == remainder_before {
+ // restart level with new rng
+ let last_level = batches.last().unwrap().level.to_string();
+ println!("RESTARTING LEVEL {}, hopefully new RNG", last_level);
+ while batches
+ .last()
+ .map(|x| x.level == last_level)
+ .unwrap_or(false)
+ {
+ batches.pop();
+ remainder = None;
+ }
+ } else if !advanced {
break;
}
}
@@ -365,45 +411,82 @@ fn gen_batches(
fn gen_level(
batches: &mut Vec<Batch>,
level: &str,
- chars: &Charset,
- done: &Charset,
- avoid: &Charset,
+ new_chars: &Charset,
+ prev_done: &Charset,
mut examples: Vec<&Example>,
mut remainder: Option<Batch>,
) -> Option<Batch> {
examples.shuffle(&mut thread_rng());
+ let remainder_chars = remainder.as_ref().map(|x| x.chars.len()).unwrap_or(0);
println!(
- "Level {}: {} characters using {} examples",
+ "Level {}: {} characters using {} examples, remainder has {} chars and {} examples",
level,
- chars.len(),
- examples.len()
+ new_chars.len(),
+ examples.len(),
+ remainder_chars,
+ remainder.as_ref().map(|x| x.examples.len()).unwrap_or(0),
);
- let avg_len = examples.len() as f32 * 20. / chars.len() as f32;
+ let avg_len = examples.len() as f32 * CHARS_PER_BATCH as f32 / new_chars.len() as f32;
+ let mut batch_count = 0;
+ let mut sum_len = 0;
- let mut done = remainder.as_ref()
- .map(|x| x.chars.clone())
- .unwrap_or_default()
- .union(&done);
+ let mut done = prev_done.union(
+ remainder
+ .as_ref()
+ .map(|x| &x.chars)
+ .unwrap_or(&Charset::default()),
+ );
- 'outer: while !examples.is_empty() {
+ loop {
println!("iter with {} examples", examples.len());
let mut batch = remainder.take().unwrap_or_else(|| Batch {
level: level.to_string(),
- chars: Charset::default(),
- chars_p1: Charset::default(),
- chars_p2: Charset::default(),
- chars_bad: Charset::default(),
- examples: vec![],
+ ..Default::default()
});
- let n_chars = 20 - batch.chars.len();
+ let remaining_chars = new_chars.diff(&done);
+ let todo_chars = CHARS_PER_BATCH - batch.chars.len();
+ if remaining_chars.len() <= todo_chars {
+ for ex in examples.iter() {
+ batch.examples.push((*ex).clone());
+ batch.chars = batch.chars.union(&ex.chars.diff(&done).inter(&new_chars));
+ }
+ if batch.chars.len() == CHARS_PER_BATCH {
+ println!(
+ "-> all remaining examples sum up to exaclty {} chars",
+ CHARS_PER_BATCH
+ );
+ batches.push(batch);
+ return None;
+ } else if batch.examples.is_empty() {
+ assert!(batch.chars.is_empty());
+ println!("-> done");
+ return None;
+ } else {
+ assert!(batch.chars.len() < CHARS_PER_BATCH);
+ println!(
+ "-> with all remaining examples, cannot make a full batch, only {} chars",
+ batch.chars.len()
+ );
+ return Some(batch);
+ }
+ }
+ assert!(!examples.is_empty());
+ println!(
+ "Trying to add exactly {} characters, using {} examples containing {} new chars",
+ todo_chars,
+ examples.len(),
+ remaining_chars.len()
+ );
+
+ // Compute dynamic algorithm matrix with a bunch of combinations that add `todo_chars`
let mut dyn_mat: Vec<Vec<Option<(Charset, Option<(usize, usize)>)>>> = vec![];
for ex in examples.iter() {
- let mut dyn_row = vec![None; n_chars + 1];
- let chars_common = ex.chars.inter(&chars).diff(&done);
- if chars_common.len() > 6 {
+ let mut dyn_row = vec![None; todo_chars + 1];
+ let chars_common = ex.chars.inter(&new_chars).diff(&done);
+ if chars_common.len() > MAX_NEW_CHARS_PER_EX {
dyn_mat.push(dyn_row);
continue;
}
@@ -416,7 +499,9 @@ fn gen_level(
assert_eq!(chars_inter.len(), j);
let new_chars_common = chars_inter.union(&chars_common);
let new_chars_common_len = new_chars_common.len();
- if new_chars_common_len > chars_inter.len() && new_chars_common_len <= n_chars {
+ if new_chars_common_len > chars_inter.len()
+ && new_chars_common_len <= todo_chars
+ {
dyn_row[new_chars_common_len] = Some((new_chars_common, Some((i, j))));
}
}
@@ -424,13 +509,21 @@ fn gen_level(
}
dyn_mat.push(dyn_row);
}
+
+ // Find combination that does that with a good number of examples (tgt_len)
+ let factor = match level {
+ "N1b" => 1.08,
+ _ => 1.0,
+ };
+ let tgt_len = (avg_len * factor * (batch_count as f32 + 1.)).ceil() as i64
+ - (sum_len + batch.examples.len()) as i64;
let dyn_mat_cnt = |i| {
- let mut i: usize = i;
let mut cnt = 0;
- let mut j: usize = n_chars;
+ let mut i: usize = i;
+ let mut j: usize = todo_chars;
loop {
match &dyn_mat[i][j] {
- None => return 1000000,
+ None => return None,
Some((_, ij_prev)) => {
cnt += 1;
match ij_prev {
@@ -438,22 +531,40 @@ fn gen_level(
i = *iprev;
j = *jprev;
}
- None => return cnt,
+ None => return Some(cnt),
}
}
}
}
};
- let i = (0..dyn_mat.len()).min_by_key(|i| {
- let x = dyn_mat_cnt(*i) as i64 - (avg_len as i64 + 2);
- x * x
- }).unwrap();
- let (mut i, mut j) = (i, n_chars);
+ let i_opt = (0..dyn_mat.len())
+ .filter_map(|pos| dyn_mat_cnt(pos).map(|cnt| (pos, cnt)))
+ .min_by_key(|(_, cnt)| {
+ let x = *cnt as i64 - tgt_len;
+ x * x
+ });
+ let i = match i_opt {
+ None => {
+ println!(
+ "WARNING: cannot make exactly {} chars, interrupting",
+ todo_chars
+ );
+ return None;
+ }
+ Some((pos, _)) => pos,
+ };
+
+ // Take all examples from that combination and add them to current batch
+ let (mut i, mut j) = (i, todo_chars);
loop {
match &dyn_mat[i][j] {
- None => break 'outer,
+ None => panic!("dyn_mat[{}][{}] == None", i, j),
Some((chars, ij_prev)) => {
- println!("Add {}: {}", examples[i].chars.inter(&chars).to_string(), examples[i].ja);
+ println!(
+ "Add {}: {}",
+ examples[i].chars.inter(&chars).to_string(),
+ examples[i].ja
+ );
batch.examples.push(examples[i].clone());
examples.remove(i);
batch.chars = batch.chars.union(&chars);
@@ -468,56 +579,19 @@ fn gen_level(
}
}
}
- assert_eq!(batch.chars.len(), 20);
-
- batch.chars_p1 = batches
- .iter()
- .rev()
- .next()
- .map(|b| b.chars.clone())
- .unwrap_or(Charset::default())
- .inter(&batch.chars);
+ assert_eq!(batch.chars.len(), CHARS_PER_BATCH);
- batch.chars_p2 = batches
- .iter()
- .rev()
- .skip(1)
- .next()
- .map(|b| b.chars.clone())
- .unwrap_or(Charset::default())
- .inter(&batch.chars);
-
- batch.chars_bad = batch.chars.inter(&avoid);
+ println!(
+ "-> batch {:03}: {} with {} examples",
+ batches.len(),
+ batch.chars.to_string(),
+ batch.examples.len()
+ );
+ batch_count += 1;
done = done.union(&batch.chars);
-
- println!("-> batch {:03}: {} with {} examples", batches.len(), batch.chars.to_string(), batch.examples.len());
+ sum_len += batch.examples.len();
batches.push(batch);
}
-
- if !examples.is_empty() {
- let n_remaining = chars.diff(&done).len();
- let ex_chars = Charset::from_iter(
- examples.iter()
- .map(|x| x.chars.chars().iter().copied())
- .flatten()).diff(&done);
- println!("still {} examples, {} chars in ex, {} total", examples.len(), ex_chars.len(), n_remaining);
- assert!(ex_chars.len() <= n_remaining);
- if ex_chars.len() > 20 {
- return None;
- }
- let remainder = Batch {
- level: level.to_string(),
- chars: ex_chars,
- chars_p1: Charset::default(),
- chars_p2: Charset::default(),
- chars_bad: Charset::default(),
- examples: examples.iter().map(|x| (**x).clone()).collect(),
- };
- println!("remainer batch: {:?}", remainder);
- Some(remainder)
- } else {
- None
- }
}
fn level_examples<'a>(
@@ -537,7 +611,7 @@ fn level_examples<'a>(
ex_todo_inter,
ex_chars_inter,
-(ex.ja.chars().count() as i32),
- (thread_rng().gen::<usize>())
+ ex.chars.len() + thread_rng().gen_range(0..5),
)
};
let mut all_with_inter = all_examples
@@ -547,11 +621,14 @@ fn level_examples<'a>(
.collect::<Vec<_>>();
while !todo.is_empty() {
- let best = all_with_inter.par_iter().enumerate()
- .filter(|(_, (_, ex_todo_inter, ex_tgt_inter))| (1..=6).contains(ex_todo_inter))
- .max_by_key(
- |(_, (ex, ex_todo_inter, ex_chars_inter))| cost(*ex, *ex_todo_inter, *ex_chars_inter),
- );
+ let best = all_with_inter
+ .par_iter()
+ .enumerate()
+ .filter(|(_, (_, ex_todo_inter, _))| *ex_todo_inter > 0)
+ //.filter(|(_, (_, _, ex_tgt_inter))| (1..=8).contains(ex_tgt_inter))
+ .max_by_key(|(_, (ex, ex_todo_inter, ex_chars_inter))| {
+ cost(*ex, *ex_todo_inter, *ex_chars_inter)
+ });
if let Some((i, (ex, ex_todo_inter, _))) = best {
let ex = *ex;
assert_eq!(*ex_todo_inter, ex.chars.inter(&todo).len());
@@ -759,26 +836,34 @@ fn simplify_batch(batch: &mut Batch) {
}
}
-fn rebalance_level(level: &str, batches: &mut [Batch]) {
- let mut i_batch = vec![];
- let mut n_ex = 0;
- for (i, b) in batches.iter().enumerate() {
- if b.level == level {
- i_batch.push(i);
- n_ex += b.examples.len();
- }
- }
- if i_batch.len() < 2 {
- return;
+fn cleanup_batches(all_batches: &mut [Batch], kanji_levels: &[(String, Charset)]) {
+ let mut chars_p1 = Charset::default();
+ let mut chars_p2 = Charset::default();
+ let mut done = Charset::default();
+
+ for batch in all_batches.iter_mut() {
+ let all_chars = Charset::from_iter(
+ batch
+ .examples
+ .iter()
+ .map(|x| x.chars.chars().iter().copied())
+ .flatten(),
+ );
+
+ batch.level = kanji_levels
+ .iter()
+ .filter(|(_, chars)| chars.inter_len(&batch.chars) > 0)
+ .map(|(lvl, _)| lvl.to_string())
+ .collect::<Vec<_>>()
+ .join("/");
+ done = done.union(&batch.chars);
+ batch.chars_bad = all_chars.diff(&done);
+ batch.chars_p1 = all_chars.inter(&chars_p1);
+ batch.chars_p2 = all_chars.inter(&chars_p2);
+
+ chars_p2 = chars_p1;
+ chars_p1 = batch.chars.clone();
}
- println!(
- "Level {}: {} batches, {} examples, avg {:.2}",
- level,
- i_batch.len(),
- n_ex,
- n_ex as f32 / i_batch.len() as f32
- );
- //todo!()
}
// =====================================================================
@@ -941,17 +1026,18 @@ fn format_index(batches: &[Batch], kanji_levels: &[(String, String)]) -> Result<
)?;
writeln!(f, "<table>")?;
- writeln!(f, "<tr><th>Num</th><th>Level</th><th>Chars</th><th>Examples</th><th>Review</th><th>Ignore</th></tr>")?;
+ writeln!(f, "<tr><th>Num</th><th>Level</th><th>Chars</th><th>Examples</th><th>B-1</th><th>B-2</th><th>Ignore</th></tr>")?;
for (i, batch) in batches.iter().enumerate() {
writeln!(
f,
- r#"<tr><td><a href="{:03}.html">{:03}</a></td><td>{}</td><td>{}</td><td>&nbsp;&nbsp;{}</td><td>{}</td><td>{}</td></tr>"#,
+ r#"<tr><td><a href="{:03}.html">{:03}</a></td><td>{}</td><td>{}</td><td>&nbsp;&nbsp;{}</td><td>{}</td><td>{}</td><td>{}</td></tr>"#,
i,
i,
batch.level,
batch.chars.to_string(),
batch.examples.len(),
- batch.chars_p1.union(&batch.chars_p2).to_string(),
+ batch.chars_p1.to_string(),
+ batch.chars_p2.to_string(),
batch.chars_bad.to_string()
)?;
}