diff options
author | Alex Auvolat <alex@adnab.me> | 2023-07-21 18:37:09 +0200 |
---|---|---|
committer | Alex Auvolat <alex@adnab.me> | 2023-07-21 18:37:09 +0200 |
commit | 41ab1f6eb77b0c3fe24577f2daec901cedb0ad60 (patch) | |
tree | 3452fc0290ac78b25957112646d1e5ed244bb3f6 /src | |
parent | e220b38123fcecbf4448826f3f0ca2098c89181f (diff) | |
download | datagengo-41ab1f6eb77b0c3fe24577f2daec901cedb0ad60.tar.gz datagengo-41ab1f6eb77b0c3fe24577f2daec901cedb0ad60.zip |
Some batches
Diffstat (limited to 'src')
-rw-r--r-- | src/main.rs | 169 |
1 files changed, 140 insertions, 29 deletions
diff --git a/src/main.rs b/src/main.rs index d1efece..6c67680 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,9 +1,11 @@ use std::collections::HashMap; use std::fs; use std::cmp::Ordering; -use std::io::{self, BufRead}; +use std::io::{self, BufRead, Write}; use anyhow::{anyhow, Result}; +use rayon::prelude::*; +use serde::{Serialize, Deserialize}; use structopt::StructOpt; #[derive(Debug, StructOpt)] @@ -17,6 +19,7 @@ struct Opt { enum Cmd { ParseKanjidic, New, + Format, } fn main() { @@ -28,7 +31,7 @@ fn main() { for (jlpt, grade, chars) in levels.iter() { println!("{}.{}: {}", jlpt, grade, chars); } - }, + } Cmd::New => { let kanji_levels = read_kanji_levels().expect("read_kanji_levels"); let all_kanji = Charset::new(kanji_levels.iter() @@ -38,12 +41,27 @@ fn main() { let kanji_levels = kanji_levels.into_iter() .map(|(l, x)| (l, Charset::new(x))) .collect::<Vec<_>>(); - let ex = read_examples(&all_kanji).expect("read_examples"); - println!("{:#?}", ex.iter().take(10).collect::<Vec<_>>()); - let batch1 = gen_batch(&[], &kanji_levels, &ex).expect("gen_batch"); - println!("{:#?}", batch1); - let batch2 = gen_batch(&[batch1], &kanji_levels, &ex).expect("gen_batch"); - println!("{:#?}", batch2); + let mut ex = read_examples(&all_kanji).expect("read_examples"); + ex.retain(|e| (5..=25).contains(&e.ja.chars().count())); + let mut batches: Vec<Batch> = fs::read("data/batches.json") + .map_err(anyhow::Error::from) + .and_then(|x| Ok(serde_json::from_slice(&x)?)) + .unwrap_or_default(); + println!("---- starting after {} batches ----", batches.len()); + for _ in 0..10 { + let batch = gen_batch(&batches, &kanji_levels, &ex).expect("gen_batch"); + batches.push(batch); + } + fs::write("data/batches.json", serde_json::to_string_pretty(&batches).expect("serialize").as_bytes()).expect("save"); + } + Cmd::Format => { + let batches = fs::read("data/batches.json") + .map_err(anyhow::Error::from) + .and_then(|x| Ok(serde_json::from_slice::<Vec<Batch>>(&x)?)) + .expect("read/parse"); + batches.par_iter() + .enumerate() + .for_each(|x| format_batch(batches.len(), x)); } } } @@ -153,52 +171,142 @@ fn gen_batch(previous: &[Batch], kanji_levels: &[(String, Charset)], examples: & .find(|(_, _, c)| !c.is_empty()) .ok_or(anyhow!("no more batches to make!"))?; - let chars_4 = kanji_levels[..target_i].iter().rev().next() - .map(|(_, c)| c.clone()).unwrap_or(Charset::new("")); + let chars_p1 = previous.iter().rev().next() + .map(|b| b.chars.clone()).unwrap_or(Charset::default()); - let chars_2 = kanji_levels[..target_i].iter().rev().skip(1).next() - .map(|(_, c)| c.clone()).unwrap_or(Charset::new("")); + let chars_p2 = previous.iter().rev().skip(1).next() + .map(|b| b.chars.clone()).unwrap_or(Charset::default()); - let chars_bad = Charset::from_iter(kanji_levels.iter().skip(target_i+1) + let mut chars_missing = Charset::default(); + + let mut chars_bad = Charset::from_iter(kanji_levels.iter().skip(target_i+1) .map(|(_, c)| c.chars().iter().copied()) .flatten()); let mut batch = Batch { level: target_level.to_string(), - chars: Charset::new(""), + chars: Charset::default(), + chars_p1: Charset::default(), + chars_p2: Charset::default(), + chars_bad: Charset::default(), examples: Vec::new(), }; + let mut batch_chars = Charset::default(); - eprintln!("Target (val=10) : {}", target_chars.to_string()); - eprintln!("Prev1 (val=4) : {}", chars_4.to_string()); - eprintln!("Prev2 (val=2) : {}", chars_2.to_string()); - eprintln!("Bad (val=-10): {}", chars_bad.to_string()); + eprintln!("----"); + eprintln!("Level : {}", batch.level); + eprintln!("Target : {}", target_chars.to_string()); + eprintln!("Prev1 : {}", chars_p1.to_string()); + eprintln!("Prev2 : {}", chars_p2.to_string()); + eprintln!("Bad : {} characters", chars_bad.len()); let batch_len = 20; while batch.chars.len() < batch_len && !target_chars.is_empty() { - if let Some((ex, _)) = examples.iter() + let need = batch_len - batch.chars.len(); + if need >= 2 && target_chars.len() <= 1 && target_i + 1 < kanji_levels.len() { + // upgrade to next level + target_i += 1; + chars_missing = chars_missing.union(&target_chars); + target_chars = target_chars.union(&kanji_levels[target_i].1); + chars_bad = chars_bad.diff(&target_chars); + if batch.examples.is_empty() { + batch.level = kanji_levels[target_i].0.to_string(); + } else { + batch.level = format!("{} + {}", batch.level, kanji_levels[target_i].0); + } + eprintln!("Level : {}", batch.level); + eprintln!("Target: {}", target_chars.to_string()); + eprintln!("Missing: {}", chars_missing.to_string()); + eprintln!("Bad : {} characters", chars_bad.len()); + } + if let Some((ex, _)) = examples.par_iter() .map(|ex| (ex, ex.chars.inter_len(&target_chars))) - .filter(|(_, ex_tgt_inter)| *ex_tgt_inter <= 4) - .filter(|(_, ex_tgt_inter)| *ex_tgt_inter + batch.chars.len() <= batch_len + 1) + .filter(|(_, ex_tgt_inter)| (1..=4).contains(ex_tgt_inter) && *ex_tgt_inter + batch.chars.len() <= batch_len) .max_by_key(|(ex, ex_tgt_inter)| - 10i32 * *ex_tgt_inter as i32 - + 4i32 * ex.chars.inter_len(&chars_4) as i32 - + 2i32 * ex.chars.inter_len(&chars_2) as i32 + 20i32 * *ex_tgt_inter as i32 + + 30i32 * ex.chars.inter_len(&chars_missing) as i32 + + 6i32 * ex.chars.inter_len(&batch.chars) as i32 + + 4i32 * ex.chars.inter_len(&chars_p1) as i32 + + 3i32 * ex.chars.inter_len(&chars_p2) as i32 - 40i32 * ex.chars.inter_len(&chars_bad) as i32) { - println!("add: {:?} (bad: {})", ex, ex.chars.inter(&chars_bad).to_string()); + eprintln!("* add {} (rep: {}, p1: {}, p2: {}, bad: {}) {}", + ex.chars.inter(&target_chars).to_string(), + ex.chars.inter(&batch.chars).to_string(), + ex.chars.inter(&chars_p1).to_string(), + ex.chars.inter(&chars_p2).to_string(), + ex.chars.inter(&chars_bad).to_string(), + ex.ja); batch.chars = batch.chars.union(&ex.chars.inter(&target_chars)); target_chars = target_chars.diff(&ex.chars); + chars_missing = chars_missing.diff(&ex.chars); batch.examples.push(ex.clone()); + batch_chars = batch_chars.union(&ex.chars); } else { - eprintln!("could not find sentence that doesn't add myriads too many characters, stopping batch now"); + eprintln!("could not find suitable sentence, stopping batch now (missing {})", need); break; } } + batch.chars_p1 = chars_p1.inter(&batch_chars); + batch.chars_p2 = chars_p2.inter(&batch_chars); + batch.chars_bad = chars_bad.inter(&batch_chars); + Ok(batch) } -#[derive(Debug, Clone)] +fn format_batch(count: usize, (i, batch): (usize, &Batch)) { + format_batch_aux(count, i, batch).expect("format batch"); +} + +fn format_batch_aux(count: usize, i: usize, batch: &Batch) -> Result<()> { + let mut f = io::BufWriter::new(fs::File::create(format!("html/{:03}.html", i))?); + write!(f, r#"<!DOCTYPE html> + <html> + <head> + <meta charset=\"UTF-8\" /> + <title>Batch #{:03}</title> + <link rel="stylesheet" type="text/css" href="style.css" /> + </head> + <body>"#, i)?; + + writeln!(f, "<p>")?; + for j in 0..count { + if j != i { + writeln!(f, r#" <a href="{:03}.html">{:03}</a>"#, j, j)?; + } else { + writeln!(f, " {:03}", j)?; + } + } + writeln!(f, r#"</p>"#)?; + writeln!(f, "<p>Level: {}</p>", batch.level)?; + writeln!(f, "<p>Characters: {}</p>", batch.chars.to_string())?; + + for ex in batch.examples.iter() { + writeln!(f, "<hr />")?; + write!(f, r#"<p class="ja">"#)?; + for c in ex.ja.chars() { + if batch.chars.contains(c) { + write!(f, r#"<span class="char_cur">{}</span>"#, c)?; + } else if batch.chars_p1.contains(c) { + write!(f, r#"<span class="char_p1">{}</span>"#, c)?; + } else if batch.chars_p2.contains(c) { + write!(f, r#"<span class="char_p2">{}</span>"#, c)?; + } else if batch.chars_bad.contains(c) { + write!(f, r#"<span class="char_bad">{}</span>"#, c)?; + } else { + write!(f, "{}", c)?; + } + } + writeln!(f, "</p>")?; + writeln!(f, r#"<p style="text-align: center; font-size: 1.2em">{}</p>"#, ex.en)?; + } + + write!(f, "</body></html>")?; + f.flush()?; + Ok(()) +} + +#[derive(Debug, Clone, Serialize, Deserialize)] struct Example { ja: String, en: String, @@ -207,14 +315,17 @@ struct Example { chars: Charset, } -#[derive(Debug, Clone)] +#[derive(Debug, Clone, Serialize, Deserialize)] struct Batch { level: String, chars: Charset, + chars_p1: Charset, + chars_p2: Charset, + chars_bad: Charset, examples: Vec<Example>, } -#[derive(Debug, Eq, PartialEq, Hash, Clone)] +#[derive(Debug, Eq, PartialEq, Hash, Clone, Serialize, Deserialize, Default)] struct Charset(Vec<char>); impl Charset { |