about | summary | refs | log | tree | commit | diff
path: root/src
diff options
context:
space:
mode:
author: Alex Auvolat <alex@adnab.me> 2023-07-21 18:37:09 +0200
committer: Alex Auvolat <alex@adnab.me> 2023-07-21 18:37:09 +0200
commit: 41ab1f6eb77b0c3fe24577f2daec901cedb0ad60 (patch)
tree: 3452fc0290ac78b25957112646d1e5ed244bb3f6 /src
parent: e220b38123fcecbf4448826f3f0ca2098c89181f (diff)
download: datagengo-41ab1f6eb77b0c3fe24577f2daec901cedb0ad60.tar.gz
download: datagengo-41ab1f6eb77b0c3fe24577f2daec901cedb0ad60.zip
Some batches
Diffstat (limited to 'src')
-rw-r--r-- src/main.rs 169
1 files changed, 140 insertions, 29 deletions
diff --git a/src/main.rs b/src/main.rs
index d1efece..6c67680 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -1,9 +1,11 @@
use std::collections::HashMap;
use std::fs;
use std::cmp::Ordering;
-use std::io::{self, BufRead};
+use std::io::{self, BufRead, Write};
use anyhow::{anyhow, Result};
+use rayon::prelude::*;
+use serde::{Serialize, Deserialize};
use structopt::StructOpt;
#[derive(Debug, StructOpt)]
@@ -17,6 +19,7 @@ struct Opt {
enum Cmd {
ParseKanjidic,
New,
+ Format,
}
fn main() {
@@ -28,7 +31,7 @@ fn main() {
for (jlpt, grade, chars) in levels.iter() {
println!("{}.{}: {}", jlpt, grade, chars);
}
- },
+ }
Cmd::New => {
let kanji_levels = read_kanji_levels().expect("read_kanji_levels");
let all_kanji = Charset::new(kanji_levels.iter()
@@ -38,12 +41,27 @@ fn main() {
let kanji_levels = kanji_levels.into_iter()
.map(|(l, x)| (l, Charset::new(x)))
.collect::<Vec<_>>();
- let ex = read_examples(&all_kanji).expect("read_examples");
- println!("{:#?}", ex.iter().take(10).collect::<Vec<_>>());
- let batch1 = gen_batch(&[], &kanji_levels, &ex).expect("gen_batch");
- println!("{:#?}", batch1);
- let batch2 = gen_batch(&[batch1], &kanji_levels, &ex).expect("gen_batch");
- println!("{:#?}", batch2);
+ let mut ex = read_examples(&all_kanji).expect("read_examples");
+ ex.retain(|e| (5..=25).contains(&e.ja.chars().count()));
+ let mut batches: Vec<Batch> = fs::read("data/batches.json")
+ .map_err(anyhow::Error::from)
+ .and_then(|x| Ok(serde_json::from_slice(&x)?))
+ .unwrap_or_default();
+ println!("---- starting after {} batches ----", batches.len());
+ for _ in 0..10 {
+ let batch = gen_batch(&batches, &kanji_levels, &ex).expect("gen_batch");
+ batches.push(batch);
+ }
+ fs::write("data/batches.json", serde_json::to_string_pretty(&batches).expect("serialize").as_bytes()).expect("save");
+ }
+ Cmd::Format => {
+ let batches = fs::read("data/batches.json")
+ .map_err(anyhow::Error::from)
+ .and_then(|x| Ok(serde_json::from_slice::<Vec<Batch>>(&x)?))
+ .expect("read/parse");
+ batches.par_iter()
+ .enumerate()
+ .for_each(|x| format_batch(batches.len(), x));
}
}
}
@@ -153,52 +171,142 @@ fn gen_batch(previous: &[Batch], kanji_levels: &[(String, Charset)], examples: &
.find(|(_, _, c)| !c.is_empty())
.ok_or(anyhow!("no more batches to make!"))?;
- let chars_4 = kanji_levels[..target_i].iter().rev().next()
- .map(|(_, c)| c.clone()).unwrap_or(Charset::new(""));
+ let chars_p1 = previous.iter().rev().next()
+ .map(|b| b.chars.clone()).unwrap_or(Charset::default());
- let chars_2 = kanji_levels[..target_i].iter().rev().skip(1).next()
- .map(|(_, c)| c.clone()).unwrap_or(Charset::new(""));
+ let chars_p2 = previous.iter().rev().skip(1).next()
+ .map(|b| b.chars.clone()).unwrap_or(Charset::default());
- let chars_bad = Charset::from_iter(kanji_levels.iter().skip(target_i+1)
+ let mut chars_missing = Charset::default();
+
+ let mut chars_bad = Charset::from_iter(kanji_levels.iter().skip(target_i+1)
.map(|(_, c)| c.chars().iter().copied())
.flatten());
let mut batch = Batch {
level: target_level.to_string(),
- chars: Charset::new(""),
+ chars: Charset::default(),
+ chars_p1: Charset::default(),
+ chars_p2: Charset::default(),
+ chars_bad: Charset::default(),
examples: Vec::new(),
};
+ let mut batch_chars = Charset::default();
- eprintln!("Target (val=10) : {}", target_chars.to_string());
- eprintln!("Prev1 (val=4) : {}", chars_4.to_string());
- eprintln!("Prev2 (val=2) : {}", chars_2.to_string());
- eprintln!("Bad (val=-10): {}", chars_bad.to_string());
+ eprintln!("----");
+ eprintln!("Level : {}", batch.level);
+ eprintln!("Target : {}", target_chars.to_string());
+ eprintln!("Prev1 : {}", chars_p1.to_string());
+ eprintln!("Prev2 : {}", chars_p2.to_string());
+ eprintln!("Bad : {} characters", chars_bad.len());
let batch_len = 20;
while batch.chars.len() < batch_len && !target_chars.is_empty() {
- if let Some((ex, _)) = examples.iter()
+ let need = batch_len - batch.chars.len();
+ if need >= 2 && target_chars.len() <= 1 && target_i + 1 < kanji_levels.len() {
+ // upgrade to next level
+ target_i += 1;
+ chars_missing = chars_missing.union(&target_chars);
+ target_chars = target_chars.union(&kanji_levels[target_i].1);
+ chars_bad = chars_bad.diff(&target_chars);
+ if batch.examples.is_empty() {
+ batch.level = kanji_levels[target_i].0.to_string();
+ } else {
+ batch.level = format!("{} + {}", batch.level, kanji_levels[target_i].0);
+ }
+ eprintln!("Level : {}", batch.level);
+ eprintln!("Target: {}", target_chars.to_string());
+ eprintln!("Missing: {}", chars_missing.to_string());
+ eprintln!("Bad : {} characters", chars_bad.len());
+ }
+ if let Some((ex, _)) = examples.par_iter()
.map(|ex| (ex, ex.chars.inter_len(&target_chars)))
- .filter(|(_, ex_tgt_inter)| *ex_tgt_inter <= 4)
- .filter(|(_, ex_tgt_inter)| *ex_tgt_inter + batch.chars.len() <= batch_len + 1)
+ .filter(|(_, ex_tgt_inter)| (1..=4).contains(ex_tgt_inter) && *ex_tgt_inter + batch.chars.len() <= batch_len)
.max_by_key(|(ex, ex_tgt_inter)|
- 10i32 * *ex_tgt_inter as i32
- + 4i32 * ex.chars.inter_len(&chars_4) as i32
- + 2i32 * ex.chars.inter_len(&chars_2) as i32
+ 20i32 * *ex_tgt_inter as i32
+ + 30i32 * ex.chars.inter_len(&chars_missing) as i32
+ + 6i32 * ex.chars.inter_len(&batch.chars) as i32
+ + 4i32 * ex.chars.inter_len(&chars_p1) as i32
+ + 3i32 * ex.chars.inter_len(&chars_p2) as i32
- 40i32 * ex.chars.inter_len(&chars_bad) as i32) {
- println!("add: {:?} (bad: {})", ex, ex.chars.inter(&chars_bad).to_string());
+ eprintln!("* add {} (rep: {}, p1: {}, p2: {}, bad: {}) {}",
+ ex.chars.inter(&target_chars).to_string(),
+ ex.chars.inter(&batch.chars).to_string(),
+ ex.chars.inter(&chars_p1).to_string(),
+ ex.chars.inter(&chars_p2).to_string(),
+ ex.chars.inter(&chars_bad).to_string(),
+ ex.ja);
batch.chars = batch.chars.union(&ex.chars.inter(&target_chars));
target_chars = target_chars.diff(&ex.chars);
+ chars_missing = chars_missing.diff(&ex.chars);
batch.examples.push(ex.clone());
+ batch_chars = batch_chars.union(&ex.chars);
} else {
- eprintln!("could not find sentence that doesn't add myriads too many characters, stopping batch now");
+ eprintln!("could not find suitable sentence, stopping batch now (missing {})", need);
break;
}
}
+ batch.chars_p1 = chars_p1.inter(&batch_chars);
+ batch.chars_p2 = chars_p2.inter(&batch_chars);
+ batch.chars_bad = chars_bad.inter(&batch_chars);
+
Ok(batch)
}
-#[derive(Debug, Clone)]
+fn format_batch(count: usize, (i, batch): (usize, &Batch)) {
+ format_batch_aux(count, i, batch).expect("format batch");
+}
+
+fn format_batch_aux(count: usize, i: usize, batch: &Batch) -> Result<()> {
+ let mut f = io::BufWriter::new(fs::File::create(format!("html/{:03}.html", i))?);
+ write!(f, r#"<!DOCTYPE html>
+ <html>
+ <head>
+ <meta charset=\"UTF-8\" />
+ <title>Batch #{:03}</title>
+ <link rel="stylesheet" type="text/css" href="style.css" />
+ </head>
+ <body>"#, i)?;
+
+ writeln!(f, "<p>")?;
+ for j in 0..count {
+ if j != i {
+ writeln!(f, r#" <a href="{:03}.html">{:03}</a>"#, j, j)?;
+ } else {
+ writeln!(f, " {:03}", j)?;
+ }
+ }
+ writeln!(f, r#"</p>"#)?;
+ writeln!(f, "<p>Level: {}</p>", batch.level)?;
+ writeln!(f, "<p>Characters: {}</p>", batch.chars.to_string())?;
+
+ for ex in batch.examples.iter() {
+ writeln!(f, "<hr />")?;
+ write!(f, r#"<p class="ja">"#)?;
+ for c in ex.ja.chars() {
+ if batch.chars.contains(c) {
+ write!(f, r#"<span class="char_cur">{}</span>"#, c)?;
+ } else if batch.chars_p1.contains(c) {
+ write!(f, r#"<span class="char_p1">{}</span>"#, c)?;
+ } else if batch.chars_p2.contains(c) {
+ write!(f, r#"<span class="char_p2">{}</span>"#, c)?;
+ } else if batch.chars_bad.contains(c) {
+ write!(f, r#"<span class="char_bad">{}</span>"#, c)?;
+ } else {
+ write!(f, "{}", c)?;
+ }
+ }
+ writeln!(f, "</p>")?;
+ writeln!(f, r#"<p style="text-align: center; font-size: 1.2em">{}</p>"#, ex.en)?;
+ }
+
+ write!(f, "</body></html>")?;
+ f.flush()?;
+ Ok(())
+}
+
+#[derive(Debug, Clone, Serialize, Deserialize)]
struct Example {
ja: String,
en: String,
@@ -207,14 +315,17 @@ struct Example {
chars: Charset,
}
-#[derive(Debug, Clone)]
+#[derive(Debug, Clone, Serialize, Deserialize)]
struct Batch {
level: String,
chars: Charset,
+ chars_p1: Charset,
+ chars_p2: Charset,
+ chars_bad: Charset,
examples: Vec<Example>,
}
-#[derive(Debug, Eq, PartialEq, Hash, Clone)]
+#[derive(Debug, Eq, PartialEq, Hash, Clone, Serialize, Deserialize, Default)]
struct Charset(Vec<char>);
impl Charset {