author      Alex Auvolat <alex@adnab.me>    2024-03-10 20:20:34 +0100
committer   Alex Auvolat <alex@adnab.me>    2024-03-10 20:20:34 +0100
commit      d80afc4b77a2d34272e29914280859c836d9efd0 (patch)
tree        e227bbb0be2342a8b2fa24c33c9ee8763b291bf8
parent      a6507a7a77cf8d6f002d58fbf1362d4c0fba1276 (diff)
download    datagengo-d80afc4b77a2d34272e29914280859c836d9efd0.tar.gz
            datagengo-d80afc4b77a2d34272e29914280859c836d9efd0.zip
reservoir sampling one by one (slow, but works)
-rw-r--r--   src/datafiles.rs     3
-rw-r--r--   src/format.rs       12
-rw-r--r--   src/main.rs        177
-rw-r--r--   src/server.rs      150
4 files changed, 150 insertions, 192 deletions
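The selection loop added to gen_examples() in src/server.rs below is a single-item weighted reservoir sample, repeated once for every sentence wanted, which is what the commit message means by "one by one (slow, but works)". The following is a minimal self-contained sketch of that one selection step; pick_one_weighted and the hand-picked weights are hypothetical stand-ins for the 1/f² weights computed in the real code:

use rand::prelude::*;

// Scan the candidates once, keeping a running weight total and replacing the
// current pick with probability weight / total_weight_so_far. Each item then
// ends up selected with overall probability proportional to its weight, even
// though the total weight is not known up front.
fn pick_one_weighted<T>(items: &[(T, f64)], rng: &mut impl Rng) -> Option<usize> {
    let mut selection = None;
    let mut total_weight = 0f64;
    for (i, (_, weight)) in items.iter().enumerate() {
        total_weight += *weight;
        if rng.gen::<f64>() < *weight / total_weight {
            selection = Some(i);
        }
    }
    selection
}

fn main() {
    let mut rng = thread_rng();
    // hand-picked weights: "rare" should come out about 25x more often than
    // "common" (1.0 / 0.04 = 25)
    let items = vec![("common", 0.04), ("medium", 0.25), ("rare", 1.0)];
    if let Some(i) = pick_one_weighted(&items, &mut rng) {
        println!("picked: {}", items[i].0);
    }
}

gen_examples() below runs one such pass per selected sentence over the remaining candidates (skipping, as long as some target characters are still uncovered, candidates that cover none of them), so producing n sentences costs n passes over the candidate list.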
diff --git a/src/datafiles.rs b/src/datafiles.rs
index 3065fbf..7a6a5d5 100644
--- a/src/datafiles.rs
+++ b/src/datafiles.rs
@@ -22,7 +22,8 @@ pub struct Example {
// PARSING DATA FILES
// =====================================================================
-pub type DictIndex<'a> = HashMap<&'a str, Vec<roxmltree::Node<'a, 'a>>>;
+pub type DictEntry<'a> = roxmltree::Node<'a, 'a>;
+pub type DictIndex<'a> = HashMap<&'a str, Vec<DictEntry<'a>>>;
pub fn index_jmdict<'a>(dict: &'a roxmltree::Document) -> DictIndex<'a> {
let dict = dict
.root()
diff --git a/src/format.rs b/src/format.rs
index 3519b13..801611d 100644
--- a/src/format.rs
+++ b/src/format.rs
@@ -154,17 +154,9 @@ pub fn format_batch_to<'a>(
writeln!(
buf,
- r#"<p><strong>Extra examples (reading practice)</strong></p><table class="extratable">"#
+ r#"<p><strong>Extra examples (reading practice)</strong></p>"#
)?;
- for ex in batch.extra_examples.iter() {
- writeln!(
- buf,
- r#"<tr><td><div class="extra_example"><div class="extra_ja font_ja">{}</div><div class="extra_en">{}</div></div></td></tr>"#,
- ex.furigana_markup(),
- ex.en
- )?;
- }
- writeln!(buf, r#"</table>"#)?;
+ // TODO
writeln!(buf, "<hr />")?;
writeln!(buf, "<p>\(≧▽≦)/</p>")?;
diff --git a/src/main.rs b/src/main.rs
index 252740b..1ad5e77 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -37,8 +37,6 @@ enum Cmd {
Simplify,
Cleanup,
AddVocab,
- AddExamples,
- AddFurigana,
Format,
Server,
}
@@ -116,53 +114,6 @@ async fn main() {
save_batches(batches).expect("save_batches");
}
- Cmd::AddExamples => {
- let kanji_levels = read_kanji_levels().expect("read_kanji_levels");
- let all_kanji = Charset::new(
- kanji_levels
- .iter()
- .map(|(_, x)| x.to_string())
- .collect::<Vec<_>>()
- .join(""),
- );
-
- let mut ex = read_examples(&all_kanji).expect("read_examples");
- ex.retain(|e| (5..=25).contains(&e.ja.chars().count()));
-
- let mut batches = read_batches().expect("read_batches");
-
- add_extra_examples(&mut batches, &ex);
-
- save_batches(batches).expect("save_batches");
- }
- Cmd::AddFurigana => {
- let mut batches = read_batches().expect("read_batches");
-
- let jmdict = fs::read_to_string("data/JMdict_e.xml").expect("read_jmdict");
- let jmdict = roxmltree::Document::parse_with_options(
- &jmdict,
- roxmltree::ParsingOptions {
- allow_dtd: true,
- ..Default::default()
- },
- )
- .expect("parse_jmdict");
- let jmdict_idx = index_jmdict(&jmdict);
-
- let overrides = read_furigana_overrides().expect("read_furigana_overrides");
-
- for batch in batches.iter_mut() {
- for ex in batch
- .examples
- .iter_mut()
- .chain(batch.extra_examples.iter_mut())
- {
- ex.gen_furigana(&jmdict_idx, &overrides);
- }
- }
-
- save_batches(batches).expect("save_batches");
- }
Cmd::Format => {
let jmdict = fs::read_to_string("data/JMdict_e.xml").expect("read_jmdict");
let jmdict = roxmltree::Document::parse_with_options(
@@ -229,8 +180,6 @@ pub struct Batch {
pub examples: Vec<Example>,
#[serde(default)]
pub extra_vocab: Vec<JlptVocab>,
- #[serde(default)]
- pub extra_examples: Vec<Example>,
}
fn gen_batches(
@@ -688,129 +637,3 @@ fn add_vocab(all_batches: &mut [Batch], vocab: &[JlptVocab]) {
batch.extra_vocab = vocab;
}
}
-
-fn add_extra_examples(all_batches: &mut [Batch], examples: &[Example]) {
- let mut chars = Charset::default();
- let mut char_seen_count: HashMap<char, usize> = HashMap::new();
-
- for (i, batch) in all_batches.iter_mut().enumerate() {
- println!("---- BATCH #{:03} ----", i);
- chars = chars.union(&batch.chars);
-
- // Count characters in batch in char_seen_count as a lot
- for ex in batch.examples.iter() {
- for c in ex.chars.iter() {
- *char_seen_count.entry(c).or_default() += 5;
- }
- }
-
- // Take only examples that:
- // - contain kanji of this batch
- // - only contain kanji of this or previous batches
- // - are not in the batch's main example sentences
- let candidates = examples
- .iter()
- .filter(|x| x.chars.inter_len(&batch.chars) > 0)
- .filter(|x| x.chars.diff(&chars).len() == 0)
- .filter(|x| batch.examples.iter().all(|y| y.ja != x.ja));
-
- // Take only one candidate sentence for each possible set of represented kanji
- let mut cand_by_chars = HashMap::new();
- for c in candidates {
- cand_by_chars.insert(c.chars.to_string(), c.clone());
- }
- let mut candidates = cand_by_chars
- .into_iter()
- .map(|(_, ex)| ex)
- .collect::<Vec<_>>();
-
- // Sort candidates in a deterministic random order
- candidates.sort_by_key(|ex| fasthash::metro::hash64(ex.ja.as_bytes()));
-
- batch.extra_examples.clear();
-
- let mut batch_char_seen_count: HashMap<char, usize> = HashMap::new();
- let mut in_batch =
- Charset::from_iter(batch.examples.iter().map(|x| x.chars.iter()).flatten());
- let mut in_batch_extra = Charset::default();
-
- while batch.extra_examples.len() < 40 {
- let batch_min_seen = batch
- .chars
- .iter()
- .map(|x| batch_char_seen_count.get(&x).copied().unwrap_or(0))
- .min()
- .unwrap();
- // Target chars: chars of the batch that have the fewest examples
- let c0 =
- Charset::from_iter(batch.chars.iter().filter(|x| {
- batch_char_seen_count.get(x).copied().unwrap_or(0) == batch_min_seen
- }));
- // Target chars: chars that have been seen less than cnt times
- let fc = |cnt| {
- Charset::from_iter(
- chars
- .iter()
- .filter(|x| char_seen_count.get(x).copied().unwrap_or(0) <= cnt),
- )
- };
- let c1 = fc(5);
- let c2 = fc(6);
- let c3 = fc(7);
- let c4 = fc(10);
-
- let best = candidates
- .iter()
- .enumerate()
- .filter(|(_, ex)| {
- batch.extra_examples.len() < 20 || ex.chars.diff(&in_batch_extra).len() > 0
- })
- .map(|(i, ex)| {
- let weight = (
- ex.chars.inter_len(&c0),
- ex.chars.inter_len(&c1),
- ex.chars.inter_len(&c2),
- ex.chars.inter_len(&c3),
- ex.chars.inter_len(&c4),
- ex.chars.diff(&in_batch_extra).len(),
- );
- (i, ex, weight)
- })
- .max_by_key(|(_, _, w)| *w);
- if let Some((i, ex, w)) = best {
- println!("{:?}\t{} - {}", w, ex.ja, ex.en);
-
- batch.extra_examples.push(ex.clone());
- in_batch = in_batch.union(&ex.chars);
- in_batch_extra = in_batch_extra.union(&ex.chars);
-
- for c in ex.chars.iter() {
- *char_seen_count.entry(c).or_default() += 1;
- if batch.chars.contains(c) {
- *batch_char_seen_count.entry(c).or_default() += 1;
- }
- }
-
- candidates.remove(i);
- } else {
- break;
- }
- }
-
- batch
- .extra_examples
- .sort_by_key(|ex| fasthash::metro::hash64(ex.ja.as_bytes()));
-
- for i in 1..20 {
- println!(
- "Seen {:02}: {}",
- i,
- char_seen_count.iter().filter(|(_, v)| **v == i).count()
- );
- }
- println!(
- "Seen more: {}",
- char_seen_count.iter().filter(|(_, v)| **v >= 20).count()
- );
- }
-}
diff --git a/src/server.rs b/src/server.rs
index 83b9151..58c4dee 100644
--- a/src/server.rs
+++ b/src/server.rs
@@ -1,10 +1,11 @@
use std::fs;
+use std::io::Write;
use std::sync::Arc;
use anyhow::anyhow;
+use rand::prelude::*;
use http_types::mime;
-use tide::prelude::*;
use tide::Request;
use crate::datafiles::*;
@@ -14,6 +15,27 @@ use crate::*;
pub async fn server_main() -> tide::Result<()> {
// ---- load data files ----
+ eprintln!("Loading kanji levels...");
+ let kanji_levels = read_kanji_levels().expect("read_kanji_levels");
+ let all_kanji = Charset::new(
+ kanji_levels
+ .iter()
+ .map(|(_, x)| x.to_string())
+ .collect::<Vec<_>>()
+ .join(""),
+ );
+
+ eprintln!("Loading examples...");
+ let mut examples = read_examples(&all_kanji).expect("read_examples");
+ examples.retain(|e| (5..=25).contains(&e.ja.chars().count()));
+ let examples = Box::leak(examples.into_boxed_slice());
+
+ eprintln!("Counting chars in examples...");
+ let example_freq = calc_example_freq(&examples);
+
+ eprintln!("Loading furigana overrides...");
+ let furigana_overrides = read_furigana_overrides().expect("read_furigana_overrides");
+
eprintln!("Loading JMdict_e.xml...");
let jmdict_raw = fs::read_to_string("data/JMdict_e.xml").expect("read_jmdict");
let jmdict_raw: &'static str = String::leak(jmdict_raw);
@@ -36,9 +58,6 @@ pub async fn server_main() -> tide::Result<()> {
let batches = read_batches().expect("read/parse");
let batches = Box::leak(batches.into_boxed_slice());
- eprintln!("Loading kanji levels...");
- let kanji_levels = read_kanji_levels().expect("read_kanji_levels");
-
let mut index_bytes = Vec::new();
format_index_to(&mut index_bytes, &batches, &kanji_levels).unwrap();
let index = String::leak(String::from_utf8(index_bytes).unwrap());
@@ -51,6 +70,9 @@ pub async fn server_main() -> tide::Result<()> {
jmdict_idx,
batches,
index,
+ examples,
+ example_freq,
+ furigana_overrides,
});
let mut app = tide::with_state(state);
@@ -60,6 +82,7 @@ pub async fn server_main() -> tide::Result<()> {
app.at("/index.html").get(home_page);
app.at("/style.css").serve_file("static/style.css")?;
app.at("/about.html").get(about_page);
+ app.at("/ex/:start/:end").get(gen_examples_page);
app.at("/:batch").get(batch_page);
// ---- serve actual http ----
@@ -78,6 +101,9 @@ struct StateStruct {
jmdict_idx: DictIndex<'static>,
batches: &'static [Batch],
index: &'static str,
+ examples: &'static [Example],
+ example_freq: HashMap<char, usize>,
+ furigana_overrides: HashMap<String, String>,
}
async fn home_page(req: Request<State>) -> tide::Result {
@@ -122,3 +148,119 @@ async fn batch_page(req: Request<State>) -> tide::Result {
.content_type(mime::HTML)
.build())
}
+
+async fn gen_examples_page(req: Request<State>) -> tide::Result {
+ let first_level: usize = req.param("start")?.parse()?;
+ let last_level: usize = req.param("end")?.parse()?;
+
+ let allowed_chars = Charset::from_iter(
+ req.state()
+ .batches
+ .get(..=last_level)
+ .unwrap_or_default()
+ .iter()
+ .map(|b| b.chars.iter())
+ .flatten(),
+ );
+ let needed_chars = Charset::from_iter(
+ req.state()
+ .batches
+ .get(first_level..=last_level)
+ .unwrap_or_default()
+ .iter()
+ .map(|b| b.chars.iter())
+ .flatten(),
+ );
+
+ let mut examples = gen_examples(&req.state(), &allowed_chars, &needed_chars, 50);
+ for ex in examples.iter_mut() {
+ ex.gen_furigana(&req.state().jmdict_idx, &req.state().furigana_overrides);
+ }
+
+ let mut buf: Vec<u8> = vec![];
+ for ex in examples.iter() {
+ write!(&mut buf, "<p>{}</p>", ex.furigana_markup())?;
+ }
+
+ Ok(tide::Response::builder(200)
+ .body(buf)
+ .content_type(mime::HTML)
+ .build())
+}
+
+// ---- example calculation ----
+
+fn calc_example_freq(examples: &[Example]) -> HashMap<char, usize> {
+ let mut ret = HashMap::new();
+ for ex in examples.iter() {
+ for c in ex.chars.iter() {
+ *ret.entry(c).or_default() += 1;
+ }
+ }
+ ret
+}
+
+fn gen_examples(
+ data: &StateStruct,
+ allowed_chars: &Charset,
+ needed_chars: &Charset,
+ count: usize,
+) -> Vec<Example> {
+ let mut rng = thread_rng();
+ let mut ret = vec![];
+
+ let mut candidates = data
+ .examples
+ .iter()
+ .filter(|x| x.chars.diff(&allowed_chars).is_empty() && x.chars.intersects(&needed_chars))
+ .map(|ex| (ex, *ex.chars.iter().filter_map(|x| data.example_freq.get(&x)).min().unwrap()))
+ .collect::<Vec<_>>();
+ let mut remaining_needed = needed_chars.clone();
+
+ let mut have_chars = Charset::new("");
+ println!("Ex\tMinCnt\tChars\tNeeded\tAllowed\tCandidates\tChars");
+ while ret.len() < count {
+ let mut selection = None;
+ let mut total_weight = 0f64;
+
+ let mut counted = 0;
+ for (i, (x, f)) in candidates.iter().enumerate() {
+ if remaining_needed.len() > 0 && !x.chars.intersects(&remaining_needed) {
+ continue;
+ }
+
+ counted += 1;
+ // compensate twice for rare characters
+ // - once to bring all chars to equal probability of sampling
+ // - once to over-sample rare chars because we need to see them more
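+ // e.g. f = 2 (the sentence's rarest char appears in only two example
+ // sentences) gives weight 1/4, while f = 10 gives weight 1/100, so the
+ // sentence built around the rarer character is about 25x more likely
+ // to be picked in any given selection round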
+ let weight = 1f64 / (*f * *f) as f64;
+ total_weight += weight;
+ let rand: f64 = rng.gen();
+ if rand < weight / total_weight {
+ selection = Some((i, *f))
+ }
+ }
+
+ if let Some((i, f)) = selection {
+ let (ex, _) = candidates.remove(i);
+ remaining_needed = remaining_needed.diff(&ex.chars);
+ have_chars = have_chars.union(&ex.chars);
+ ret.push(ex.clone());
+
+ println!(
+ "{}\t{}\t{}\t{}\t{}\t{}\t{}",
+ ret.len(),
+ f,
+ have_chars.len(),
+ remaining_needed.len(),
+ allowed_chars.len(),
+ counted,
+ ex.chars.to_string()
+ );
+ } else {
+ break;
+ }
+ }
+
+ ret
+}