diff options
author | Alex Auvolat <alex@adnab.me> | 2023-07-22 09:37:09 +0200 |
---|---|---|
committer | Alex Auvolat <alex@adnab.me> | 2023-07-22 09:37:09 +0200 |
commit | 63583ea63cc2f81b030b31400d861e83e5023d23 (patch) | |
tree | d383d7c96389dd99cea2024df906d60676ae9857 /src | |
parent | a359c9da4ed354f0b0061be88a2376fb34d6348f (diff) | |
download | datagengo-63583ea63cc2f81b030b31400d861e83e5023d23.tar.gz datagengo-63583ea63cc2f81b030b31400d861e83e5023d23.zip |
changes to generation algorithm
- lexicographic ordering of cost (tuple comparison) instead of weighting function
- allow jinmeiyou kanji
- avoid all rare kanji
Diffstat (limited to 'src')
-rw-r--r-- | src/main.rs | 68 |
1 file changed, 54 insertions, 14 deletions
diff --git a/src/main.rs b/src/main.rs index 8a74d82..d0f896d 100644 --- a/src/main.rs +++ b/src/main.rs @@ -31,8 +31,8 @@ fn main() { match opt.cmd { Cmd::ParseKanjidic => { let levels = parse_kanjidic().expect("error"); - for (jlpt, grade, chars) in levels.iter() { - println!("{}.{}: {}", jlpt, grade, chars); + for (level, chars) in levels.iter() { + println!("{}: {}", level, chars.to_string()); } } Cmd::New{ count } => { @@ -103,7 +103,7 @@ fn index_jmdict<'a>(dict: &'a roxmltree::Document) -> DictIndex<'a> { ret } -fn parse_kanjidic() -> Result<Vec<(i32, i32, String)>> { +fn parse_kanjidic() -> Result<Vec<(String, Charset)>> { let file = fs::read_to_string("data/kanjidic2.xml")?; let xml = roxmltree::Document::parse(&file)?; let kanjidic = xml.root().first_child().unwrap(); @@ -124,27 +124,44 @@ fn parse_kanjidic() -> Result<Vec<(i32, i32, String)>> { } if y.has_tag_name("misc") { for z in y.children() { - if z.has_tag_name("grade") { - grade = z.text().and_then(|x| str::parse::<i32>(x).ok()); - } if z.has_tag_name("jlpt") { jlpt = z.text().and_then(|x| str::parse::<i32>(x).ok()); } + if z.has_tag_name("grade") { + grade = z.text().and_then(|x| str::parse::<i32>(x).ok()); + } } } } - if jlpt.is_none() && grade.is_none() { - continue; - } - let level = (jlpt.unwrap_or(0), grade.unwrap_or(0)); if let Some(lit) = literal { - levels.entry(level).or_insert(String::new()).extend(lit.chars()); + levels.entry((jlpt, grade)).or_insert(String::new()).extend(lit.chars()); } } - let mut levels = levels.into_iter().map(|((j, g), c)| (j, g, c)).collect::<Vec<_>>(); - levels.sort_by_key(|(j, g, _)| (-*j, *g)); - Ok(levels) + let mut levels = levels.into_iter().collect::<Vec<_>>(); + levels.sort_by_key(|((j, g), _)| match (j, g) { + (Some(j), Some(g)) => (10-*j)*20+*g, + (Some(j), None) => (10-*j)*20+15, + (None, Some(g)) => 1000+*g, + (None, None) => 1015, + + }); + + let mut ret = Vec::new(); + let mut pc = Charset::default(); + for ((j, g), chars) in levels.into_iter() { + 
let name = match (j, g) { + (Some(j), Some(g)) => format!("N{}-{}", j, g), + (Some(j), None) => format!("N{}+", j), + (None, Some(g)) => format!("N0-{}", g), + (None, None) => format!("N0+"), + }; + let chars = Charset::new(chars).diff(&pc); + pc = pc.union(&chars); + ret.push((name, chars)); + } + + Ok(ret) } fn read_kanji_levels() -> Result<Vec<(String, String)>> { @@ -219,6 +236,10 @@ fn gen_batch(previous: &[Batch], kanji_levels: &[(String, Charset)], examples: & let mut chars_bad = Charset::from_iter(kanji_levels.iter().skip(target_i+1) .map(|(_, c)| c.chars().iter().copied()) .flatten()); + let mut chars_bad_avoid = Charset::from_iter(kanji_levels.iter().skip(target_i+1) + .filter(|(l, _)| !l.ends_with("-9") && !l.ends_with("-10")) + .map(|(_, c)| c.chars().iter().copied()) + .flatten()); let mut batch = Batch { level: target_level.to_string(), @@ -248,6 +269,7 @@ fn gen_batch(previous: &[Batch], kanji_levels: &[(String, Charset)], examples: & chars_late = chars_late.union(&target_chars); target_chars = target_chars.union(&kanji_levels[target_i].1.diff(&prev_chars)); chars_bad = chars_bad.diff(&target_chars); + chars_bad_avoid = chars_bad_avoid.diff(&target_chars); if batch.examples.is_empty() { batch.level = kanji_levels[target_i].0.to_string(); } else { @@ -259,6 +281,7 @@ fn gen_batch(previous: &[Batch], kanji_levels: &[(String, Charset)], examples: & eprintln!("Bad : {} characters", chars_bad.len()); stalled = false; } + /* this one works well enough let cost = |ex: &Example, ex_tgt_inter: usize| { 20i32 * ex_tgt_inter as i32 + 30i32 * ex.chars.inter_len(&chars_late) as i32 @@ -267,6 +290,14 @@ fn gen_batch(previous: &[Batch], kanji_levels: &[(String, Charset)], examples: & + 3i32 * ex.chars.inter_len(&chars_p2) as i32 - 40i32 * ex.chars.inter_len(&chars_bad) as i32 }; + */ + let cost = |ex: &Example, ex_tgt_inter: usize| { ( + - (ex.chars.inter_len(&chars_bad_avoid) as i32), + ex_tgt_inter, + ex.chars.inter_len(&chars_late), + 
2*ex.chars.inter_len(&chars_p1) + ex.chars.inter_len(&chars_p2), + - (ex.ja.chars().count() as i32), + ) }; let cand_1 = examples.par_iter() .map(|ex| (ex, ex.chars.inter_len(&target_chars))) .filter(|(_, ex_tgt_inter)| (1..=4).contains(ex_tgt_inter) && *ex_tgt_inter + batch.chars.len() <= batch_len) @@ -461,6 +492,9 @@ fn format_index(batches: &[Batch], kanji_levels: &[(String, String)]) -> Result< writeln!(f, "<table>")?; writeln!(f, "<tr><th>Level</th><th>Count</th><th>Chars</th><th>Missing chars</th></tr>")?; for (lvl, chars) in kanji_levels.iter() { + if lvl == "N0+" || lvl == "N0-9" || lvl.ends_with("-10") { + continue; + } let chars = Charset::new(chars); let missing = chars.diff(&all_chars); writeln!(f, "<tr><td>{}</td><td>{}</td><td>{}</td><td>{} ({})</td></tr>", @@ -523,6 +557,12 @@ impl Charset { false } fn inter_len(&self, other: &Self) -> usize { + if other.len() > 20*self.len() { + // alternative path + return self.0.iter() + .filter(|x| other.0.binary_search(x).is_ok()) + .count(); + } let mut it1 = self.0.iter().peekable(); let mut it2 = other.0.iter().peekable(); let mut ret = 0; |