aboutsummaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
author Alex Auvolat <alex@adnab.me> 2023-07-22 09:37:09 +0200
committer Alex Auvolat <alex@adnab.me> 2023-07-22 09:37:09 +0200
commit 63583ea63cc2f81b030b31400d861e83e5023d23 (patch)
tree d383d7c96389dd99cea2024df906d60676ae9857 /src
parent a359c9da4ed354f0b0061be88a2376fb34d6348f (diff)
download datagengo-63583ea63cc2f81b030b31400d861e83e5023d23.tar.gz
download datagengo-63583ea63cc2f81b030b31400d861e83e5023d23.zip
changes to generation algorithm

- lexical ordering of cost instead of weighting function
- allow jinmeiyou kanji
- avoid all rare kanji
Diffstat (limited to 'src')
-rw-r--r-- src/main.rs 68
1 files changed, 54 insertions, 14 deletions
diff --git a/src/main.rs b/src/main.rs
index 8a74d82..d0f896d 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -31,8 +31,8 @@ fn main() {
match opt.cmd {
Cmd::ParseKanjidic => {
let levels = parse_kanjidic().expect("error");
- for (jlpt, grade, chars) in levels.iter() {
- println!("{}.{}: {}", jlpt, grade, chars);
+ for (level, chars) in levels.iter() {
+ println!("{}: {}", level, chars.to_string());
}
}
Cmd::New{ count } => {
@@ -103,7 +103,7 @@ fn index_jmdict<'a>(dict: &'a roxmltree::Document) -> DictIndex<'a> {
ret
}
-fn parse_kanjidic() -> Result<Vec<(i32, i32, String)>> {
+fn parse_kanjidic() -> Result<Vec<(String, Charset)>> {
let file = fs::read_to_string("data/kanjidic2.xml")?;
let xml = roxmltree::Document::parse(&file)?;
let kanjidic = xml.root().first_child().unwrap();
@@ -124,27 +124,44 @@ fn parse_kanjidic() -> Result<Vec<(i32, i32, String)>> {
}
if y.has_tag_name("misc") {
for z in y.children() {
- if z.has_tag_name("grade") {
- grade = z.text().and_then(|x| str::parse::<i32>(x).ok());
- }
if z.has_tag_name("jlpt") {
jlpt = z.text().and_then(|x| str::parse::<i32>(x).ok());
}
+ if z.has_tag_name("grade") {
+ grade = z.text().and_then(|x| str::parse::<i32>(x).ok());
+ }
}
}
}
- if jlpt.is_none() && grade.is_none() {
- continue;
- }
- let level = (jlpt.unwrap_or(0), grade.unwrap_or(0));
if let Some(lit) = literal {
- levels.entry(level).or_insert(String::new()).extend(lit.chars());
+ levels.entry((jlpt, grade)).or_insert(String::new()).extend(lit.chars());
}
}
- let mut levels = levels.into_iter().map(|((j, g), c)| (j, g, c)).collect::<Vec<_>>();
- levels.sort_by_key(|(j, g, _)| (-*j, *g));
- Ok(levels)
+ let mut levels = levels.into_iter().collect::<Vec<_>>();
+ levels.sort_by_key(|((j, g), _)| match (j, g) {
+ (Some(j), Some(g)) => (10-*j)*20+*g,
+ (Some(j), None) => (10-*j)*20+15,
+ (None, Some(g)) => 1000+*g,
+ (None, None) => 1015,
+
+ });
+
+ let mut ret = Vec::new();
+ let mut pc = Charset::default();
+ for ((j, g), chars) in levels.into_iter() {
+ let name = match (j, g) {
+ (Some(j), Some(g)) => format!("N{}-{}", j, g),
+ (Some(j), None) => format!("N{}+", j),
+ (None, Some(g)) => format!("N0-{}", g),
+ (None, None) => format!("N0+"),
+ };
+ let chars = Charset::new(chars).diff(&pc);
+ pc = pc.union(&chars);
+ ret.push((name, chars));
+ }
+
+ Ok(ret)
}
fn read_kanji_levels() -> Result<Vec<(String, String)>> {
@@ -219,6 +236,10 @@ fn gen_batch(previous: &[Batch], kanji_levels: &[(String, Charset)], examples: &
let mut chars_bad = Charset::from_iter(kanji_levels.iter().skip(target_i+1)
.map(|(_, c)| c.chars().iter().copied())
.flatten());
+ let mut chars_bad_avoid = Charset::from_iter(kanji_levels.iter().skip(target_i+1)
+ .filter(|(l, _)| !l.ends_with("-9") && !l.ends_with("-10"))
+ .map(|(_, c)| c.chars().iter().copied())
+ .flatten());
let mut batch = Batch {
level: target_level.to_string(),
@@ -248,6 +269,7 @@ fn gen_batch(previous: &[Batch], kanji_levels: &[(String, Charset)], examples: &
chars_late = chars_late.union(&target_chars);
target_chars = target_chars.union(&kanji_levels[target_i].1.diff(&prev_chars));
chars_bad = chars_bad.diff(&target_chars);
+ chars_bad_avoid = chars_bad_avoid.diff(&target_chars);
if batch.examples.is_empty() {
batch.level = kanji_levels[target_i].0.to_string();
} else {
@@ -259,6 +281,7 @@ fn gen_batch(previous: &[Batch], kanji_levels: &[(String, Charset)], examples: &
eprintln!("Bad : {} characters", chars_bad.len());
stalled = false;
}
+ /* this one works well enough
let cost = |ex: &Example, ex_tgt_inter: usize| {
20i32 * ex_tgt_inter as i32
+ 30i32 * ex.chars.inter_len(&chars_late) as i32
@@ -267,6 +290,14 @@ fn gen_batch(previous: &[Batch], kanji_levels: &[(String, Charset)], examples: &
+ 3i32 * ex.chars.inter_len(&chars_p2) as i32
- 40i32 * ex.chars.inter_len(&chars_bad) as i32
};
+ */
+ let cost = |ex: &Example, ex_tgt_inter: usize| { (
+ - (ex.chars.inter_len(&chars_bad_avoid) as i32),
+ ex_tgt_inter,
+ ex.chars.inter_len(&chars_late),
+ 2*ex.chars.inter_len(&chars_p1) + ex.chars.inter_len(&chars_p2),
+ - (ex.ja.chars().count() as i32),
+ ) };
let cand_1 = examples.par_iter()
.map(|ex| (ex, ex.chars.inter_len(&target_chars)))
.filter(|(_, ex_tgt_inter)| (1..=4).contains(ex_tgt_inter) && *ex_tgt_inter + batch.chars.len() <= batch_len)
@@ -461,6 +492,9 @@ fn format_index(batches: &[Batch], kanji_levels: &[(String, String)]) -> Result<
writeln!(f, "<table>")?;
writeln!(f, "<tr><th>Level</th><th>Count</th><th>Chars</th><th>Missing chars</th></tr>")?;
for (lvl, chars) in kanji_levels.iter() {
+ if lvl == "N0+" || lvl == "N0-9" || lvl.ends_with("-10") {
+ continue;
+ }
let chars = Charset::new(chars);
let missing = chars.diff(&all_chars);
writeln!(f, "<tr><td>{}</td><td>{}</td><td>{}</td><td>{} ({})</td></tr>",
@@ -523,6 +557,12 @@ impl Charset {
false
}
fn inter_len(&self, other: &Self) -> usize {
+ if other.len() > 20*self.len() {
+ // alternative path
+ return self.0.iter()
+ .filter(|x| other.0.binary_search(x).is_ok())
+ .count();
+ }
let mut it1 = self.0.iter().peekable();
let mut it2 = other.0.iter().peekable();
let mut ret = 0;