aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAlex Auvolat <alex@adnab.me>2023-09-24 17:26:00 +0200
committerAlex Auvolat <alex@adnab.me>2023-09-24 17:26:00 +0200
commita9de8d71a0fecbd483cbdc084ba109cb96250aaa (patch)
treecfd7c1551e56bf210fd119d54583e6e83f19bb0e
parent22bd3b7e415c5a33e8101bc38acaaab22de5b316 (diff)
downloaddatagengo-a9de8d71a0fecbd483cbdc084ba109cb96250aaa.tar.gz
datagengo-a9de8d71a0fecbd483cbdc084ba109cb96250aaa.zip
refactor; add simplify to remove ONE redundant sentencebatches-v1
-rw-r--r--data/batches.json10
-rw-r--r--src/charset.rs190
-rw-r--r--src/main.rs532
3 files changed, 439 insertions, 293 deletions
diff --git a/data/batches.json b/data/batches.json
index b8a3f59..cbcb15e 100644
--- a/data/batches.json
+++ b/data/batches.json
@@ -664,16 +664,6 @@
]
},
{
- "ja": "オーストラリアは南アメリカより小さい。",
- "en": "Australia is smaller than South America.",
- "expl": "濠太剌利{オーストラリア} は 南 亜米利加{アメリカ} より 小さい",
- "id": "ID=65144_227791",
- "chars": [
- "南",
- "小"
- ]
- },
- {
"ja": "月は東から上る。",
"en": "The moon rises in the east.",
"expl": "月(つき)[01] は 東(ひがし) から 登る[02]{上る}",
diff --git a/src/charset.rs b/src/charset.rs
new file mode 100644
index 0000000..a79eddb
--- /dev/null
+++ b/src/charset.rs
@@ -0,0 +1,190 @@
+use std::cmp::Ordering;
+
+use serde::{Deserialize, Serialize};
+
+#[derive(Debug, Eq, PartialEq, Hash, Clone, Serialize, Deserialize, Default)]
+pub struct Charset(pub Vec<char>);
+
+impl Charset {
+ pub fn new<S: AsRef<str>>(s: S) -> Self {
+ let mut chars = s.as_ref().chars().collect::<Vec<_>>();
+ chars.sort();
+ chars.dedup();
+ Self(chars)
+ }
+ pub fn from_iter<S: IntoIterator<Item = char>>(s: S) -> Self {
+ let mut chars = s.into_iter().collect::<Vec<_>>();
+ chars.sort();
+ chars.dedup();
+ Self(chars)
+ }
+ pub fn intersects(&self, other: &Self) -> bool {
+ let mut it1 = self.0.iter().peekable();
+ let mut it2 = other.0.iter().peekable();
+ while let (Some(c1), Some(c2)) = (it1.peek(), it2.peek()) {
+ match c1.cmp(c2) {
+ Ordering::Equal => return true,
+ Ordering::Less => it1.next(),
+ Ordering::Greater => it2.next(),
+ };
+ }
+ false
+ }
+ pub fn inter_len(&self, other: &Self) -> usize {
+ if other.len() > 20 * self.len() {
+ // alternative path
+ return self
+ .0
+ .iter()
+ .filter(|x| other.0.binary_search(x).is_ok())
+ .count();
+ }
+ let mut it1 = self.0.iter().peekable();
+ let mut it2 = other.0.iter().peekable();
+ let mut ret = 0;
+ while let (Some(c1), Some(c2)) = (it1.peek(), it2.peek()) {
+ match c1.cmp(c2) {
+ Ordering::Equal => {
+ ret += 1;
+ it1.next();
+ it2.next();
+ }
+ Ordering::Less => {
+ it1.next();
+ }
+ Ordering::Greater => {
+ it2.next();
+ }
+ };
+ }
+ ret
+ }
+ pub fn inter(&self, other: &Self) -> Charset {
+ let mut it1 = self.0.iter().peekable();
+ let mut it2 = other.0.iter().peekable();
+ let mut ret = Vec::new();
+ while let (Some(c1), Some(c2)) = (it1.peek(), it2.peek()) {
+ match c1.cmp(c2) {
+ Ordering::Equal => {
+ ret.push(**c1);
+ it1.next();
+ it2.next();
+ }
+ Ordering::Less => {
+ it1.next();
+ }
+ Ordering::Greater => {
+ it2.next();
+ }
+ };
+ }
+ Self(ret)
+ }
+ pub fn union(&self, other: &Self) -> Charset {
+ let mut it1 = self.0.iter().peekable();
+ let mut it2 = other.0.iter().peekable();
+ let mut ret = Vec::new();
+ while let (Some(c1), Some(c2)) = (it1.peek(), it2.peek()) {
+ match c1.cmp(c2) {
+ Ordering::Equal => {
+ ret.push(**c1);
+ it1.next();
+ it2.next();
+ }
+ Ordering::Less => {
+ ret.push(**c1);
+ it1.next();
+ }
+ Ordering::Greater => {
+ ret.push(**c2);
+ it2.next();
+ }
+ };
+ }
+ while let Some(c) = it1.peek() {
+ ret.push(**c);
+ it1.next();
+ }
+ while let Some(c) = it2.peek() {
+ ret.push(**c);
+ it2.next();
+ }
+ Self(ret)
+ }
+ pub fn diff(&self, other: &Self) -> Charset {
+ let mut it1 = self.0.iter().peekable();
+ let mut it2 = other.0.iter().peekable();
+ let mut ret = Vec::new();
+ while let (Some(c1), Some(c2)) = (it1.peek(), it2.peek()) {
+ match c1.cmp(c2) {
+ Ordering::Equal => {
+ it1.next();
+ it2.next();
+ }
+ Ordering::Less => {
+ ret.push(**c1);
+ it1.next();
+ }
+ Ordering::Greater => {
+ it2.next();
+ }
+ };
+ }
+ while let Some(c) = it1.peek() {
+ ret.push(**c);
+ it1.next();
+ }
+ Self(ret)
+ }
+ pub fn len(&self) -> usize {
+ self.0.len()
+ }
+ pub fn is_empty(&self) -> bool {
+ self.0.is_empty()
+ }
+ pub fn chars(&self) -> &[char] {
+ &self.0
+ }
+ pub fn contains(&self, c: char) -> bool {
+ self.0.binary_search(&c).is_ok()
+ }
+ pub fn to_string(&self) -> String {
+ self.0.iter().collect::<String>()
+ }
+}
+
+#[cfg(test)]
+mod test {
+ use super::*;
+
+ #[test]
+ fn test_charset() {
+ let c1 = Charset::new("azerty");
+ let c2 = Charset::new("uiopqsqdf");
+ let c3 = Charset::new("hello, world");
+
+ assert!(!c1.intersects(&c2));
+ assert!(c1.intersects(&c3));
+ assert!(c2.intersects(&c3));
+
+ assert_eq!(c1.inter_len(&c2), 0);
+ assert_eq!(c1.inter_len(&c3), 2);
+ assert_eq!(c2.inter_len(&c3), 2);
+
+ assert_eq!(c1.inter(&c2), Charset::new(""));
+ assert_eq!(c1.inter(&c3), Charset::new("er"));
+ assert_eq!(c2.inter(&c3), Charset::new("od"));
+
+ assert_eq!(c1.union(&c2), Charset::new("azertyuiopqsdf"));
+ assert_eq!(c1.union(&c3), Charset::new("azertyhello, world"));
+ assert_eq!(c2.union(&c3), Charset::new("uiopqsdfhello, world"));
+
+ assert_eq!(c1.diff(&c2), Charset::new("azerty"));
+ assert_eq!(c1.diff(&c3), Charset::new("azty"));
+ assert_eq!(c2.diff(&c3), Charset::new("uipqsf"));
+
+ assert_eq!(c2.diff(&c1), Charset::new("uiopqsdf"));
+ assert_eq!(c3.diff(&c1), Charset::new("hllo, wold"));
+ assert_eq!(c3.diff(&c2), Charset::new("hell, wrl"));
+ }
+}
diff --git a/src/main.rs b/src/main.rs
index 837b7ff..a283891 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -1,13 +1,15 @@
use std::collections::HashMap;
use std::fs;
-use std::cmp::Ordering;
use std::io::{self, BufRead, Write};
use anyhow::{anyhow, Result};
use rayon::prelude::*;
-use serde::{Serialize, Deserialize};
+use serde::{Deserialize, Serialize};
use structopt::StructOpt;
+mod charset;
+use charset::Charset;
+
#[derive(Debug, StructOpt)]
#[structopt(name = "datagengo", about = "Japanese example practice maker")]
struct Opt {
@@ -17,11 +19,12 @@ struct Opt {
#[derive(Debug, StructOpt)]
enum Cmd {
- ParseKanjidic,
+ ParseKanjidic,
New {
#[structopt(default_value = "10")]
count: usize,
},
+ Simplify,
Format,
}
@@ -35,13 +38,17 @@ fn main() {
println!("{}: {}", level, chars.to_string());
}
}
- Cmd::New{ count } => {
+ Cmd::New { count } => {
let kanji_levels = read_kanji_levels().expect("read_kanji_levels");
- let all_kanji = Charset::new(kanji_levels.iter()
- .map(|(_, x)| x.to_string())
- .collect::<Vec<_>>()
- .join(""));
- let kanji_levels = kanji_levels.into_iter()
+ let all_kanji = Charset::new(
+ kanji_levels
+ .iter()
+ .map(|(_, x)| x.to_string())
+ .collect::<Vec<_>>()
+ .join(""),
+ );
+ let kanji_levels = kanji_levels
+ .into_iter()
.map(|(l, x)| (l, Charset::new(x)))
.collect::<Vec<_>>();
let mut ex = read_examples(&all_kanji).expect("read_examples");
@@ -58,18 +65,40 @@ fn main() {
}
batches.push(batch);
}
- fs::write("data/batches.json", serde_json::to_string_pretty(&batches).expect("serialize").as_bytes()).expect("save");
+ fs::write(
+ "data/batches.json",
+ serde_json::to_string_pretty(&batches)
+ .expect("serialize")
+ .as_bytes(),
+ )
+ .expect("save");
+ }
+ Cmd::Simplify => {
+ let mut batches: Vec<Batch> = fs::read("data/batches.json")
+ .map_err(anyhow::Error::from)
+ .and_then(|x| Ok(serde_json::from_slice(&x)?))
+ .unwrap_or_default();
+ for batch in batches.iter_mut() {
+ simplify_batch(batch);
+ }
+ fs::write(
+ "data/batches.json",
+ serde_json::to_string_pretty(&batches)
+ .expect("serialize")
+ .as_bytes(),
+ )
+ .expect("save");
}
Cmd::Format => {
- let jmdict = fs::read_to_string("data/JMdict_e.xml")
- .expect("read_jmdict");
+ let jmdict = fs::read_to_string("data/JMdict_e.xml").expect("read_jmdict");
let jmdict = roxmltree::Document::parse_with_options(
&jmdict,
roxmltree::ParsingOptions {
allow_dtd: true,
..Default::default()
- })
- .expect("parse_jmdict");
+ },
+ )
+ .expect("parse_jmdict");
let jmdict_idx = index_jmdict(&jmdict);
let batches = fs::read("data/batches.json")
@@ -80,7 +109,8 @@ fn main() {
fs::create_dir_all("public").expect("mkdir public");
fs::copy("static/style.css", "public/style.css").expect("copy style.css");
- batches.iter()
+ batches
+ .iter()
.enumerate()
.for_each(|x| format_batch(&jmdict_idx, batches.len(), x));
@@ -90,9 +120,17 @@ fn main() {
}
}
+// =====================================================================
+// PARSING DATA FILES
+// =====================================================================
+
type DictIndex<'a> = HashMap<&'a str, Vec<roxmltree::Node<'a, 'a>>>;
fn index_jmdict<'a>(dict: &'a roxmltree::Document) -> DictIndex<'a> {
- let dict = dict.root().children().find(|x| x.has_tag_name("JMdict")).unwrap();
+ let dict = dict
+ .root()
+ .children()
+ .find(|x| x.has_tag_name("JMdict"))
+ .unwrap();
let mut ret: DictIndex<'a> = HashMap::new();
for x in dict.children().filter(|x| x.has_tag_name("entry")) {
@@ -138,17 +176,19 @@ fn parse_kanjidic() -> Result<Vec<(String, Charset)>> {
}
}
if let Some(lit) = literal {
- levels.entry((jlpt, grade)).or_insert(String::new()).extend(lit.chars());
+ levels
+ .entry((jlpt, grade))
+ .or_insert(String::new())
+ .extend(lit.chars());
}
}
let mut levels = levels.into_iter().collect::<Vec<_>>();
levels.sort_by_key(|((j, g), _)| match (j, g) {
- (Some(j), Some(g)) => (10-*j)*20+*g,
- (Some(j), None) => (10-*j)*20+15,
- (None, Some(g)) => 1000+*g,
- (None, None) => 1015,
-
+ (Some(j), Some(g)) => (10 - *j) * 20 + *g,
+ (Some(j), None) => (10 - *j) * 20 + 15,
+ (None, Some(g)) => 1000 + *g,
+ (None, None) => 1015,
});
let mut ret = Vec::new();
@@ -212,38 +252,87 @@ fn read_examples(all_kanji: &Charset) -> Result<Vec<Example>> {
}
}
if i % 10000 == 0 {
- eprintln!("read examples: {}/300", i/1000);
+ eprintln!("read examples: {}/300", i / 1000);
}
}
Ok(ret)
}
-fn gen_batch(previous: &[Batch], kanji_levels: &[(String, Charset)], examples: &[Example]) -> Result<Batch> {
- let prev_chars = Charset::from_iter(previous.iter()
- .map(|x| x.chars.chars().iter().copied())
- .flatten());
+// =====================================================================
+// BATCH STRUCTURES AND GENERATION
+// =====================================================================
+
+#[derive(Debug, Clone, Serialize, Deserialize)]
+struct Example {
+ ja: String,
+ en: String,
+ expl: String,
+ id: Option<String>,
+ chars: Charset,
+}
+
+#[derive(Debug, Clone, Serialize, Deserialize)]
+struct Batch {
+ level: String,
+ chars: Charset,
+ chars_p1: Charset,
+ chars_p2: Charset,
+ chars_bad: Charset,
+ examples: Vec<Example>,
+}
- let (mut target_i, target_level, mut target_chars) = kanji_levels.iter().enumerate()
+fn gen_batch(
+ previous: &[Batch],
+ kanji_levels: &[(String, Charset)],
+ examples: &[Example],
+) -> Result<Batch> {
+ let prev_chars = Charset::from_iter(
+ previous
+ .iter()
+ .map(|x| x.chars.chars().iter().copied())
+ .flatten(),
+ );
+
+ let (mut target_i, target_level, mut target_chars) = kanji_levels
+ .iter()
+ .enumerate()
.map(|(i, (l, c))| (i, l, c.diff(&prev_chars)))
.find(|(_, _, c)| !c.is_empty())
.ok_or(anyhow!("no more batches to make!"))?;
- let chars_p1 = previous.iter().rev().next()
- .map(|b| b.chars.clone()).unwrap_or(Charset::default());
-
- let chars_p2 = previous.iter().rev().skip(1).next()
- .map(|b| b.chars.clone()).unwrap_or(Charset::default());
+ let chars_p1 = previous
+ .iter()
+ .rev()
+ .next()
+ .map(|b| b.chars.clone())
+ .unwrap_or(Charset::default());
+
+ let chars_p2 = previous
+ .iter()
+ .rev()
+ .skip(1)
+ .next()
+ .map(|b| b.chars.clone())
+ .unwrap_or(Charset::default());
let mut chars_late = Charset::default();
- let mut chars_bad = Charset::from_iter(kanji_levels.iter().skip(target_i+1)
- .map(|(_, c)| c.chars().iter().copied())
- .flatten());
- let mut chars_bad_avoid = Charset::from_iter(kanji_levels.iter().skip(target_i+1)
- .filter(|(l, _)| !l.ends_with("-9") && !l.ends_with("-10"))
- .map(|(_, c)| c.chars().iter().copied())
- .flatten());
+ let mut chars_bad = Charset::from_iter(
+ kanji_levels
+ .iter()
+ .skip(target_i + 1)
+ .map(|(_, c)| c.chars().iter().copied())
+ .flatten(),
+ );
+ let mut chars_bad_avoid = Charset::from_iter(
+ kanji_levels
+ .iter()
+ .skip(target_i + 1)
+ .filter(|(l, _)| !l.ends_with("-9") && !l.ends_with("-10"))
+ .map(|(_, c)| c.chars().iter().copied())
+ .flatten(),
+ );
let mut batch = Batch {
level: target_level.to_string(),
@@ -295,31 +384,39 @@ fn gen_batch(previous: &[Batch], kanji_levels: &[(String, Charset)], examples: &
- 40i32 * ex.chars.inter_len(&chars_bad) as i32
};
*/
- let cost = |ex: &Example, ex_tgt_inter: usize| { (
- - (ex.chars.inter_len(&chars_bad_avoid) as i32),
- ex_tgt_inter,
- ex.chars.inter_len(&chars_late),
- 2*ex.chars.inter_len(&chars_p1) + ex.chars.inter_len(&chars_p2),
- - (ex.ja.chars().count() as i32),
- ) };
- let cand_1 = examples.par_iter()
+ let cost = |ex: &Example, ex_tgt_inter: usize| {
+ (
+ -(ex.chars.inter_len(&chars_bad_avoid) as i32),
+ ex_tgt_inter,
+ ex.chars.inter_len(&chars_late),
+ 2 * ex.chars.inter_len(&chars_p1) + ex.chars.inter_len(&chars_p2),
+ -(ex.ja.chars().count() as i32),
+ )
+ };
+ let cand_1 = examples
+ .par_iter()
.map(|ex| (ex, ex.chars.inter_len(&target_chars)))
- .filter(|(_, ex_tgt_inter)| (1..=4).contains(ex_tgt_inter) && *ex_tgt_inter + batch.chars.len() <= batch_len)
+ .filter(|(_, ex_tgt_inter)| {
+ (1..=4).contains(ex_tgt_inter) && *ex_tgt_inter + batch.chars.len() <= batch_len
+ })
.max_by_key(|(ex, ex_tgt_inter)| cost(ex, *ex_tgt_inter));
- let cand = cand_1.or_else(|| {
- examples.par_iter()
- .map(|ex| (ex, ex.chars.inter_len(&target_chars)))
- .filter(|(_, ex_tgt_inter)| *ex_tgt_inter > 0)
- .max_by_key(|(ex, ex_tgt_inter)| cost(ex, *ex_tgt_inter))
+ let cand = cand_1.or_else(|| {
+ examples
+ .par_iter()
+ .map(|ex| (ex, ex.chars.inter_len(&target_chars)))
+ .filter(|(_, ex_tgt_inter)| *ex_tgt_inter > 0)
+ .max_by_key(|(ex, ex_tgt_inter)| cost(ex, *ex_tgt_inter))
});
if let Some((ex, _)) = cand {
- eprintln!("* add {} (rep: {}, p1: {}, p2: {}, bad: {}) {}",
+ eprintln!(
+ "* add {} (rep: {}, p1: {}, p2: {}, bad: {}) {}",
ex.chars.inter(&target_chars).to_string(),
ex.chars.inter(&batch.chars).to_string(),
ex.chars.inter(&chars_p1).to_string(),
ex.chars.inter(&chars_p2).to_string(),
ex.chars.inter(&chars_bad).to_string(),
- ex.ja);
+ ex.ja
+ );
batch.chars = batch.chars.union(&ex.chars.inter(&target_chars));
target_chars = target_chars.diff(&ex.chars);
chars_late = chars_late.diff(&ex.chars);
@@ -328,7 +425,10 @@ fn gen_batch(previous: &[Batch], kanji_levels: &[(String, Charset)], examples: &
stalled = false;
} else {
if stalled {
- eprintln!("could not find suitable sentence, stopping batch now (need {})", need);
+ eprintln!(
+ "could not find suitable sentence, stopping batch now (need {})",
+ need
+ );
break;
}
stalled = true;
@@ -342,20 +442,63 @@ fn gen_batch(previous: &[Batch], kanji_levels: &[(String, Charset)], examples: &
Ok(batch)
}
+fn simplify_batch(batch: &mut Batch) {
+ let mut char_cnt = HashMap::<char, usize>::new();
+ for ex in batch.examples.iter() {
+ for ch in batch.chars.inter(&ex.chars).chars() {
+ *char_cnt.entry(*ch).or_default() += 1;
+ }
+ }
+
+ loop {
+ let i_opt = batch.examples.iter().position(|ex| {
+ batch
+ .chars
+ .inter(&ex.chars)
+ .chars()
+ .iter()
+ .all(|x| char_cnt[x] >= 2)
+ });
+ if let Some(i) = i_opt {
+ println!(
+ "Removing {} [{}]",
+ batch.examples[i].ja,
+ batch.examples[i].chars.to_string()
+ );
+ batch.examples.remove(i);
+ } else {
+ break;
+ }
+ }
+}
+
+// =====================================================================
+// FORMATTING TO HTML
+// =====================================================================
+
fn format_batch<'a>(dict_idx: &DictIndex<'a>, count: usize, (i, batch): (usize, &Batch)) {
format_batch_aux(dict_idx, count, i, batch).expect("format batch");
}
-fn format_batch_aux<'a>(dict_idx: &DictIndex<'a>, count: usize, i: usize, batch: &Batch) -> Result<()> {
+fn format_batch_aux<'a>(
+ dict_idx: &DictIndex<'a>,
+ count: usize,
+ i: usize,
+ batch: &Batch,
+) -> Result<()> {
let mut f = io::BufWriter::new(fs::File::create(format!("public/{:03}.html", i))?);
- write!(f, r#"<!DOCTYPE html>
+ write!(
+ f,
+ r#"<!DOCTYPE html>
<html>
<head>
<meta charset=\"UTF-8\" />
<title>Batch #{:03}</title>
<link rel="stylesheet" type="text/css" href="style.css" />
</head>
- <body>"#, i)?;
+ <body>"#,
+ i
+ )?;
writeln!(f, r#"<p><a href="index.html">index</a>"#)?;
for j in 0..count {
@@ -368,7 +511,11 @@ fn format_batch_aux<'a>(dict_idx: &DictIndex<'a>, count: usize, i: usize, batch:
writeln!(f, r#"</p>"#)?;
writeln!(f, "<p>Level: {}</p>", batch.level)?;
- writeln!(f, r#"<p class="ja chars">【{}】</p>"#, batch.chars.to_string())?;
+ writeln!(
+ f,
+ r#"<p class="ja chars">【{}】</p>"#,
+ batch.chars.to_string()
+ )?;
for ex in batch.examples.iter() {
writeln!(f, "<hr />")?;
@@ -415,7 +562,11 @@ fn format_batch_aux<'a>(dict_idx: &DictIndex<'a>, count: usize, i: usize, batch:
}
writeln!(f, r#"<p class="chars">"#)?;
for c in ex.chars.inter(&batch.chars).chars().iter() {
- writeln!(f, r#"<a href="https://jisho.org/search/{}%20%23kanji">{}</a>"#, c, c)?;
+ writeln!(
+ f,
+ r#"<a href="https://jisho.org/search/{}%20%23kanji">{}</a>"#,
+ c, c
+ )?;
}
writeln!(f, r#"</p>"#)?;
for be in expl_all {
@@ -436,7 +587,8 @@ fn expl_clean_word(w: &str) -> (&str, Option<&str>) {
ret = s;
}
}
- let p = w.split_once('(')
+ let p = w
+ .split_once('(')
.and_then(|(_, r)| r.split_once(')'))
.map(|(p, _)| p);
(ret, p)
@@ -467,45 +619,63 @@ fn dict_str<'a>(qkeb: &str, qreb: Option<&str>, ent: &roxmltree::Node<'a, 'a>) -
fn format_index(batches: &[Batch], kanji_levels: &[(String, String)]) -> Result<()> {
let mut f = io::BufWriter::new(fs::File::create("public/index.html")?);
- write!(f, r#"<!DOCTYPE html>
+ write!(
+ f,
+ r#"<!DOCTYPE html>
<html>
<head>
<meta charset=\"UTF-8\" />
<title>List of batches</title>
<link rel="stylesheet" type="text/css" href="style.css" />
</head>
- <body>"#)?;
+ <body>"#
+ )?;
writeln!(f, "<table>")?;
writeln!(f, "<tr><th>Num</th><th>Level</th><th>Chars</th><th>Examples</th><th>Review</th><th>Ignore</th></tr>")?;
for (i, batch) in batches.iter().enumerate() {
- writeln!(f, r#"<tr><td><a href="{:03}.html">{:03}</a></td><td>{}</td><td>{}</td><td>&nbsp;&nbsp;{}</td><td>{}</td><td>{}</td></tr>"#,
- i, i,
- batch.level,
- batch.chars.to_string(),
- batch.examples.len(),
- batch.chars_p1.union(&batch.chars_p2).to_string(),
- batch.chars_bad.to_string())?;
+ writeln!(
+ f,
+ r#"<tr><td><a href="{:03}.html">{:03}</a></td><td>{}</td><td>{}</td><td>&nbsp;&nbsp;{}</td><td>{}</td><td>{}</td></tr>"#,
+ i,
+ i,
+ batch.level,
+ batch.chars.to_string(),
+ batch.examples.len(),
+ batch.chars_p1.union(&batch.chars_p2).to_string(),
+ batch.chars_bad.to_string()
+ )?;
}
writeln!(f, r#"</table>"#)?;
writeln!(f, "<hr />")?;
- let all_chars = Charset::from_iter(batches.iter()
- .map(|x| x.chars.chars().iter().copied())
- .flatten());
+ let all_chars = Charset::from_iter(
+ batches
+ .iter()
+ .map(|x| x.chars.chars().iter().copied())
+ .flatten(),
+ );
writeln!(f, "<table>")?;
- writeln!(f, "<tr><th>Level</th><th>Count</th><th>Chars</th><th>Missing chars</th></tr>")?;
+ writeln!(
+ f,
+ "<tr><th>Level</th><th>Count</th><th>Chars</th><th>Missing chars</th></tr>"
+ )?;
for (lvl, chars) in kanji_levels.iter() {
if lvl == "N0+" || lvl == "N0-9" || lvl.ends_with("-10") {
continue;
}
let chars = Charset::new(chars);
let missing = chars.diff(&all_chars);
- writeln!(f, "<tr><td>{}</td><td>{}</td><td>{}</td><td>{} ({})</td></tr>",
+ writeln!(
+ f,
+ "<tr><td>{}</td><td>{}</td><td>{}</td><td>{} ({})</td></tr>",
lvl,
- chars.len(), chars.to_string(),
- missing.to_string(), missing.len())?;
+ chars.len(),
+ chars.to_string(),
+ missing.to_string(),
+ missing.len()
+ )?;
}
writeln!(f, "</table>")?;
@@ -513,207 +683,3 @@ fn format_index(batches: &[Batch], kanji_levels: &[(String, String)]) -> Result<
f.flush()?;
Ok(())
}
-
-#[derive(Debug, Clone, Serialize, Deserialize)]
-struct Example {
- ja: String,
- en: String,
- expl: String,
- id: Option<String>,
- chars: Charset,
-}
-
-#[derive(Debug, Clone, Serialize, Deserialize)]
-struct Batch {
- level: String,
- chars: Charset,
- chars_p1: Charset,
- chars_p2: Charset,
- chars_bad: Charset,
- examples: Vec<Example>,
-}
-
-#[derive(Debug, Eq, PartialEq, Hash, Clone, Serialize, Deserialize, Default)]
-struct Charset(Vec<char>);
-
-impl Charset {
- fn new<S: AsRef<str>>(s: S) -> Self {
- let mut chars = s.as_ref().chars().collect::<Vec<_>>();
- chars.sort();
- chars.dedup();
- Self(chars)
- }
- fn from_iter<S: IntoIterator<Item=char>>(s: S) -> Self {
- let mut chars = s.into_iter().collect::<Vec<_>>();
- chars.sort();
- chars.dedup();
- Self(chars)
- }
- fn intersects(&self, other: &Self) -> bool {
- let mut it1 = self.0.iter().peekable();
- let mut it2 = other.0.iter().peekable();
- while let (Some(c1), Some(c2)) = (it1.peek(), it2.peek()) {
- match c1.cmp(c2) {
- Ordering::Equal => return true,
- Ordering::Less => it1.next(),
- Ordering::Greater => it2.next(),
- };
- }
- false
- }
- fn inter_len(&self, other: &Self) -> usize {
- if other.len() > 20*self.len() {
- // alternative path
- return self.0.iter()
- .filter(|x| other.0.binary_search(x).is_ok())
- .count();
- }
- let mut it1 = self.0.iter().peekable();
- let mut it2 = other.0.iter().peekable();
- let mut ret = 0;
- while let (Some(c1), Some(c2)) = (it1.peek(), it2.peek()) {
- match c1.cmp(c2) {
- Ordering::Equal => {
- ret += 1;
- it1.next();
- it2.next();
- }
- Ordering::Less => {
- it1.next();
- }
- Ordering::Greater => {
- it2.next();
- }
- };
- }
- ret
- }
- fn inter(&self, other: &Self) -> Charset {
- let mut it1 = self.0.iter().peekable();
- let mut it2 = other.0.iter().peekable();
- let mut ret = Vec::new();
- while let (Some(c1), Some(c2)) = (it1.peek(), it2.peek()) {
- match c1.cmp(c2) {
- Ordering::Equal => {
- ret.push(**c1);
- it1.next();
- it2.next();
- }
- Ordering::Less => {
- it1.next();
- }
- Ordering::Greater => {
- it2.next();
- }
- };
- }
- Self(ret)
- }
- fn union(&self, other: &Self) -> Charset {
- let mut it1 = self.0.iter().peekable();
- let mut it2 = other.0.iter().peekable();
- let mut ret = Vec::new();
- while let (Some(c1), Some(c2)) = (it1.peek(), it2.peek()) {
- match c1.cmp(c2) {
- Ordering::Equal => {
- ret.push(**c1);
- it1.next();
- it2.next();
- }
- Ordering::Less => {
- ret.push(**c1);
- it1.next();
- }
- Ordering::Greater => {
- ret.push(**c2);
- it2.next();
- }
- };
- }
- while let Some(c) = it1.peek() {
- ret.push(**c);
- it1.next();
- }
- while let Some(c) = it2.peek() {
- ret.push(**c);
- it2.next();
- }
- Self(ret)
- }
- fn diff(&self, other: &Self) -> Charset {
- let mut it1 = self.0.iter().peekable();
- let mut it2 = other.0.iter().peekable();
- let mut ret = Vec::new();
- while let (Some(c1), Some(c2)) = (it1.peek(), it2.peek()) {
- match c1.cmp(c2) {
- Ordering::Equal => {
- it1.next();
- it2.next();
- }
- Ordering::Less => {
- ret.push(**c1);
- it1.next();
- }
- Ordering::Greater => {
- it2.next();
- }
- };
- }
- while let Some(c) = it1.peek() {
- ret.push(**c);
- it1.next();
- }
- Self(ret)
- }
- fn len(&self) -> usize {
- self.0.len()
- }
- fn is_empty(&self) -> bool {
- self.0.is_empty()
- }
- fn chars(&self) -> &[char] {
- &self.0
- }
- fn contains(&self, c: char) -> bool {
- self.0.binary_search(&c).is_ok()
- }
- fn to_string(&self) -> String {
- self.0.iter().collect::<String>()
- }
-}
-
-#[cfg(test)]
-mod test {
- use super::*;
-
- #[test]
- fn test_charset() {
- let c1 = Charset::new("azerty");
- let c2 = Charset::new("uiopqsqdf");
- let c3 = Charset::new("hello, world");
-
- assert!(!c1.intersects(&c2));
- assert!(c1.intersects(&c3));
- assert!(c2.intersects(&c3));
-
- assert_eq!(c1.inter_len(&c2), 0);
- assert_eq!(c1.inter_len(&c3), 2);
- assert_eq!(c2.inter_len(&c3), 2);
-
- assert_eq!(c1.inter(&c2), Charset::new(""));
- assert_eq!(c1.inter(&c3), Charset::new("er"));
- assert_eq!(c2.inter(&c3), Charset::new("od"));
-
- assert_eq!(c1.union(&c2), Charset::new("azertyuiopqsdf"));
- assert_eq!(c1.union(&c3), Charset::new("azertyhello, world"));
- assert_eq!(c2.union(&c3), Charset::new("uiopqsdfhello, world"));
-
- assert_eq!(c1.diff(&c2), Charset::new("azerty"));
- assert_eq!(c1.diff(&c3), Charset::new("azty"));
- assert_eq!(c2.diff(&c3), Charset::new("uipqsf"));
-
- assert_eq!(c2.diff(&c1), Charset::new("uiopqsdf"));
- assert_eq!(c3.diff(&c1), Charset::new("hllo, wold"));
- assert_eq!(c3.diff(&c2), Charset::new("hell, wrl"));
- }
-}