aboutsummaryrefslogblamecommitdiff
path: root/src/main.rs
blob: 7ecc195705130f991c2d9139aa6d6f9790711d07 (plain) (tree)
1
2
3
4
5
6
7
8
9
10

                              
                                    
 
                   
                     
                      
                                    

                         


                     








                                                                           
                  
                   


                                          

                                       
      
             
            
             
                
           







                                                          

                                                             
             
         

                                                                               

                                                                                          

                                                         
                                         
                                                                               








                                                

                                                   





                                                                           


                                       
                                                                           

                                                                      











                                                                       
                                                         









                                                      
         
                         


                                                                       
                                                         
                                                                               




                                                         







                                                      



                                                                       
                                                         









                                                                         















                                                                               
                                                         










                                                      
                        
                                                                                       




                                                                 


                                    

                                                   



                                                                            



                                                                                      

                       
                            
                                                                           


                                                                               
                                                  



         



                                                                        

                                                                     




                                           













                                                                             
                                                       

                                                                                  



















                                                         


                                                                                


                                                                                 


                 



                                                 
                                    






                                                                                     



                                         


         

                                                            



                                                  





                                               

                                                     

                                                          

                                              








                                                   















                                                               
                                                                   













                                                                  
                                                                     






                                                
                                                                     




                           
                           
                                                                 





           
                                                          









                                   







                                   




                                                        

                                           
             




                                        
             




                                        
             




                                        
             







                                        





                                      




                             












                                                            
                                    





               





                             














                                                                 
                                        












                                                                      
                                                                      








                                                                   


















                                                            



                                                                        



                                                          







                       
                                                                   






                           

                                

                                 
 
 


                             

                                       






                                                         
                           
          






                                                 

                                 

                                                                          








                                                                                   
 
                                                                             















                                                                                        
                                               








                                                         




                                    


                                            
 












                                                                                     








                                       
 


                             

                        




                                        
                                                                                 
             
                                                                                            
              



                                                                  
      
 


                                                                                          
 





                                            
 
          


                                                                  
                                
           



























                                                                                              
 







                                                                                              

                                                                                      


                                                                      


                                      








                                                                                 


                                                                   




                                                                                                   
                                  
         

                                                                                   
                                                                         
                                                      
                               
                            

                                          

                                      
                                        






                                                     
                                                     




                         


















                                                                                

                                  
                                                                
                                           




                                                                    













                                                             
                                                       
 






                                                  
                                        
                                        

                            


















                                                                            
                                                          








                                                                         







                                                                                 


























                                                                    





























                                                         













                                                                                   
                                     


                                                                   




                                       






                                                    
     

 

                                                              
                                            





                                            
                                                     

                                               
                                   






                                                         

                                                      

                                                  
                                     



                                                             








                                                                                             

                                 
                                            







                                              



                                                                               

 




                                                                  

                                 






                                                                     



                                          
                                                                               
















                                                                                                                        
                                                                                         
 






                                               



                                                                        

                                                                                          

 





                             
                                                                                       


                          





                                                                          
                                             

         
 
                                                         








                                                                     
 













                                                                             

















                                                                     




                                                                   


                                                             


                                              
                                                   
                                        





                                                              






                                             

                                                              




                                                                             

                                



                                             

     
                           

                 






                                                       

                 






                                                        
 



                                                                                                    
                                           

                 
                                                                                                                                        

                        


                                          


                                             
                                      



               

                                                                                  




                                                                          


                        
                                                                                                                                                          







                                              
                                                     





                                                     

                        


                                             

 
                                                                                                  

                                                                          
                                         
 




                                                









                                                                             
             

 
                                                                                     
                                                                           


                          





                                                                          
                                            
       
 

                                                                            
                            
                                                                                                                                        
                                                  

                 
                                                                                                                                                                                                                




                                    

                                       

                                       




                                





                                                     
                            

             
                                                                                                  
       
                                             
                                                 

                     

                                             

                 
                                                                                                                         
                




                                


                             
                                       


               















                                                                           



                                                                  
 




                                                            




                                            
use std::collections::HashMap;
use std::fs;
use std::io::{self, BufRead, Write};

use anyhow::Result;
use rand::prelude::*;
use rayon::prelude::*;
use serde::{Deserialize, Serialize};
use structopt::StructOpt;

mod charset;
use charset::Charset;

#[derive(Debug, StructOpt)]
#[structopt(name = "datagengo", about = "Japanese example practice maker")]
struct Opt {
    #[structopt(subcommand)]
    cmd: Cmd,
}

#[derive(Debug, StructOpt)]
enum Cmd {
    ParseKanjidic,
    ParseJlptVocab,
    New {
        #[structopt(default_value = "10")]
        count: usize,
        #[structopt(long = "truncate")]
        truncate: Option<usize>,
    },
    Simplify,
    Cleanup,
    AddVocab,
    AddExamples,
    Format,
}

fn main() {
    let opt = Opt::from_args();

    match opt.cmd {
        Cmd::ParseKanjidic => {
            let levels = parse_kanjidic().expect("error");
            for (level, chars) in levels.iter() {
                println!("{}: {}", level, chars.to_string());
            }
        }
        Cmd::ParseJlptVocab => {
            let kanji_levels = read_kanji_levels().expect("read_kanji_levels");
            let all_kanji =
                Charset::from_iter(kanji_levels.iter().map(|(_, c)| c.chars()).flatten());
            parse_jlpt_vocab(&all_kanji).expect("error");
        }
        Cmd::New { truncate, count } => {
            let kanji_levels = read_kanji_levels().expect("read_kanji_levels");
            let all_kanji = Charset::new(
                kanji_levels
                    .iter()
                    .map(|(_, x)| x.to_string())
                    .collect::<Vec<_>>()
                    .join(""),
            );
            let kanji_levels = kanji_levels
                .into_iter()
                .map(|(l, x)| (l, Charset::new(x)))
                .collect::<Vec<_>>();
            let mut ex = read_examples(&all_kanji).expect("read_examples");
            ex.retain(|e| (5..=25).contains(&e.ja.chars().count()));
            let mut batches: Vec<Batch> = fs::read("data/batches.json")
                .map_err(anyhow::Error::from)
                .and_then(|x| Ok(serde_json::from_slice(&x)?))
                .unwrap_or_default();
            if let Some(t) = truncate {
                batches.truncate(t);
            }
            println!("---- starting after {} batches ----", batches.len());
            let target_len = batches.len() + count;
            gen_batches(&mut batches, target_len, &kanji_levels, &ex);
            fs::write(
                "data/batches.json",
                serde_json::to_string_pretty(&batches)
                    .expect("serialize")
                    .as_bytes(),
            )
            .expect("save");
        }
        Cmd::Simplify => {
            let mut batches: Vec<Batch> = fs::read("data/batches.json")
                .map_err(anyhow::Error::from)
                .and_then(|x| Ok(serde_json::from_slice(&x)?))
                .expect("failed to decode batches.json");
            for batch in batches.iter_mut() {
                simplify_batch(batch);
            }
            fs::write(
                "data/batches.json",
                serde_json::to_string_pretty(&batches)
                    .expect("serialize")
                    .as_bytes(),
            )
            .expect("save");
        }
        Cmd::Cleanup => {
            let mut batches: Vec<Batch> = fs::read("data/batches.json")
                .map_err(anyhow::Error::from)
                .and_then(|x| Ok(serde_json::from_slice(&x)?))
                .expect("failed to decode batches.json");
            let kanji_levels = read_kanji_levels().expect("read_kanji_levels");
            let kanji_levels = kanji_levels
                .into_iter()
                .map(|(l, x)| (l, Charset::new(x)))
                .collect::<Vec<_>>();
            cleanup_batches(&mut batches, &kanji_levels);
            fs::write(
                "data/batches.json",
                serde_json::to_string_pretty(&batches)
                    .expect("serialize")
                    .as_bytes(),
            )
            .expect("save");
        }
        Cmd::AddVocab => {
            let mut batches: Vec<Batch> = fs::read("data/batches.json")
                .map_err(anyhow::Error::from)
                .and_then(|x| Ok(serde_json::from_slice(&x)?))
                .expect("failed to decode batches.json");
            let jlpt_vocab = load_jlpt_vocab().expect("load_jlpt_vocab");
            add_vocab(&mut batches, &jlpt_vocab);
            fs::write(
                "data/batches.json",
                serde_json::to_string_pretty(&batches)
                    .expect("serialize")
                    .as_bytes(),
            )
            .expect("save");
        }
        Cmd::AddExamples => {
            let kanji_levels = read_kanji_levels().expect("read_kanji_levels");
            let all_kanji = Charset::new(
                kanji_levels
                    .iter()
                    .map(|(_, x)| x.to_string())
                    .collect::<Vec<_>>()
                    .join(""),
            );

            let mut ex = read_examples(&all_kanji).expect("read_examples");
            ex.retain(|e| (5..=25).contains(&e.ja.chars().count()));

            let mut batches: Vec<Batch> = fs::read("data/batches.json")
                .map_err(anyhow::Error::from)
                .and_then(|x| Ok(serde_json::from_slice(&x)?))
                .expect("failed to decode batches.json");

            add_examples(&mut batches, &ex);

            fs::write(
                "data/batches.json",
                serde_json::to_string_pretty(&batches)
                    .expect("serialize")
                    .as_bytes(),
            )
            .expect("save");
        }
        Cmd::Format => {
            let jmdict = fs::read_to_string("data/JMdict_e.xml").expect("read_jmdict");
            let jmdict = roxmltree::Document::parse_with_options(
                &jmdict,
                roxmltree::ParsingOptions {
                    allow_dtd: true,
                    ..Default::default()
                },
            )
            .expect("parse_jmdict");
            let jmdict_idx = index_jmdict(&jmdict);

            let batches = fs::read("data/batches.json")
                .map_err(anyhow::Error::from)
                .and_then(|x| Ok(serde_json::from_slice::<Vec<Batch>>(&x)?))
                .expect("read/parse");

            fs::create_dir_all("public").expect("mkdir public");
            fs::copy("static/style.css", "public/style.css").expect("copy style.css");

            batches
                .iter()
                .enumerate()
                .for_each(|x| format_batch(&jmdict_idx, batches.len(), x));

            let kanji_levels = read_kanji_levels().expect("read_kanji_levels");
            format_index(&batches, &kanji_levels).expect("format_index");
            format_about().expect("format_about");
        }
    }
}

// =====================================================================
//                      PARSING DATA FILES
// =====================================================================

type DictIndex<'a> = HashMap<&'a str, Vec<roxmltree::Node<'a, 'a>>>;
fn index_jmdict<'a>(dict: &'a roxmltree::Document) -> DictIndex<'a> {
    let dict = dict
        .root()
        .children()
        .find(|x| x.has_tag_name("JMdict"))
        .unwrap();

    let mut ret: DictIndex<'a> = HashMap::new();
    for x in dict.children().filter(|x| x.has_tag_name("entry")) {
        for r in x.children().filter(|x| x.has_tag_name("k_ele")) {
            if let Some(keb) = r.children().find(|x| x.has_tag_name("keb")) {
                let txt = keb.text().unwrap().trim();
                ret.entry(txt).or_default().push(x);
            }
        }
    }

    ret
}

fn parse_kanjidic() -> Result<Vec<(String, Charset)>> {
    let n3_kanji = Charset::new(&fs::read_to_string("data/n3_kanji.txt")?.trim());

    let file = fs::read_to_string("data/kanjidic2.xml")?;
    let xml = roxmltree::Document::parse(&file)?;
    let kanjidic = xml.root().first_child().unwrap();
    assert!(kanjidic.has_tag_name("kanjidic2"));

    let mut levels = HashMap::new();

    for x in kanjidic.children() {
        if !x.has_tag_name("character") {
            continue;
        }
        let mut literal = None;
        let mut jlpt = None;
        let mut grade = None;
        for y in x.children() {
            if y.has_tag_name("literal") {
                literal = y.text();
            }
            if y.has_tag_name("misc") {
                for z in y.children() {
                    if z.has_tag_name("jlpt") {
                        jlpt = z.text().and_then(|x| str::parse::<i32>(x).ok());
                    }
                    if z.has_tag_name("grade") {
                        grade = z.text().and_then(|x| str::parse::<i32>(x).ok());
                    }
                }
            }
        }
        match grade {
            Some(i) if i <= 6 => grade = Some(7),
            _ => (),
        }
        if let Some(lit) = literal {
            assert_eq!(lit.chars().count(), 1);
            let jlpt = match jlpt {
                Some(4) => Some(5),
                Some(3) => Some(4),
                Some(2) if n3_kanji.contains(lit.chars().next().unwrap()) => Some(3),
                x => x,
            };
            levels
                .entry((jlpt, grade))
                .or_insert(String::new())
                .extend(lit.chars());
        }
    }

    let mut levels = levels.into_iter().collect::<Vec<_>>();
    levels.sort_by_key(|((j, g), _)| match (j, g) {
        (Some(j), Some(g)) => (10 - *j) * 20 + *g,
        (Some(j), None) => (10 - *j) * 20 + 15,
        (None, Some(g)) => 1000 + *g,
        (None, None) => 1015,
    });

    let mut ret = Vec::new();
    let mut pc = Charset::default();
    for ((j, g), chars) in levels.into_iter() {
        let name = match (j, g) {
            (Some(j), Some(7)) => format!("N{}a", j),
            (Some(j), Some(8)) => format!("N{}b", j),
            (Some(j), Some(g)) => format!("N{}-{}", j, g),
            (Some(j), None) => format!("N{}+", j),
            (None, Some(7)) => format!("N0a"),
            (None, Some(8)) => format!("N0b"),
            (None, Some(g)) => format!("N0-{}", g),
            (None, None) => format!("N0+"),
        };
        let chars = Charset::new(chars).diff(&pc);
        pc = pc.union(&chars);
        ret.push((name, chars));
    }

    Ok(ret)
}

fn read_kanji_levels() -> Result<Vec<(String, String)>> {
    Ok(fs::read_to_string("data/kanji_levels.txt")?
        .lines()
        .filter_map(|l| l.split_once(": "))
        .map(|(l, k)| (l.to_string(), k.to_string()))
        .collect::<Vec<_>>())
}

fn read_examples(all_kanji: &Charset) -> Result<Vec<Example>> {
    let file = fs::File::open("data/examples.utf")?;

    let mut ret = Vec::new();
    let mut a = "".to_string();

    for (i, line) in io::BufReader::new(file).lines().enumerate() {
        let line = line?;
        if line.starts_with("A:") {
            a = line;
        } else if line.starts_with("B:") {
            let s = a.strip_prefix("A: ");
            let t = line.strip_prefix("B: ");
            if let (Some(a), Some(b)) = (s, t) {
                if let Some((ja, eng)) = a.split_once("\t") {
                    if let Some((eng, id)) = eng.split_once("#") {
                        ret.push(Example {
                            ja: ja.to_string(),
                            en: eng.to_string(),
                            expl: b.to_string(),
                            id: Some(id.to_string()),
                            chars: Charset::new(ja).inter(all_kanji),
                        });
                    } else {
                        ret.push(Example {
                            ja: ja.to_string(),
                            en: eng.to_string(),
                            expl: b.to_string(),
                            id: None,
                            chars: Charset::new(ja).inter(all_kanji),
                        });
                    }
                }
            }
        }
        if i % 10000 == 0 {
            eprintln!("read examples: {}/300 (x1000)", i / 1000);
        }
    }

    Ok(ret)
}

#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)]
struct JlptVocab {
    level: String,
    chars: Charset,
    kanji: String,
    kana: String,
    en: String,
}

impl JlptVocab {
    fn to_string(&self) -> String {
        format!(
            "{}\t{}\t{}\t{}\t{}",
            self.level,
            self.chars.to_string(),
            self.kanji,
            self.kana,
            self.en
        )
    }
}

fn parse_jlpt_vocab(all_kanji: &Charset) -> Result<()> {
    let mut vocab = vec![];
    vocab.extend(parse_jlpt_vocab_combined(
        "data/n5_vocab.txt",
        "N5",
        all_kanji,
    )?);
    vocab.extend(parse_jlpt_vocab_split(
        "data/n4_vocab_hiragana.txt",
        "data/n4_vocab_eng.txt",
        "N4",
        all_kanji,
    )?);
    vocab.extend(parse_jlpt_vocab_split(
        "data/n3_vocab_hiragana.txt",
        "data/n3_vocab_eng.txt",
        "N3",
        all_kanji,
    )?);
    vocab.extend(parse_jlpt_vocab_split(
        "data/n2_vocab_hiragana.txt",
        "data/n2_vocab_eng.txt",
        "N2",
        all_kanji,
    )?);
    vocab.extend(parse_jlpt_vocab_split(
        "data/n1_vocab_hiragana.txt",
        "data/n1_vocab_eng.txt",
        "N1",
        all_kanji,
    )?);
    for v in vocab.iter() {
        println!("{}", v.to_string());
    }
    Ok(())
}

fn parse_jlpt_vocab_combined(
    file: &str,
    level: &str,
    all_kanji: &Charset,
) -> Result<Vec<JlptVocab>> {
    let lines = jlpt_vocab_read_file(file)?;
    let mut ret = vec![];
    for (kanji, answer) in lines {
        let (eng, kana) = match answer.split_once('\n') {
            Some((a, b)) => (a, b.trim()),
            None => (answer.trim(), ""),
        };
        for kanji in kanji.split('/') {
            ret.push(JlptVocab {
                level: level.to_string(),
                chars: Charset::new(kanji).inter(all_kanji),
                kanji: kanji.to_string(),
                kana: kana.to_string(),
                en: eng.to_string(),
            });
        }
    }
    Ok(ret)
}

fn parse_jlpt_vocab_split(
    kana_file: &str,
    eng_file: &str,
    level: &str,
    all_kanji: &Charset,
) -> Result<Vec<JlptVocab>> {
    let eng_lines = jlpt_vocab_read_file(eng_file)?
        .into_iter()
        .collect::<HashMap<String, String>>();

    let lines = jlpt_vocab_read_file(kana_file)?;
    let mut ret = vec![];
    for (kanji, kana) in lines {
        let eng = eng_lines.get(&kanji).or(eng_lines.get(&kana));
        if let Some(eng) = eng {
            for kanji in kanji.split('/') {
                ret.push(JlptVocab {
                    level: level.to_string(),
                    chars: Charset::new(kanji).inter(all_kanji),
                    kanji: kanji.to_string(),
                    kana: kana.to_string(),
                    en: eng.to_string(),
                });
            }
        }
    }
    Ok(ret)
}

fn jlpt_vocab_read_file(file: &str) -> Result<Vec<(String, String)>> {
    let re = regex::Regex::new(r#"<span class="\w+">"#)?;

    let file = fs::File::open(file)?;
    let mut ret = vec![];
    for line in io::BufReader::new(file).lines() {
        let line = line?.replace("<br>", "\n").replace("</span>", "");
        let line = re.replace_all(&line, "");
        if let Some((a, b)) = line.split_once('|') {
            ret.push((a.trim().to_string(), b.trim().to_string()));
        }
    }

    Ok(ret)
}

fn load_jlpt_vocab() -> Result<Vec<JlptVocab>> {
    let file = fs::File::open("data/jlpt_vocab.txt")?;
    let mut ret = vec![];
    for line in io::BufReader::new(file).lines() {
        let line = line?;
        let line = line.splitn(5, "\t").collect::<Vec<_>>();
        if line.len() == 5 {
            ret.push(JlptVocab {
                level: line[0].to_string(),
                chars: Charset::new(line[1]),
                kanji: line[2].to_string(),
                kana: line[3].to_string(),
                en: line[4].to_string(),
            });
        }
    }
    Ok(ret)
}

// =====================================================================
//                      BATCH STRUCTURES AND GENERATION
// =====================================================================

const CHARS_PER_BATCH: usize = 20;
const MAX_NEW_CHARS_PER_EX: usize = 5;

#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
struct Example {
    ja: String,
    en: String,
    expl: String,
    id: Option<String>,
    chars: Charset,
}

#[derive(Debug, Clone, Serialize, Deserialize, Default, PartialEq)]
struct Batch {
    level: String,
    chars: Charset,
    chars_p1: Charset,
    chars_p2: Charset,
    chars_bad: Charset,
    examples: Vec<Example>,
    #[serde(default)]
    extra_vocab: Vec<JlptVocab>,
    #[serde(default)]
    extra_examples: Vec<Example>,
}

fn gen_batches(
    batches: &mut Vec<Batch>,
    target_len: usize,
    kanji_levels: &[(String, Charset)],
    examples: &[Example],
) {
    let mut remainder = None;
    while batches.len() < target_len {
        let done = Charset::from_iter(
            batches
                .iter()
                .map(|x| x.chars.chars().iter().copied())
                .flatten(),
        );
        let remainder_chars = remainder
            .as_ref()
            .map(|x: &Batch| x.chars.clone())
            .unwrap_or_default();

        let remainder_before = remainder.clone();
        let len_before = batches.len();

        let mut advanced = false;
        for (i, (level, level_chars)) in kanji_levels.iter().enumerate() {
            let diff = level_chars.diff(&done).diff(&remainder_chars);
            if !diff.is_empty() {
                let avoid = Charset::from_iter(
                    kanji_levels
                        .iter()
                        .skip(i + 1)
                        .filter(|(l, _)| !l.ends_with("-9") && !l.ends_with("-10"))
                        .map(|(_, c)| c.chars().iter().copied())
                        .flatten(),
                );

                let level_examples = level_examples(&diff, &avoid, examples);
                let level_new_chars = Charset::from_iter(
                    level_examples
                        .iter()
                        .map(|x| x.chars.chars().iter().copied())
                        .flatten(),
                )
                .inter(&diff);
                println!(
                    "- {} ({} chars): {} done previously, {} diff, {} ex, {} new chars",
                    level,
                    level_chars.len(),
                    done.len(),
                    diff.len(),
                    level_examples.len(),
                    level_new_chars.len()
                );
                if !level_examples.is_empty() {
                    assert!(!level_new_chars.is_empty());
                    remainder = gen_level(
                        batches,
                        level,
                        &level_new_chars,
                        &done,
                        level_examples,
                        remainder,
                    );
                    advanced = true;
                    break;
                }
            }
        }
        if let Some(r) = &remainder {
            assert!(r.examples.len() <= 20);
        }

        if advanced && batches.len() == len_before && remainder == remainder_before {
            // restart level with new rng
            let last_level = batches.last().unwrap().level.to_string();
            println!("RESTARTING LEVEL {}, hopefully new RNG", last_level);
            while batches
                .last()
                .map(|x| x.level == last_level)
                .unwrap_or(false)
            {
                batches.pop();
                remainder = None;
            }
        } else if !advanced {
            break;
        }
    }
    if let Some(r) = remainder {
        if batches.len() < target_len {
            batches.push(r);
        }
    }
}

fn gen_level(
    batches: &mut Vec<Batch>,
    level: &str,
    new_chars: &Charset,
    prev_done: &Charset,
    mut examples: Vec<&Example>,
    mut remainder: Option<Batch>,
) -> Option<Batch> {
    examples.shuffle(&mut thread_rng());

    let remainder_chars = remainder.as_ref().map(|x| x.chars.len()).unwrap_or(0);
    println!(
        "Level {}: {} characters using {} examples, remainder has {} chars and {} examples",
        level,
        new_chars.len(),
        examples.len(),
        remainder_chars,
        remainder.as_ref().map(|x| x.examples.len()).unwrap_or(0),
    );

    let avg_len = examples.len() as f32 * CHARS_PER_BATCH as f32 / new_chars.len() as f32;
    let mut batch_count = 0;
    let mut sum_len = 0;

    let mut done = prev_done.union(
        remainder
            .as_ref()
            .map(|x| &x.chars)
            .unwrap_or(&Charset::default()),
    );

    loop {
        println!("iter with {} examples", examples.len());
        let mut batch = remainder.take().unwrap_or_else(|| Batch {
            level: level.to_string(),
            ..Default::default()
        });
        let remaining_chars = new_chars.diff(&done);
        let todo_chars = CHARS_PER_BATCH - batch.chars.len();
        if remaining_chars.len() <= todo_chars {
            for ex in examples.iter() {
                batch.examples.push((*ex).clone());
                batch.chars = batch.chars.union(&ex.chars.diff(&done).inter(&new_chars));
            }
            if batch.chars.len() == CHARS_PER_BATCH {
                println!(
                    "-> all remaining examples sum up to exaclty {} chars",
                    CHARS_PER_BATCH
                );
                batches.push(batch);
                return None;
            } else if batch.examples.is_empty() {
                assert!(batch.chars.is_empty());
                println!("-> done");
                return None;
            } else {
                assert!(batch.chars.len() < CHARS_PER_BATCH);
                println!(
                    "-> with all remaining examples, cannot make a full batch, only {} chars",
                    batch.chars.len()
                );
                return Some(batch);
            }
        }
        assert!(!examples.is_empty());

        println!(
            "Trying to add exactly {} characters, using {} examples containing {} new chars",
            todo_chars,
            examples.len(),
            remaining_chars.len()
        );

        // Compute dynamic algorithm matrix with a bunch of combinations that add `todo_chars`
        let mut dyn_mat: Vec<Vec<Option<(Charset, Option<(usize, usize)>)>>> = vec![];
        for ex in examples.iter() {
            let mut dyn_row = vec![None; todo_chars + 1];
            let chars_common = ex.chars.inter(&new_chars).diff(&done);
            if chars_common.len() > MAX_NEW_CHARS_PER_EX {
                dyn_mat.push(dyn_row);
                continue;
            }
            if chars_common.len() < dyn_row.len() {
                dyn_row[chars_common.len()] = Some((chars_common.clone(), None));
            }
            for (i, dyn_prev) in dyn_mat.iter().enumerate() {
                for (j, dpr) in dyn_prev.iter().enumerate() {
                    if let Some((chars_inter, _prev)) = dpr {
                        assert_eq!(chars_inter.len(), j);
                        let new_chars_common = chars_inter.union(&chars_common);
                        let new_chars_common_len = new_chars_common.len();
                        if new_chars_common_len > chars_inter.len()
                            && new_chars_common_len <= todo_chars
                        {
                            dyn_row[new_chars_common_len] = Some((new_chars_common, Some((i, j))));
                        }
                    }
                }
            }
            dyn_mat.push(dyn_row);
        }

        // Find combination that does that with a good number of examples (tgt_len)
        let tgt_len = (avg_len * (batch_count as f32 + 1.)).ceil() as i64
            - (sum_len + batch.examples.len()) as i64;
        let dyn_mat_cnt = |i| {
            let mut cnt = 0;
            let mut i: usize = i;
            let mut j: usize = todo_chars;
            loop {
                match &dyn_mat[i][j] {
                    None => return None,
                    Some((_, ij_prev)) => {
                        cnt += 1;
                        match ij_prev {
                            Some((iprev, jprev)) => {
                                i = *iprev;
                                j = *jprev;
                            }
                            None => return Some(cnt),
                        }
                    }
                }
            }
        };
        let i_opt = (0..dyn_mat.len())
            .filter_map(|pos| dyn_mat_cnt(pos).map(|cnt| (pos, cnt)))
            .min_by_key(|(_, cnt)| {
                let x = *cnt as i64 - tgt_len;
                x * x
            });
        let i = match i_opt {
            None => {
                println!(
                    "WARNING: cannot make exactly {} chars, interrupting",
                    todo_chars
                );
                return None;
            }
            Some((pos, _)) => pos,
        };

        // Take all examples from that combination and add them to current batch
        let (mut i, mut j) = (i, todo_chars);
        loop {
            match &dyn_mat[i][j] {
                None => panic!("dyn_mat[{}][{}] == None", i, j),
                Some((chars, ij_prev)) => {
                    println!(
                        "Add {}: {}",
                        examples[i].chars.inter(&chars).to_string(),
                        examples[i].ja
                    );
                    batch.examples.push(examples[i].clone());
                    examples.remove(i);
                    batch.chars = batch.chars.union(&chars);
                    match ij_prev {
                        Some((iprev, jprev)) => {
                            assert!(*iprev < i);
                            i = *iprev;
                            j = *jprev;
                        }
                        None => break,
                    }
                }
            }
        }
        assert_eq!(batch.chars.len(), CHARS_PER_BATCH);

        println!(
            "-> batch {:03}: {} with {} examples",
            batches.len(),
            batch.chars.to_string(),
            batch.examples.len()
        );
        batch_count += 1;
        done = done.union(&batch.chars);
        sum_len += batch.examples.len();
        batches.push(batch);
    }
}

fn level_examples<'a>(
    chars: &Charset,
    avoid: &Charset,
    all_examples: &'a [Example],
) -> Vec<&'a Example> {
    println!("Calculating examples for {}", chars.to_string());

    let mut todo = chars.clone();
    let mut bad = Charset::default();
    let mut examples = vec![];

    let cost = |ex: &Example, ex_todo_inter: usize, ex_chars_inter: usize| {
        (
            -(ex.chars.inter_len(&avoid) as i32),
            ex_todo_inter,
            ex_chars_inter,
            -(ex.ja.chars().count() as i32),
            ex.chars.len() + thread_rng().gen_range(0..5),
        )
    };
    let mut all_with_inter = all_examples
        .par_iter()
        .map(|ex| (ex, ex.chars.inter_len(&chars)))
        .map(|(ex, ex_chars_inter)| (ex, ex_chars_inter, ex_chars_inter))
        .collect::<Vec<_>>();

    while !todo.is_empty() {
        let best = all_with_inter
            .par_iter()
            .enumerate()
            .filter(|(_, (_, ex_todo_inter, _))| *ex_todo_inter > 0)
            //.filter(|(_, (_, _, ex_tgt_inter))| (1..=8).contains(ex_tgt_inter))
            .max_by_key(|(_, (ex, ex_todo_inter, ex_chars_inter))| {
                cost(*ex, *ex_todo_inter, *ex_chars_inter)
            });
        if let Some((i, (ex, ex_todo_inter, _))) = best {
            let ex = *ex;
            assert_eq!(*ex_todo_inter, ex.chars.inter(&todo).len());
            examples.push(ex);
            all_with_inter.remove(i);
            todo = todo.diff(&ex.chars);
            bad = bad.union(&ex.chars.inter(&avoid));
            all_with_inter
                .par_iter_mut()
                .for_each(|(ex2, ex_todo_inter, _)| {
                    if ex2.chars.inter_len(&ex.chars) > 0 {
                        *ex_todo_inter = ex2.chars.inter_len(&todo);
                    }
                });
        } else {
            break;
        }
    }
    if !todo.is_empty() {
        println!("MISSING: NO SENTENCES FOR {}", todo.to_string());
    }
    if !bad.is_empty() {
        println!("USED BAD CHARS: {}", bad.to_string());
    }
    examples
}

fn simplify_batch(batch: &mut Batch) {
    let mut char_cnt = HashMap::<char, usize>::new();
    for ex in batch.examples.iter() {
        for ch in batch.chars.inter(&ex.chars).chars() {
            *char_cnt.entry(*ch).or_default() += 1;
        }
    }

    loop {
        let i_opt = batch.examples.iter().position(|ex| {
            batch
                .chars
                .inter(&ex.chars)
                .chars()
                .iter()
                .all(|x| char_cnt[x] >= 2)
        });
        if let Some(i) = i_opt {
            println!(
                "Removing {} [{}]",
                batch.examples[i].ja,
                batch.examples[i].chars.to_string()
            );
            batch.examples.remove(i);
        } else {
            break;
        }
    }
}

fn cleanup_batches(all_batches: &mut [Batch], kanji_levels: &[(String, Charset)]) {
    let mut chars_p1 = Charset::default();
    let mut chars_p2 = Charset::default();
    let mut done = Charset::default();

    for batch in all_batches.iter_mut() {
        let all_chars = Charset::from_iter(
            batch
                .examples
                .iter()
                .map(|x| x.chars.chars().iter().copied())
                .flatten(),
        );

        let mut levels = kanji_levels
            .iter()
            .filter(|(_, chars)| chars.inter_len(&batch.chars) > 0)
            .map(|(lvl, _)| lvl.to_string())
            .collect::<Vec<_>>();
        while levels.len() > 2 {
            levels.remove(1);
        }
        batch.level = levels.join("/");
        done = done.union(&batch.chars);
        batch.chars_bad = all_chars.diff(&done);
        batch.chars_p1 = all_chars.inter(&chars_p1);
        batch.chars_p2 = all_chars.inter(&chars_p2);

        chars_p2 = chars_p1;
        chars_p1 = batch.chars.clone();
    }
}

fn add_vocab(all_batches: &mut [Batch], vocab: &[JlptVocab]) {
    let match_level = |batch: &Batch, level: &str| {
        let n5 = batch.level.contains("N5");
        let n4 = batch.level.contains("N4");
        let n3 = batch.level.contains("N3");
        let n2 = batch.level.contains("N2");
        let n1 = batch.level.contains("N1");
        let n0 = batch.level.contains("N0");
        match level {
            "N5" => n5 || n4 || n3 || n2 || n1 || n0,
            "N4" => n4 || n3 || n2 || n1 || n0,
            "N3" => n3 || n2 || n1 || n0,
            "N2" => n2 || n1 || n0,
            "N1" => n1 || n0,
            "N0" => n0,
            _ => panic!("invalid vocab level {}", level),
        }
    };

    let mut done = Charset::default();
    let mut extra_vocab = vec![];
    for (i, batch) in all_batches.iter().enumerate() {
        let done_after = done.union(&batch.chars);

        let batch_extra_vocab = vocab
            .iter()
            .filter(|v| v.chars.inter_len(&batch.chars) > 0)
            .filter(|v| match_level(batch, &v.level))
            .filter(|v| v.chars.diff(&done_after).len() == 0)
            .filter(|v| {
                !all_batches[i..std::cmp::min(all_batches.len(), i + 10)]
                    .iter()
                    .any(|b| {
                        b.examples
                            .iter()
                            .any(|ex| ex.ja.contains(&v.kanji) || ex.expl.contains(&v.kanji))
                    })
            })
            .cloned()
            .collect::<Vec<_>>();
        extra_vocab.push(batch_extra_vocab);

        println!("---- BATCH #{:03} ----", i);
        for v in batch.extra_vocab.iter() {
            println!("{}", v.to_string());
        }

        done = done_after;
    }

    for (batch, vocab) in all_batches.iter_mut().zip(extra_vocab.into_iter()) {
        batch.extra_vocab = vocab;
    }
}

fn add_examples(all_batches: &mut [Batch], examples: &[Example]) {
    let mut chars = Charset::default();
    for (i, batch) in all_batches.iter_mut().enumerate() {
        chars = chars.union(&batch.chars);

        let candidates = examples
            .iter()
            .filter(|x| x.chars.inter_len(&batch.chars) > 0)
            .filter(|x| x.chars.diff(&chars).len() == 0)
            .filter(|x| batch.examples.iter().all(|y| y.ja != x.ja));
        let mut cand_by_chars = HashMap::new();
        for c in candidates {
            cand_by_chars.insert(c.chars.to_string(), c.clone());
        }
        let mut candidates = cand_by_chars
            .into_iter()
            .map(|(_, ex)| ex)
            .collect::<Vec<_>>();
        candidates.sort_by_key(|ex| fasthash::metro::hash64(ex.ja.as_bytes()));

        batch.extra_examples.clear();
        let mut in_batch = Charset::from_iter(batch.examples.iter().map(|x| x.chars.chars().iter().copied()).flatten());
        let mut in_extra = Charset::default();
        while batch.extra_examples.len() < 20 {
            let best = candidates.iter().enumerate()
                .max_by_key(|(_, ex)| (ex.chars.diff(&in_batch).len(), ex.chars.diff(&in_extra).len()));
            if let Some((i, ex)) = best {
                batch.extra_examples.push(ex.clone());
                in_batch = in_batch.union(&ex.chars);
                in_extra = in_extra.union(&ex.chars);
                candidates.remove(i);
            } else {
                break;
            }
        }

        batch.extra_examples.sort_by_key(|ex| fasthash::metro::hash64(ex.ja.as_bytes()));

        println!("---- BATCH #{:03} ----", i);
        for ex in batch.extra_examples.iter() {
            println!("{} - {}", ex.ja, ex.en);
        }
    }
}

// =====================================================================
//                          FORMATTING TO HTML
// =====================================================================

fn format_batch<'a>(dict_idx: &DictIndex<'a>, count: usize, (i, batch): (usize, &Batch)) {
    format_batch_aux(dict_idx, count, i, batch).expect("format batch");
}

fn format_batch_aux<'a>(
    dict_idx: &DictIndex<'a>,
    count: usize,
    i: usize,
    batch: &Batch,
) -> Result<()> {
    let mut f = io::BufWriter::new(fs::File::create(format!("public/{:03}.html", i))?);
    write!(
        f,
        r#"<!DOCTYPE html>
        <html>
            <head>
                <meta charset=\"UTF-8\" />
                <title>Batch #{:03}</title>
                <link rel="stylesheet" type="text/css" href="style.css" />
            </head>
            <body><div class="batch_page">"#,
        i
    )?;

    writeln!(f, r#"<p><a href="index.html">index</a>"#)?;
    for j in 0..count {
        if j != i {
            writeln!(f, r#" <a href="{:03}.html">{:03}</a>"#, j, j)?;
        } else {
            writeln!(f, " {:03}", j)?;
        }
    }
    writeln!(f, r#"</p>"#)?;
    writeln!(f, "<p>Level: {}</p>", batch.level)?;

    write!(f, r#"<p class="ja">"#)?;
    let mut ex_prev = Charset::default();
    for ex in batch.examples.iter() {
        let ex_chars = ex.chars.inter(&batch.chars);
        for c in ex_chars.diff(&ex_prev).chars().iter() {
            write!(
                f,
                r#"<a href="https://jisho.org/search/{}%20%23kanji">{}</a>"#,
                c, c
            )?;
        }
        ex_prev = ex_prev.union(&ex_chars);
    }
    writeln!(f, r#"</p>"#)?;

    for ex in batch.examples.iter() {
        writeln!(f, "<hr />")?;
        write!(f, r#"<p class="ja">"#)?;
        for c in ex.ja.chars() {
            if batch.chars.contains(c) {
                write!(f, r#"<span class="char_cur">{}</span>"#, c)?;
            } else if batch.chars_p1.contains(c) {
                write!(f, r#"<span class="char_p1">{}</span>"#, c)?;
            } else if batch.chars_p2.contains(c) {
                write!(f, r#"<span class="char_p2">{}</span>"#, c)?;
            } else if batch.chars_bad.contains(c) {
                write!(f, r#"<span class="char_bad">{}</span>"#, c)?;
            } else {
                write!(f, "{}", c)?;
            }
        }
        writeln!(f, "</p>")?;
        writeln!(f, r#"<p class="en">{}</p>"#, ex.en)?;

        writeln!(f, r#"<details><summary>Explanation</summary>"#)?;
        let mut expl_batch = Vec::new();
        let mut expl_all = Vec::new();
        for word in ex.expl.split(|c| c == ' ' || c == '~') {
            let (keb, reb) = expl_clean_word(word);
            let wchars = Charset::new(keb);
            if !wchars.intersects(&ex.chars) {
                continue;
            }
            if let Some(ents) = dict_idx.get(keb) {
                for ent in ents.iter() {
                    if let Some(s) = dict_str(keb, reb, ent) {
                        if wchars.intersects(&batch.chars) {
                            expl_batch.push(s);
                        } else {
                            expl_all.push(s);
                        }
                    }
                }
            }
        }
        for be in expl_batch {
            writeln!(f, r#"<p>{}</p>"#, be)?;
        }
        writeln!(f, r#"<p class="chars">"#)?;
        for c in ex.chars.inter(&batch.chars).chars().iter() {
            writeln!(
                f,
                r#"<a href="https://jisho.org/search/{}%20%23kanji">{}</a>"#,
                c, c
            )?;
        }
        writeln!(f, r#"</p>"#)?;
        for be in expl_all {
            writeln!(f, r#"<p>{}</p>"#, be)?;
        }
        writeln!(f, r#"</details>"#)?;
    }

    writeln!(f, "<hr />")?;
    format_vocab(
        &mut f,
        &batch
            .extra_vocab
            .iter()
            .filter(|v| batch.level.contains(&v.level))
            .collect::<Vec<_>>(),
        "Extra vocabulary (this level)",
    )?;
    format_vocab(
        &mut f,
        &batch
            .extra_vocab
            .iter()
            .filter(|v| !batch.level.contains(&v.level))
            .collect::<Vec<_>>(),
        "Extra vocabulary (previous levels)",
    )?;

    writeln!(
        f,
        r#"<details><summary>Extra examples (reading practice)</summary><table class="extratable">"#
    )?;
    for ex in batch.extra_examples.iter() {
        writeln!(
            f,
            r#"<tr><td style="text-align: center"><span class="tab_large font_ja">&nbsp;&nbsp;{}&nbsp;&nbsp;</span><br />{}</td></tr>"#,
            ex.ja, ex.en
        )?;
    }
    writeln!(f, r#"</table></details>"#)?;

    writeln!(f, "<hr />")?;
    writeln!(f, "<p>\(≧▽≦)/</p>")?;

    write!(f, "<div></body></html>")?;
    f.flush()?;
    Ok(())
}

fn format_vocab(f: &mut impl Write, vocab: &[&JlptVocab], t: &str) -> Result<()> {
    if !vocab.is_empty() {
        writeln!(
            f,
            r#"<details><summary>{}</summary><table class="vocabtable">"#,
            t
        )?;
        for v in vocab {
            writeln!(
                f,
                r#"<tr><td>{}</td><td>&nbsp;&nbsp;<span class="tab_large font_ja">{}</span>&nbsp;&nbsp;</td><td>{}</td><td class="font_ja">{}</td></tr>"#,
                v.level, v.kanji, v.en, v.kana
            )?;
        }
        writeln!(f, "</table></details>")?;
    }
    Ok(())
}

fn expl_clean_word(w: &str) -> (&str, Option<&str>) {
    let mut ret = w;
    for delim in ['(', '{', '['] {
        if let Some((s, _)) = ret.split_once(delim) {
            ret = s;
        }
    }
    let p = w
        .split_once('(')
        .and_then(|(_, r)| r.split_once(')'))
        .map(|(p, _)| p);
    (ret, p)
}

fn dict_str<'a>(qkeb: &str, qreb: Option<&str>, ent: &roxmltree::Node<'a, 'a>) -> Option<String> {
    let r_ele = ent.children().find(|x| x.has_tag_name("r_ele")).unwrap();
    let reb = r_ele.children().find(|x| x.has_tag_name("reb")).unwrap();
    let reb = reb.text().unwrap().trim();

    if qreb.map(|x| x != reb).unwrap_or(false) {
        return None;
    }

    let mut ret = format!("{} [{}]", qkeb, reb);

    for sense in ent.children().filter(|x| x.has_tag_name("sense")) {
        if let Some(s) = sense.children().find(|x| x.has_tag_name("gloss")) {
            ret.extend(format!(" {};", s.text().unwrap().trim()).chars());
        }
    }

    if ret.chars().rev().next() == Some(';') {
        ret.pop();
    }
    Some(ret)
}

fn format_index(batches: &[Batch], kanji_levels: &[(String, String)]) -> Result<()> {
    let mut f = io::BufWriter::new(fs::File::create("public/index.html")?);
    write!(
        f,
        r#"<!DOCTYPE html>
        <html>
            <head>
                <meta charset=\"UTF-8\" />
                <title>List of batches</title>
                <link rel="stylesheet" type="text/css" href="style.css" />
            </head>
            <body><div class="index_page">"#
    )?;

    writeln!(f, r#"<p><a href="about.html">About / How-to</a></p><hr />"#)?;

    writeln!(f, "<table>")?;
    writeln!(f, "<tr><th>Num</th><th>Level</th><th>Kanji</th><th>Examples</th><th>Lesson-1</th><th>Lesson-2</th><th>Ignore</th></tr>")?;
    for (i, batch) in batches.iter().enumerate() {
        writeln!(
            f,
            r#"<tr><td><a href="{:03}.html">{:03}</a></td><td>{}</td><td class="font_ja">{}</td><td>&nbsp;&nbsp;{}</td><td class="font_ja">{}</td><td class="font_ja">{}</td><td class="font_ja">{}</td></tr>"#,
            i,
            i,
            batch.level,
            batch.chars.to_string(),
            batch.examples.len(),
            batch.chars_p1.to_string(),
            batch.chars_p2.to_string(),
            batch.chars_bad.to_string()
        )?;
    }
    writeln!(f, r#"</table>"#)?;

    writeln!(f, "<hr />")?;

    let all_chars = Charset::from_iter(
        batches
            .iter()
            .map(|x| x.chars.chars().iter().copied())
            .flatten(),
    );
    writeln!(f, "<table>")?;
    writeln!(
        f,
        r#"<tr><th>Level</th><th>Count</th><th width="60%">Kanji</th><th>Missing kanji</th></tr>"#
    )?;
    for (lvl, chars) in kanji_levels.iter() {
        if lvl == "N0+" || lvl.ends_with("-10") {
            continue;
        }
        let chars = Charset::new(chars);
        let missing = chars.diff(&all_chars);
        writeln!(
            f,
            r#"<tr><td>{}</td><td>{}</td><td class="font_ja">{}</td><td><span class="font_ja">{}</span> ({})</td></tr>"#,
            lvl,
            chars.len(),
            chars.to_string(),
            missing.to_string(),
            missing.len()
        )?;
    }
    writeln!(f, "</table>")?;

    write!(f, "</div></body></html>")?;
    f.flush()?;
    Ok(())
}

fn format_about() -> Result<()> {
    let mut f = io::BufWriter::new(fs::File::create("public/about.html")?);
    write!(
        f,
        r#"<!DOCTYPE html>
        <html>
            <head>
                <meta charset=\"UTF-8\" />
                <title>Datagengo README</title>
                <link rel="stylesheet" type="text/css" href="style.css" />
            </head>
            <body>"#
    )?;

    writeln!(f, r#"<div class="about_page">"#)?;
    writeln!(
        f,
        r#"<p><a href="index.html">Back to lessons</a></p><hr />"#
    )?;

    writeln!(
        f,
        "{}",
        markdown::to_html(&fs::read_to_string("README.md")?)
    )?;

    writeln!(f, r#"</div></body></html>"#)?;

    Ok(())
}