aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Cargo.lock7
-rw-r--r--Cargo.toml2
-rw-r--r--src/datafiles.rs42
-rw-r--r--src/example.rs24
-rw-r--r--src/format.rs28
-rw-r--r--src/main.rs4
-rw-r--r--src/server.rs39
-rw-r--r--static/script.js6
8 files changed, 84 insertions, 68 deletions
diff --git a/Cargo.lock b/Cargo.lock
index df12506..613b503 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -480,6 +480,12 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7ff69b9dd49fd426c69a0db9fc04dd934cdb6645ff000864d98f7e2af8830eaa"
[[package]]
+name = "cap"
+version = "0.1.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6f125eb85b84a24c36b02ed1d22c9dd8632f53b3cde6e4d23512f94021030003"
+
+[[package]]
name = "cc"
version = "1.0.90"
source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -674,6 +680,7 @@ dependencies = [
"anyhow",
"async-channel 2.2.0",
"async-std",
+ "cap",
"fasthash",
"futures",
"http-types",
diff --git a/Cargo.toml b/Cargo.toml
index 11b7e6a..db5359f 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -22,3 +22,5 @@ futures = "0.3"
async-std = { version = "1.6.0", features = ["attributes"] }
http-types = "2.12"
async-channel = "2.0"
+
+cap = "0.1.2"
diff --git a/src/datafiles.rs b/src/datafiles.rs
index 7a6a5d5..fc6194f 100644
--- a/src/datafiles.rs
+++ b/src/datafiles.rs
@@ -1,6 +1,7 @@
use std::collections::HashMap;
use std::fs;
use std::io::{self, BufRead};
+use std::sync::Arc;
use anyhow::Result;
use serde::{Deserialize, Serialize};
@@ -22,21 +23,46 @@ pub struct Example {
// PARSING DATA FILES
// =====================================================================
-pub type DictEntry<'a> = roxmltree::Node<'a, 'a>;
-pub type DictIndex<'a> = HashMap<&'a str, Vec<DictEntry<'a>>>;
-pub fn index_jmdict<'a>(dict: &'a roxmltree::Document) -> DictIndex<'a> {
+pub struct DictEntry {
+ pub reb: String,
+ pub ent_seq: String,
+ pub sense: Box<[String]>,
+}
+
+pub type DictIndex = HashMap<String, Vec<Arc<DictEntry>>>;
+pub fn index_jmdict(dict: &roxmltree::Document) -> DictIndex {
let dict = dict
.root()
.children()
.find(|x| x.has_tag_name("JMdict"))
.unwrap();
- let mut ret: DictIndex<'a> = HashMap::new();
- for x in dict.children().filter(|x| x.has_tag_name("entry")) {
- for r in x.children().filter(|x| x.has_tag_name("k_ele")) {
+ let mut ret: DictIndex = HashMap::new();
+ for ent in dict.children().filter(|x| x.has_tag_name("entry")) {
+ let r_ele = ent.children().find(|x| x.has_tag_name("r_ele")).unwrap();
+ let reb = r_ele.children().find(|x| x.has_tag_name("reb")).unwrap();
+ let reb = reb.text().unwrap().trim().to_string();
+
+ let ent_seq = ent.children().find(|x| x.has_tag_name("ent_seq")).unwrap();
+ let ent_seq = ent_seq.text().unwrap().trim().to_string();
+
+ let sense = ent
+ .children()
+ .filter(|x| x.has_tag_name("sense"))
+ .filter_map(|sense| sense.children().find(|x| x.has_tag_name("gloss")))
+ .map(|gloss| gloss.text().unwrap().trim().to_string())
+ .collect::<Vec<_>>()
+ .into_boxed_slice();
+ let parsed_ent = Arc::new(DictEntry {
+ reb,
+ ent_seq,
+ sense,
+ });
+
+ for r in ent.children().filter(|x| x.has_tag_name("k_ele")) {
if let Some(keb) = r.children().find(|x| x.has_tag_name("keb")) {
- let txt = keb.text().unwrap().trim();
- ret.entry(txt).or_default().push(x);
+ let txt = keb.text().unwrap().trim().to_string();
+ ret.entry(txt).or_default().push(parsed_ent.clone());
}
}
}
diff --git a/src/example.rs b/src/example.rs
index 7d20a28..c52cc8f 100644
--- a/src/example.rs
+++ b/src/example.rs
@@ -4,11 +4,7 @@ use crate::charset::Charset;
use crate::*;
impl Example {
- pub fn gen_furigana<'a>(
- &mut self,
- dict_idx: &DictIndex<'a>,
- overrides: &HashMap<String, String>,
- ) {
+ pub fn gen_furigana(&mut self, dict_idx: &DictIndex, overrides: &HashMap<String, String>) {
use std::fmt::Write;
if let Some(v) = overrides.get(&self.ja) {
@@ -51,13 +47,11 @@ impl Example {
let reb = match reb {
Some(reb) if reb.starts_with('#') => {
let ents = dict_idx.get(keb).map(|x| &x[..]).unwrap_or_default();
- if let Some(ent) = ents.iter().find(|ent| {
- let ent_seq = ent.children().find(|x| x.has_tag_name("ent_seq")).unwrap();
- ent_seq.text().unwrap().trim() == reb.strip_prefix('#').unwrap()
- }) {
- let r_ele = ent.children().find(|x| x.has_tag_name("r_ele")).unwrap();
- let reb = r_ele.children().find(|x| x.has_tag_name("reb")).unwrap();
- reb.text().unwrap().trim()
+ if let Some(ent) = ents
+ .iter()
+ .find(|ent| ent.ent_seq == reb.strip_prefix('#').unwrap())
+ {
+ ent.reb.as_str()
} else {
println!("- entry id not found: {}", reb);
ret += &word;
@@ -69,11 +63,7 @@ impl Example {
let ents = dict_idx.get(keb).map(|x| &x[..]).unwrap_or_default();
let matches = ents
.iter()
- .map(|ent| {
- let r_ele = ent.children().find(|x| x.has_tag_name("r_ele")).unwrap();
- let reb = r_ele.children().find(|x| x.has_tag_name("reb")).unwrap();
- reb.text().unwrap().trim()
- })
+ .map(|ent| ent.reb.as_str())
.collect::<HashSet<_>>();
if matches.len() == 1 {
*matches.iter().next().unwrap()
diff --git a/src/format.rs b/src/format.rs
index b9da487..3f10f8f 100644
--- a/src/format.rs
+++ b/src/format.rs
@@ -11,16 +11,16 @@ use crate::*;
// FORMATTING TO HTML
// =====================================================================
-pub fn format_batch<'a>(dict_idx: &DictIndex<'a>, count: usize, (i, batch): (usize, &Batch)) {
+pub fn format_batch(dict_idx: &DictIndex, count: usize, (i, batch): (usize, &Batch)) {
let mut f = io::BufWriter::new(
fs::File::create(format!("public/{:03}.html", i)).expect("create batch file"),
);
format_batch_to(&mut f, dict_idx, count, i, batch).expect("format batch");
}
-pub fn format_batch_to<'a>(
+pub fn format_batch_to(
buf: &mut impl Write,
- dict_idx: &DictIndex<'a>,
+ dict_idx: &DictIndex,
count: usize,
i: usize,
batch: &Batch,
@@ -200,32 +200,22 @@ fn format_vocab(buf: &mut impl Write, vocab: &[&JlptVocab], t: &str) -> Result<(
Ok(())
}
-fn dict_str_short<'a>(
- qkeb: &str,
- qreb: Option<&str>,
- ent: &roxmltree::Node<'a, 'a>,
-) -> Option<String> {
- let r_ele = ent.children().find(|x| x.has_tag_name("r_ele")).unwrap();
- let reb = r_ele.children().find(|x| x.has_tag_name("reb")).unwrap();
- let reb = reb.text().unwrap().trim();
-
- if qreb.map(|x| x != reb).unwrap_or(false) {
+fn dict_str_short<'a>(qkeb: &str, qreb: Option<&str>, ent: &DictEntry) -> Option<String> {
+ if qreb.map(|x| x != ent.reb).unwrap_or(false) {
return None;
}
Some(format!(
r#"<span class="font_ja">{} 【{}】</span>"#,
- qkeb, reb
+ qkeb, ent.reb
))
}
-fn dict_str<'a>(qkeb: &str, qreb: Option<&str>, ent: &roxmltree::Node<'a, 'a>) -> Option<String> {
+fn dict_str<'a>(qkeb: &str, qreb: Option<&str>, ent: &DictEntry) -> Option<String> {
let mut ret = dict_str_short(qkeb, qreb, ent)?;
- for sense in ent.children().filter(|x| x.has_tag_name("sense")) {
- if let Some(s) = sense.children().find(|x| x.has_tag_name("gloss")) {
- ret.extend(format!(" {};", s.text().unwrap().trim()).chars());
- }
+ for sense in ent.sense.iter() {
+ ret += &format!(" {};", sense);
}
if ret.chars().rev().next() == Some(';') {
diff --git a/src/main.rs b/src/main.rs
index 1ad5e77..b59669d 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -17,6 +17,10 @@ use charset::Charset;
use datafiles::*;
use format::*;
+#[global_allocator]
+static ALLOCATOR: cap::Cap<std::alloc::System> =
+ cap::Cap::new(std::alloc::System, usize::max_value());
+
#[derive(Debug, StructOpt)]
#[structopt(name = "datagengo", about = "Japanese example practice maker")]
struct Opt {
diff --git a/src/server.rs b/src/server.rs
index 5e5d61b..14368e0 100644
--- a/src/server.rs
+++ b/src/server.rs
@@ -36,13 +36,14 @@ pub async fn server_main() -> tide::Result<()> {
eprintln!("Loading furigana overrides...");
let furigana_overrides = read_furigana_overrides().expect("read_furigana_overrides");
+ eprintln!("RAM: {}", ALLOCATOR.allocated() / 1024);
eprintln!("Loading JMdict_e.xml...");
let jmdict_raw = fs::read_to_string("data/JMdict_e.xml").expect("read_jmdict");
- let jmdict_raw: &'static str = String::leak(jmdict_raw);
+ eprintln!("RAM: {}", ALLOCATOR.allocated() / 1024);
eprintln!("Parsing JMdict_e.xml...");
- let jmdict = roxmltree::Document::parse_with_options(
+ let jmdict_xml = roxmltree::Document::parse_with_options(
&jmdict_raw,
roxmltree::ParsingOptions {
allow_dtd: true,
@@ -50,10 +51,14 @@ pub async fn server_main() -> tide::Result<()> {
},
)
.expect("parse_jmdict");
- let jmdict_xml = Box::leak(Box::new(jmdict));
+ eprintln!("RAM: {}", ALLOCATOR.allocated() / 1024);
eprintln!("Indexing JMdict_e.xml...");
- let jmdict_idx = index_jmdict(jmdict_xml);
+ let jmdict_idx = index_jmdict(&jmdict_xml);
+ eprintln!("RAM: {}", ALLOCATOR.allocated() / 1024);
+ drop(jmdict_xml);
+ drop(jmdict_raw);
+ eprintln!("RAM: {}", ALLOCATOR.allocated() / 1024);
eprintln!("Loading batches.json...");
let batches = read_batches().expect("read/parse");
@@ -66,8 +71,6 @@ pub async fn server_main() -> tide::Result<()> {
// ---- setup http server ----
let state: State = Box::leak(Box::new(StateStruct {
- jmdict_raw,
- jmdict_xml,
jmdict_idx,
batches,
index,
@@ -99,9 +102,7 @@ pub async fn server_main() -> tide::Result<()> {
type State = &'static StateStruct;
#[allow(dead_code)]
struct StateStruct {
- jmdict_raw: &'static str,
- jmdict_xml: &'static roxmltree::Document<'static>,
- jmdict_idx: DictIndex<'static>,
+ jmdict_idx: DictIndex,
batches: &'static [Batch],
index: &'static str,
examples: &'static [Example],
@@ -233,13 +234,7 @@ async fn gen_examples_page(mut req: Request<State>) -> tide::Result {
}
if let Some(ents) = state.jmdict_idx.get(keb) {
for ent in ents.iter() {
- let ent_r_ele = ent.children().find(|x| x.has_tag_name("r_ele")).unwrap();
- let ent_reb = ent_r_ele
- .children()
- .find(|x| x.has_tag_name("reb"))
- .unwrap();
- let ent_reb = ent_reb.text().unwrap().trim();
- if reb.map(|x| x != ent_reb).unwrap_or(false) {
+ if reb.map(|x| x != ent.reb).unwrap_or(false) {
continue;
}
expl += &format!(
@@ -247,17 +242,15 @@ async fn gen_examples_page(mut req: Request<State>) -> tide::Result {
keb
);
- for sense in ent.children().filter(|x| x.has_tag_name("sense")) {
- if let Some(s) = sense.children().find(|x| x.has_tag_name("gloss")) {
- if !expl.ends_with('>') {
- expl += "; ";
- }
- expl += s.text().unwrap().trim();
+ for sense in ent.sense.iter() {
+ if !expl.ends_with('>') {
+ expl += "; ";
}
+ expl += sense;
}
expl += &format!(
r#"</td><td style="word-break: keep-all" class="tab_large font_ja">{}</td></tr>"#,
- ent_reb
+ ent.reb
);
}
}
diff --git a/static/script.js b/static/script.js
index 639e743..10db48c 100644
--- a/static/script.js
+++ b/static/script.js
@@ -37,6 +37,10 @@ function display_example(i) {
}
}
+function scrollTop() {
+ window.scrollTo(0, 0);
+}
+
function spacebar() {
if (revealed) {
console.log("Next item");
@@ -46,5 +50,5 @@ function spacebar() {
$("#gen_section").removeClass("gen_hidden");
revealed = true;
}
- window.scrollTo(0, 0);
+ window.setTimeout(scrollTop, 1);
}