Browse Source

Refactor JSON cleaning

Michael Ekstrand 2 years ago
parent
commit
b75c458a60
6 changed files with 100 additions and 13 deletions
  1. 4
    2
      src/bin/clean-openlib.rs
  2. 2
    2
      src/bin/parse-marc.rs
  3. 79
    0
      src/cleaning/json.rs
  4. 5
    0
      src/cleaning/mod.rs
  5. 9
    8
      src/pgutils.rs
  6. 1
    1
      src/lib.rs

+ 4
- 2
src/bin/clean-openlib.rs

@@ -12,7 +12,7 @@ use std::path::PathBuf;
 use flate2::bufread::MultiGzDecoder;
 use indicatif::{ProgressBar, ProgressStyle};
 
-use bookdata::pgutils::write_encoded;
+use bookdata::cleaning::{write_pgencoded, clean_json};
 use bookdata::tsv::split_first;
 
 #[derive(StructOpt, Debug)]
@@ -23,15 +23,17 @@ struct Opt {
 }
 
 fn process<R: BufRead, W: Write>(src: &mut R, dst: &mut W) -> io::Result<()> {
+  let mut jsbuf = String::new();
   for line in src.lines() {
     let ls = line?;
     let (_ty, rest) = split_first(&ls).expect("bad line");
     let (key, rest) = split_first(rest).expect("bad line");
     let (_ver, rest) = split_first(rest).expect("bad line");
     let (_stamp, json) = split_first(rest).expect("bad line");
+    clean_json(json, &mut jsbuf);
     dst.write_all(key.as_bytes())?;
     dst.write_all(b"\t")?;
-    write_encoded(dst, json.as_bytes())?;
+    write_pgencoded(dst, jsbuf.as_bytes())?;
     dst.write_all(b"\n")?;
   }
 

+ 2
- 2
src/bin/parse-marc.rs

@@ -15,7 +15,7 @@ use quick_xml::events::Event;
 use flate2::bufread::MultiGzDecoder;
 use indicatif::{ProgressBar, ProgressStyle};
 
-use bookdata::pgutils::write_encoded;
+use bookdata::cleaning::write_pgencoded;
 use bookdata::tsv::split_first;
 
 #[derive(StructOpt, Debug)]
@@ -152,7 +152,7 @@ fn process_record<B: BufRead, W: Write>(rdr: &mut Reader<B>, out: &mut W, lno: &
       Ok(Event::Text(e)) => {
         if output {
           let t = e.unescaped().expect("decode error");
-          write_encoded(out, &t).expect("output error")
+          write_pgencoded(out, &t).expect("output error")
         }
       },
       Ok(Event::Eof) => break,

+ 79
- 0
src/cleaning/json.rs

@@ -0,0 +1,79 @@
+/// Strips every literal `\u0000` escape sequence from `json` and stores the result in `buf`; some of our JSON objects contain it, and PostgreSQL doesn't like it.
+/// `buf` is cleared before anything is written, so after the call it holds only the cleaned text.
+/// Passing the same buffer on every call makes it reusable across records, which reduces allocations during import.
+/// 
+/// ```
+/// use bookdata::cleaning::clean_json;
+/// let mut buf = String::new();
+/// clean_json("some bad text\\u0000 continued", &mut buf);
+/// assert_eq!(buf, "some bad text continued");
+/// ```
+pub fn clean_json(json: &str, buf: &mut String) {
+  let bad = "\\u0000"; // six literal characters (backslash, 'u', four zeros), not a NUL byte
+  buf.clear(); // drop whatever a previous call left in the buffer
+  let mut cur = json; // the part of the input that has not been scanned yet
+  loop { // each pass handles one occurrence of `bad`, or the final remainder
+    match cur.find(bad) { // NOTE(review): plain substring search, so it also matches when the backslash is itself escaped in the JSON text; confirm that is acceptable
+      None => { // nothing left to remove: keep the rest verbatim and stop
+        buf.push_str(cur);
+        break
+      },
+      Some(i) => { // keep the text before the match, then resume right after it
+        buf.push_str(&cur[0..i]);
+        cur = &cur[(i + bad.len())..]
+      }
+    }
+  }
+}
+
+#[test]
+fn clean_empty_is_empty() { // empty input must leave the buffer empty
+  let mut buf = String::new();
+  clean_json("", &mut buf);
+  assert_eq!(buf, "");
+}
+
+#[test]
+fn clean_only_is_empty() { // input made up solely of the escape sequence cleans to nothing
+  let mut buf = String::new();
+  clean_json("\\u0000", &mut buf);
+  assert_eq!(buf, "");
+}
+
+#[test]
+fn clean_trail() { // escape sequence at the very end of the input is removed
+  let mut buf = String::new();
+  clean_json("bob\\u0000", &mut buf);
+  assert_eq!(buf, "bob");
+}
+
+#[test]
+fn clean_lead() { // escape sequence at the very start of the input is removed
+  let mut buf = String::new();
+  clean_json("\\u0000wombat", &mut buf);
+  assert_eq!(buf, "wombat");
+}
+
+#[test]
+fn clean_middle() { // text on both sides of a removed escape sequence is joined directly
+  let mut buf = String::new();
+  clean_json("heffalump\\u0000wumpus", &mut buf);
+  assert_eq!(buf, "heffalumpwumpus");
+}
+
+#[test]
+fn clean_multi() { // every occurrence is removed, not just the first
+  let mut buf = String::new();
+  clean_json("bob\\u0000dylan\\u0000fish", &mut buf);
+  assert_eq!(buf, "bobdylanfish");
+}
+
+
+#[test]
+fn clean_reuse_buffer() { // a second call on the same buffer must not keep output from the first
+  let mut buf = String::new();
+  clean_json("bob\\u0000dylan\\u0000fish", &mut buf);
+  assert_eq!(buf, "bobdylanfish");
+  clean_json("pizza fish", &mut buf); // input without the escape sequence passes through unchanged
+  assert_eq!(buf, "pizza fish");
+}

+ 5
- 0
src/cleaning/mod.rs

@@ -0,0 +1,5 @@
+mod pg;
+mod json;
+
+pub use self::pg::write_pgencoded;
+pub use self::json::clean_json;

+ 9
- 8
src/pgutils.rs

@@ -1,7 +1,8 @@
 use std::io::{self, Write};
 use std::str;
 
-pub fn write_encoded<W: Write>(w: &mut W, buf: &[u8]) -> io::Result<()> {
+/// Write text with PostgreSQL text format encoding.
+pub fn write_pgencoded<W: Write>(w: &mut W, buf: &[u8]) -> io::Result<()> {
   let mut start = 0;
   for i in 0..buf.len() {
     match buf[i] {
@@ -36,7 +37,7 @@ pub fn write_encoded<W: Write>(w: &mut W, buf: &[u8]) -> io::Result<()> {
 #[test]
 fn it_writes_empty() {
   let mut vec = Vec::new();
-  write_encoded(&mut vec, b"").unwrap();
+  write_pgencoded(&mut vec, b"").unwrap();
 
   assert_eq!(vec.len(), 0);
 }
@@ -44,7 +45,7 @@ fn it_writes_empty() {
 #[test]
 fn it_writes_str() {
   let mut vec = Vec::new();
-  write_encoded(&mut vec, b"foo").unwrap();
+  write_pgencoded(&mut vec, b"foo").unwrap();
 
   assert_eq!(str::from_utf8(&vec).unwrap(), "foo");
 }
@@ -52,7 +53,7 @@ fn it_writes_str() {
 #[test]
 fn encode_backslash() {
   let mut vec = Vec::new();
-  write_encoded(&mut vec, b"\\").unwrap();
+  write_pgencoded(&mut vec, b"\\").unwrap();
 
   assert_eq!(str::from_utf8(&vec).unwrap(), "\\\\");
 }
@@ -60,7 +61,7 @@ fn encode_backslash() {
 #[test]
 fn encode_tab() {
   let mut vec = Vec::new();
-  write_encoded(&mut vec, b"\t").unwrap();
+  write_pgencoded(&mut vec, b"\t").unwrap();
 
   assert_eq!(str::from_utf8(&vec).unwrap(), "\\t");
 }
@@ -68,7 +69,7 @@ fn encode_tab() {
 #[test]
 fn encode_nl() {
   let mut vec = Vec::new();
-  write_encoded(&mut vec, b"\n").unwrap();
+  write_pgencoded(&mut vec, b"\n").unwrap();
 
   assert_eq!(str::from_utf8(&vec).unwrap(), "\\n");
 }
@@ -76,7 +77,7 @@ fn encode_nl() {
 #[test]
 fn skip_cr() {
   let mut vec = Vec::new();
-  write_encoded(&mut vec, b"\r").unwrap();
+  write_pgencoded(&mut vec, b"\r").unwrap();
 
   assert_eq!(str::from_utf8(&vec).unwrap(), "");
 }
@@ -84,7 +85,7 @@ fn skip_cr() {
 #[test]
 fn embedded() {
   let mut vec = Vec::new();
-  write_encoded(&mut vec, b"foo\nbar\\wombat").unwrap();
+  write_pgencoded(&mut vec, b"foo\nbar\\wombat").unwrap();
 
   assert_eq!(str::from_utf8(&vec).unwrap(), "foo\\nbar\\\\wombat");
 }

+ 1
- 1
src/lib.rs

@@ -1,2 +1,2 @@
-pub mod pgutils;
+pub mod cleaning;
 pub mod tsv;