Register
Login
Resources
Docs Blog Datasets Glossary Case Studies Tutorials & Webinars
Product
Data Engine LLMs Platform Enterprise
Pricing Explore
Connect to our Discord channel

clean-json.rs 2.2 KB

You have to be logged in to leave a comment. Sign In
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
  1. extern crate structopt;
  2. extern crate flate2;
  3. extern crate indicatif;
  4. extern crate bookdata;
  5. use std::io::prelude::*;
  6. use std::io::{self, BufReader};
  7. use structopt::StructOpt;
  8. use std::fs::File;
  9. use std::path::PathBuf;
  10. use flate2::bufread::MultiGzDecoder;
  11. use indicatif::{ProgressBar, ProgressStyle};
  12. use bookdata::cleaning::{write_pgencoded, clean_json};
  13. use bookdata::tsv::split_first;
  14. /// Process OpenLib data into format suitable for PostgreSQL import.
  15. #[derive(StructOpt, Debug)]
  16. #[structopt(name="clean-openlib")]
  17. struct Opt {
  18. /// Parse OpenLib files instead of straight JSON lines
  19. #[structopt(long="openlib")]
  20. openlib: bool,
  21. /// Input file
  22. #[structopt(name = "INPUT", parse(from_os_str))]
  23. infile: PathBuf
  24. }
  25. fn process_openlib<R: BufRead, W: Write>(src: &mut R, dst: &mut W) -> io::Result<()> {
  26. let mut jsbuf = String::new();
  27. for line in src.lines() {
  28. let ls = line?;
  29. let (_ty, rest) = split_first(&ls).expect("bad line");
  30. let (key, rest) = split_first(rest).expect("bad line");
  31. let (_ver, rest) = split_first(rest).expect("bad line");
  32. let (_stamp, json) = split_first(rest).expect("bad line");
  33. clean_json(json, &mut jsbuf);
  34. dst.write_all(key.as_bytes())?;
  35. dst.write_all(b"\t")?;
  36. write_pgencoded(dst, jsbuf.as_bytes())?;
  37. dst.write_all(b"\n")?;
  38. }
  39. Ok(())
  40. }
  41. fn process_raw<R: BufRead, W: Write>(src: &mut R, dst: &mut W) -> io::Result<()> {
  42. let mut jsbuf = String::new();
  43. for line in src.lines() {
  44. let json = line?;
  45. clean_json(&json, &mut jsbuf);
  46. write_pgencoded(dst, jsbuf.as_bytes())?;
  47. dst.write_all(b"\n")?;
  48. }
  49. Ok(())
  50. }
  51. fn main() -> io::Result<()> {
  52. let opt = Opt::from_args();
  53. let stdout = io::stdout();
  54. let mut out = stdout.lock();
  55. let fs = File::open(opt.infile)?;
  56. let pb = ProgressBar::new(fs.metadata()?.len());
  57. pb.set_style(ProgressStyle::default_bar().template("{elapsed_precise} {bar} {percent}% {bytes}/{total_bytes} (eta: {eta})"));
  58. let pbr = pb.wrap_read(fs);
  59. let pbr = BufReader::new(pbr);
  60. let gzf = MultiGzDecoder::new(pbr);
  61. let mut bfs = BufReader::new(gzf);
  62. if opt.openlib {
  63. process_openlib(&mut bfs, &mut out)
  64. } else {
  65. process_raw(&mut bfs, &mut out)
  66. }
  67. }
Tip!

Press p to see the previous file, or n to see the next file

Comments

Loading...