Register
Login
Resources
Docs Blog Datasets Glossary Case Studies Tutorials & Webinars
Product
Data Engine LLMs Platform Enterprise
Pricing Explore
Connect to our Discord channel

clean-openlib.rs 1.6 KB

You have to be logged in to leave a comment. Sign In
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
  1. extern crate structopt;
  2. extern crate flate2;
  3. extern crate indicatif;
  4. extern crate bookdata;
  5. use std::io::prelude::*;
  6. use std::io::{self, BufReader};
  7. use structopt::StructOpt;
  8. use std::fs::File;
  9. use std::path::PathBuf;
  10. use flate2::bufread::MultiGzDecoder;
  11. use indicatif::{ProgressBar, ProgressStyle};
  12. use bookdata::cleaning::{write_pgencoded, clean_json};
  13. use bookdata::tsv::split_first;
  14. #[derive(StructOpt, Debug)]
  15. #[structopt(name="clean-openlib")]
  16. struct Opt {
  17. #[structopt(name = "INPUT", parse(from_os_str))]
  18. infile: PathBuf
  19. }
  20. fn process<R: BufRead, W: Write>(src: &mut R, dst: &mut W) -> io::Result<()> {
  21. let mut jsbuf = String::new();
  22. for line in src.lines() {
  23. let ls = line?;
  24. let (_ty, rest) = split_first(&ls).expect("bad line");
  25. let (key, rest) = split_first(rest).expect("bad line");
  26. let (_ver, rest) = split_first(rest).expect("bad line");
  27. let (_stamp, json) = split_first(rest).expect("bad line");
  28. clean_json(json, &mut jsbuf);
  29. dst.write_all(key.as_bytes())?;
  30. dst.write_all(b"\t")?;
  31. write_pgencoded(dst, jsbuf.as_bytes())?;
  32. dst.write_all(b"\n")?;
  33. }
  34. Ok(())
  35. }
  36. fn main() -> io::Result<()> {
  37. let opt = Opt::from_args();
  38. let stdout = io::stdout();
  39. let mut out = stdout.lock();
  40. let fs = File::open(opt.infile)?;
  41. let pb = ProgressBar::new(fs.metadata()?.len());
  42. pb.set_style(ProgressStyle::default_bar().template("{elapsed_precise} {bar} {percent}% {bytes}/{total_bytes} (eta: {eta})"));
  43. let pbr = pb.wrap_read(fs);
  44. let pbr = BufReader::new(pbr);
  45. let gzf = MultiGzDecoder::new(pbr);
  46. let mut bfs = BufReader::new(gzf);
  47. process(&mut bfs, &mut out)
  48. }
Tip!

Press p or the left arrow key to see the previous file, or n or the right arrow key to see the next file

Comments

Loading...