Register
Login
Resources
Docs Blog Datasets Glossary Case Studies Tutorials & Webinars
Product
Data Engine LLMs Platform Enterprise
Pricing Explore
Connect to our Discord channel

NerDLPipeline.scala 1.9 KB

You have to be logged in to leave a comment. Sign In
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
  1. import com.johnsnowlabs.nlp.annotator._
  2. import com.johnsnowlabs.nlp.annotators.ner.NerConverter
  3. import com.johnsnowlabs.nlp.base._
  4. import com.johnsnowlabs.util.Benchmark
  5. import org.apache.spark.ml.Pipeline
  6. import org.apache.spark.sql.SparkSession
  /** Example Spark NLP pipeline that tags named entities with a pretrained NerDL model.
    *
    * Stages, in order: DocumentAssembler -> Tokenizer -> Normalizer ->
    * WordEmbeddingsModel -> NerDLModel -> NerConverter -> Finisher.
    * The pipeline is fitted on an empty DataFrame, applied to two sample
    * sentences, and the "ner" and "ner_converter" columns are printed inside
    * a Benchmark timer.
    */
  object NerDLPipeline extends App {

    // Local Spark session using every available core, with Kryo serialization.
    val spark: SparkSession = SparkSession
      .builder()
      .appName("test")
      .master("local[*]")
      .config("spark.driver.memory", "12G")
      .config("spark.kryoserializer.buffer.max", "200M")
      .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
      .getOrCreate()

    import spark.implicits._

    spark.sparkContext.setLogLevel("WARN")

    // Turns the raw "text" column into document annotations.
    val document = new DocumentAssembler()
      .setInputCol("text")
      .setOutputCol("document")

    val token = new Tokenizer()
      .setInputCols("document")
      .setOutputCol("token")

    val normalizer = new Normalizer()
      .setInputCols("token")
      .setOutputCol("normal")

    // Embeddings are computed over the normalized tokens so that they line up
    // with the token column the NER stage and the converter read below.
    val wordEmbeddings = WordEmbeddingsModel.pretrained()
      .setInputCols("document", "normal")
      .setOutputCol("word_embeddings")

    // NerDLModel consumes document, token and word-embedding annotations; the
    // embeddings column has to be listed here or the stage is mis-wired.
    val ner = NerDLModel.pretrained()
      .setInputCols("document", "normal", "word_embeddings")
      .setOutputCol("ner")

    val nerConverter = new NerConverter()
      .setInputCols("document", "normal", "ner")
      .setOutputCol("ner_converter")

    // Annotation columns are kept (cleanAnnotations = false) so that "ner" and
    // "ner_converter" can still be selected after the Finisher has run.
    val finisher = new Finisher()
      .setInputCols("ner", "ner_converter")
      .setIncludeMetadata(true)
      .setOutputAsArray(false)
      .setCleanAnnotations(false)
      .setAnnotationSplitSymbol("@")
      .setValueSplitSymbol("#")

    val pipeline = new Pipeline().setStages(
      Array(document, token, normalizer, wordEmbeddings, ner, nerConverter, finisher)
    )

    val testing = Seq(
      (1, "Google is a famous company"),
      (2, "Peter Parker is a super heroe")
    ).toDS.toDF("_id", "text")

    // Fitted on an empty "text" DataFrame: the model stages are loaded through
    // pretrained(), so presumably no training rows are needed (confirm if a
    // trainable stage is ever added).
    val result = pipeline.fit(Seq.empty[String].toDS.toDF("text")).transform(testing)

    Benchmark.time("Time to convert and show") {
      result.select("ner", "ner_converter").show(truncate = false)
    }
  }
Tip!

Press p to see the previous file, or n to see the next file.

Comments

Loading...