Register
Login
Resources
Docs Blog Datasets Glossary Case Studies Tutorials & Webinars
Product
Data Engine LLMs Platform Enterprise
Pricing Explore
Connect to our Discord channel

parse-page-sections-into-records.js 2.6 KB

You have to be logged in to leave a comment. Sign In
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
  1. // This module takes a cheerio page object and divides it into sections,
  2. // using H3 headings (each ending at the next H2 or H3) as delimiters. The
  3. // text that follows each heading becomes the content of the search record.
  4. const { chain } = require('lodash')
  // Origin prepended to a page's href to form each record's absolute URL.
  const urlPrefix = 'https://docs.github.com'

  // Ids of headings that must not start a search record of their own.
  const ignoredHeadingSlugs = ['in-this-article', 'further-reading']
  10. const { maxContentLength } = require('./config')
  11. module.exports = function parsePageSectionsIntoRecords (href, $) {
  12. const title = $('h1').text().trim()
  13. const breadcrumbsArray = $('nav.breadcrumbs a')
  14. .map((i, el) => {
  15. return $(el)
  16. .text()
  17. .trim()
  18. .replace(/\n/g, ' ')
  19. .replace(/\s+/g, ' ')
  20. })
  21. .get()
  22. .slice(0, -1)
  23. const breadcrumbs = breadcrumbsArray.join(' / ') || ''
  24. const metaKeywords = $('meta[name="keywords"]').attr('content')
  25. const topics = metaKeywords ? metaKeywords.split(',') : []
  26. const productName = breadcrumbsArray[0] || ''
  27. topics.push(productName)
  28. // Remove "github" to make filter queries shorter
  29. if (productName.includes('GitHub ')) {
  30. topics.push(productName.replace('GitHub ', ''))
  31. }
  32. let records
  33. const $sections = $('.article-grid-body h3')
  34. .filter('[id]')
  35. .filter((i, el) => {
  36. return !ignoredHeadingSlugs.includes($(el).attr('id'))
  37. })
  38. if ($sections.length > 0) {
  39. records = $sections
  40. .map((i, el) => {
  41. const heading = $(el).text().trim()
  42. const slug = $(el).attr('id')
  43. const objectID = [href, slug].join('#')
  44. const url = [urlPrefix, objectID].join('')
  45. const content = $(el)
  46. // Platform-specific content is nested in a DIV
  47. // GraphQL content in nested in two DIVS
  48. .nextUntil('h2, h3, div > h2, div > h3, div > div > h2, div > div > h3')
  49. .map((i, el) => $(el).text())
  50. .get()
  51. .join(' ')
  52. .trim()
  53. .slice(0, maxContentLength)
  54. return {
  55. objectID,
  56. url,
  57. slug,
  58. breadcrumbs,
  59. heading,
  60. title,
  61. content,
  62. topics
  63. }
  64. })
  65. .get()
  66. } else {
  67. // There are no sections. Treat the entire article as the record.
  68. const objectID = href
  69. const url = [urlPrefix, objectID].join('')
  70. const content = $('.article-grid-body p, .article-grid-body ul, .article-grid-body ol, .article-grid-body table')
  71. .map((i, el) => $(el).text())
  72. .get()
  73. .join(' ')
  74. .trim()
  75. .slice(0, maxContentLength)
  76. records = [{
  77. objectID,
  78. url,
  79. breadcrumbs,
  80. title,
  81. content,
  82. topics
  83. }]
  84. }
  85. return chain(records)
  86. .uniqBy('objectID')
  87. .value()
  88. }
Tip!

Press p or ← to see the previous file, or n or → to see the next file

Comments

Loading...