Register
Login
Resources
Docs Blog Datasets Glossary Case Studies Tutorials & Webinars
Product
Data Engine LLMs Platform Enterprise
Pricing Explore
Connect to our Discord channel

build-records.js 2.2 KB

You have to be logged in to leave a comment. Sign In
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
  const domwaiter = require('domwaiter')
  const eventToPromise = require('event-to-promise')
  const chalk = require('chalk')
  const parsePageSectionsIntoRecords = require('./parse-page-sections-into-records')
  const languages = require('../../lib/languages')

  // Progress glyph written to stdout once for every page that is fetched.
  const pageMarker = chalk.green('|')
  // Progress glyph written to stdout once for every record parsed out of a page.
  const recordMarker = chalk.grey('.')
  // Port on localhost from which the pages to be indexed are fetched.
  const port = 4002
  9. module.exports = async function buildRecords (indexName, indexablePages, pageVersion, languageCode) {
  10. console.log(`\n\nBuilding records for index '${indexName}' (${languages[languageCode].name})`)
  11. const records = []
  12. const pages = indexablePages
  13. // exclude pages that are not in the current language
  14. .filter(page => page.languageCode === languageCode)
  15. // exclude pages that don't have a permalink for the current product version
  16. .filter(page => page.permalinks.some(permalink => permalink.pageVersion === pageVersion))
  17. // Find the approve permalink for the given language and GitHub product variant (dotcom v enterprise)
  18. const permalinks = pages
  19. .map(page => {
  20. return page.permalinks.find(permalink => {
  21. return permalink.languageCode === languageCode && permalink.pageVersion === pageVersion
  22. })
  23. })
  24. .map(permalink => {
  25. permalink.url = `http://localhost:${port}${permalink.href}`
  26. return permalink
  27. })
  28. console.log('indexable pages', indexablePages.length)
  29. console.log('pages in index', pages.length)
  30. console.log('permalinks in index', permalinks.length)
  31. console.log(pageMarker, 'denotes pages')
  32. console.log(recordMarker, 'denotes records derived from sections of pages')
  33. const waiter = domwaiter(permalinks, { maxConcurrent: 200, minTime: 5 })
  34. .on('page', (page) => {
  35. process.stdout.write(pageMarker)
  36. const newRecords = parsePageSectionsIntoRecords(page.href, page.$)
  37. if (!newRecords.length) {
  38. console.log(chalk.red(`\nno records found: ${page.href}`))
  39. }
  40. process.stdout.write(recordMarker.repeat(newRecords.length))
  41. records.push(...newRecords)
  42. })
  43. .on('error', (err) => {
  44. console.error(err)
  45. })
  46. return eventToPromise(waiter, 'done').then(() => {
  47. console.log('\nrecords in index: ', records.length)
  48. return records
  49. })
  50. }
Tip!

Press p or ← to see the previous file, or n or → to see the next file

Comments

Loading...