1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
|
#!/usr/bin/python3
# Copyright 2019 Gentoo Authors
# Distributed under the terms of the GNU GPL version 2 or later
import json
import os.path
import sys
import xml.etree.ElementTree as ET
def stringify_node(parent: ET.Element) -> str:
    """Flatten this node and its immediate children to a string.

    Combine the text of this node with the text and tail of each of its
    immediate children, if there are any, into a flat string.  Deeper
    descendants are not visited.  The tag <d/> is a special case that
    resolves to the dash ('-') character.

    Keyword arguments:
    parent -- the node to convert to a string; None (for example the
              result of a find() that matched nothing) yields ''

    Returns the flattened text with leading and trailing whitespace
    removed and every newline replaced by a single space.
    """
    # Compare against None explicitly: an Element without children is
    # falsy, so a plain truth test would also discard valid leaf nodes.
    if parent is None:
        return ''

    # We usually have something like:
    #   <p>\nText
    # Left strip the whitespace.
    pieces = [parent.text.lstrip()] if parent.text else []

    # For each child, strip the tags and collect its text along with
    # the tail text following it.
    # The tail may include '\n' if it spans multiple lines.
    # We will worry about those on return, not now.
    for child in parent:
        # The '<d/>' tag is simply a fancier '-' character
        if child.tag == 'd':
            pieces.append('-')
        if child.text:
            pieces.append(child.text)
        if child.tail:
            pieces.append(child.tail)

    # A paragraph typically ends with:
    #   Text\n</p>
    # Right strip any spurious whitespace.
    # Finally, get rid of any intermediate newlines.
    return ''.join(pieces).rstrip().replace('\n', ' ')
def _title_text(node: ET.Element) -> str:
    """Return the flattened <title> of node, or '' when it has none."""
    title_node = node.find('title')
    # find() returns None when there is no <title> child; test with
    # 'is None' because a childless Element is itself falsy.
    if title_node is None:
        return ''
    return stringify_node(title_node)


def process_node(documents: list, node: ET.Element, name: str, url: str) -> None:
    """Recursively process a given node and its children based on tag values.

    For the top level node <chapter>, extract the title and recurse
    down to the children.

    For the intermediary nodes with titles, such as <section>, update
    the search result title and url, and recurse down.  A section with
    no (or an empty) title inherits the name and url of its parent.

    For the container nodes <body> and <guide>, simply recurse down.

    For the terminal nodes, such as <p>, convert the contents of the
    node to a string, and add it to the search documents.

    Nodes with any other tag are ignored, together with their children.

    Keyword arguments:
    documents -- the search documents array; new entries are appended
                 in place, each with the keys 'id', 'name', 'text', 'url'
    node -- the node to process
    name -- the title to display for the search term match; may be None
            or empty when no enclosing title has been seen yet
    url -- the url for the search term match in the document
    """
    tag = node.tag

    if tag in ('p', 'important', 'note', 'warning'):
        documents.append({'id': len(documents),
                          'name': name,
                          'text': stringify_node(node),
                          'url': url})
        return

    if tag == 'chapter':
        name = _title_text(node)
    elif tag in ('section', 'subsection', 'subsubsection'):
        title = _title_text(node)
        if title:
            # The caller may not have supplied a name yet (the script's
            # entry point starts the recursion with None), so only
            # prepend the parent name when there is one.
            name = name + ' -> ' + title if name else title
            # Drop any anchor inherited from an enclosing section
            # before attaching the anchor derived from this title.
            url = "{url_base}#{anchor}".format(
                url_base=url.split('#')[0],
                anchor=title.lower().replace(' ', '-'))
    elif tag not in ('body', 'guide'):
        # Unknown tags contribute nothing to the search index.
        return

    for child in node:
        process_node(documents, child, name, url)
def main(pathnames: list) -> None:
    """The entry point of the script.

    Parse every given XML file, collect its search documents, and print
    them to standard output as a JavaScript assignment of the form
    'var documents = [...];'.

    Any error raised while parsing or processing a file propagates to
    the caller unchanged.

    Keyword arguments:
    pathnames -- a list of path names to process in sequential order
    """
    url_root = 'https://devmanual.gentoo.org/'
    documents = []

    for path in pathnames:
        root = ET.parse(path).getroot()

        # The directory part of the path is used as the page location
        # below the site root.  A path without a directory component
        # maps to the site root itself, without doubling the slash.
        directory = os.path.dirname(path)
        if directory:
            url = url_root + directory + '/'
        else:
            url = url_root

        process_node(documents, root, None, url)

    print('var documents = ' + json.dumps(documents) + ';')
# Run only when executed as a script.  The comparison must be an exact
# equality test: a substring test ('in') would also fire when this file
# is imported under a module name such as 'main'.
if __name__ == '__main__':
    main(sys.argv[1:])
|