]> gitweb.fperrin.net Git - gen-quickdic.git/blob - XMLittre_to_tab_separated
snapshot of current effort
[gen-quickdic.git] / XMLittre_to_tab_separated
1 #!/usr/bin/python3
2
3 from glob import glob
4 from lxml import etree
5
# Compiled XSLT transform that parse_file() applies to each <entree> element.
# The stylesheet is loaded once, at import time, from a path relative to the
# current working directory.
xslt_entree = etree.XSLT(etree.parse("XMLittre-entree.xslt"))
7
class Parser:
    """Collect rendered ``<entree>`` elements and dump them tab-separated.

    Entries are grouped by their ``terme`` attribute.  A headword that
    occurs several times keeps all of its renderings, in parse order.
    """

    def __init__(self):
        # Maps each headword ("terme") to the list of rendered entry strings.
        self.entrees = {}

    def parse_file(self, fname):
        """Render every child of the root element of *fname* and store it.

        Each child is run through the module-level ``xslt_entree``
        transform, converted to ``str`` and stripped of newline characters
        so that it fits on a single output line.

        Raises:
            AssertionError: a child of the root is not an ``entree``
                element (only when assertions are enabled).
            KeyError: an entry has no ``terme`` attribute.
        """
        root = etree.parse(fname).getroot()
        # Iterate the element itself: getchildren() is deprecated in lxml.
        for entree in root:
            assert entree.tag == "entree", (
                "unexpected element %r in %s" % (entree.tag, fname)
            )
            terme = entree.attrib["terme"]
            entree_text = str(xslt_entree(entree)).replace("\n", "")
            self.entrees.setdefault(terme, []).append(entree_text)

    def writeout(self, fname):
        """Write one ``terme<TAB>markup`` line per headword to *fname*.

        A headword with a single rendering is written as is; several
        renderings are wrapped in an ``<ol>`` with one ``<li>`` each.
        The file is always encoded as UTF-8, independent of the locale.
        """
        with open(fname, "w", encoding="utf-8") as f:
            for terme, rendered in self.entrees.items():
                if len(rendered) > 1:
                    items = "".join(
                        "<li>" + entree + "</li>" for entree in rendered
                    )
                    body = "<ol>" + items + "</ol>"
                else:
                    body = rendered[0]
                f.write(terme + "\t" + body + "\n")
40
def main():
    """Convert the XML data files into a single tab-separated file.

    Parses every file matching ``../xmlittre-data/?.xml`` (a one-character
    base name) and writes the collected entries to
    ``XMLittre.tab_separated`` in the current working directory.
    """
    p = Parser()
    # glob() returns paths in file-system-dependent order; sort them so the
    # output is reproducible from one run or machine to the next.
    for fname in sorted(glob("../xmlittre-data/?.xml")):
        p.parse_file(fname)
    p.writeout("XMLittre.tab_separated")
46
# Run the conversion only when executed as a script, not when imported.
if __name__ == "__main__":
    main()