]> gitweb.fperrin.net Git - gen-quickdic.git/blobdiff - XMLittre_to_tab_separated
snapshot of current effort
[gen-quickdic.git] / XMLittre_to_tab_separated
diff --git a/XMLittre_to_tab_separated b/XMLittre_to_tab_separated
new file mode 100755 (executable)
index 0000000..0c041d9
--- /dev/null
@@ -0,0 +1,48 @@
+#!/usr/bin/python3
+
+from glob import glob
+from lxml import etree
+
+# XSLT transform applied to each <entree> element by Parser.parse_file.
+# The stylesheet is loaded once at import time; its path is relative, so it
+# is looked up from the current working directory.
+xslt_entree = etree.XSLT(etree.parse("XMLittre-entree.xslt"))
+
+class Parser:
+    """Collect entries from XML files and write them as tab-separated text.
+
+    Entries are grouped by the value of their ``terme`` attribute. Each entry
+    is stored as the string produced by the module-level ``xslt_entree``
+    transform, with newlines removed.
+    """
+
+    def __init__(self):
+        # Maps each term to the list of its rendered entries, in the order
+        # in which they were parsed.
+        self.entrees = {}
+
+    def parse_file(self, fname):
+        """Parse one XML file and add all of its entries to ``self.entrees``.
+
+        Every child of the document root must be an ``entree`` element
+        carrying a ``terme`` attribute.
+
+        Raises:
+            ValueError: if a child of the root is not an ``entree`` element.
+            KeyError: if an ``entree`` element has no ``terme`` attribute.
+        """
+        root = etree.parse(fname).getroot()
+        # Iterate the element directly: getchildren() is deprecated in lxml.
+        for entree in root:
+            # Explicit check rather than assert, which is stripped under -O.
+            if entree.tag != "entree":
+                raise ValueError(
+                    "%s: expected an 'entree' element, found %r"
+                    % (fname, entree.tag)
+                )
+            terme = entree.attrib["terme"]
+            # Remove newlines so that the rendered entry fits on the single
+            # output line that writeout() emits for its term.
+            entree_text = str(xslt_entree(entree)).replace("\n", "")
+            self.entrees.setdefault(terme, []).append(entree_text)
+
+    def writeout(self, fname):
+        """Write all collected entries to ``fname``, one line per term.
+
+        Each line is the term, a tab character, then the rendered entry.
+        When a term has more than one entry, the entries are wrapped in an
+        ``<ol>`` list with one ``<li>`` item per entry.
+        """
+        # Explicit UTF-8: the default text encoding depends on the locale and
+        # may fail on, or mangle, non-ASCII characters in terms and entries.
+        with open(fname, "w", encoding="utf-8") as f:
+            for terme, entrees in self.entrees.items():
+                if len(entrees) > 1:
+                    items = "".join(
+                        "<li>" + entree + "</li>" for entree in entrees
+                    )
+                    body = "<ol>" + items + "</ol>"
+                else:
+                    body = entrees[0]
+                f.write(terme + "\t" + body + "\n")
+
+def main():
+    """Parse the XML data files and write the combined tab-separated file.
+
+    Reads every file matching ``../xmlittre-data/?.xml`` (a single-character
+    base name) and writes the result to ``XMLittre.tab_separated``. Both
+    paths are relative to the current working directory.
+    """
+    p = Parser()
+    # glob() returns matches in arbitrary order; sort them so that the order
+    # of lines in the output file is reproducible from run to run.
+    for fname in sorted(glob("../xmlittre-data/?.xml")):
+        p.parse_file(fname)
+    p.writeout("XMLittre.tab_separated")
+
+# Run the conversion only when executed as a script, not when imported.
+if __name__ == "__main__":
+    main()