#!/usr/bin/python3
"""Convert the XMLittre XML files into a single tab-separated file.

Every ``entree`` element of every input file is run through the
``XMLittre-entree.xslt`` stylesheet. The output file holds one line per
distinct ``terme``: the term, a TAB character, then the transformed entry
(or a list of entries when several share the same term).
"""

from glob import glob

from lxml import etree

# Compiled once at import time and applied to every <entree> element.
xslt_entree = etree.XSLT(etree.parse("XMLittre-entree.xslt"))


class Parser:
    """Collect transformed dictionary entries, grouped by their term."""

    def __init__(self):
        # Maps each "terme" attribute value to the list of transformed
        # entries seen for it, in the order they were parsed.
        self.entrees = {}

    def parse_file(self, fname):
        """Parse one XML file and record each of its entries.

        Args:
            fname: Path of the XML file to read. Every child of its root
                element must be an ``entree`` element carrying a ``terme``
                attribute.

        Raises:
            ValueError: If a child of the root is not an ``entree`` element.
            KeyError: If an ``entree`` element has no ``terme`` attribute.
        """
        root = etree.parse(fname).getroot()
        # Iterate the element directly; getchildren() is deprecated in lxml.
        for entree in root:
            # Explicit check instead of `assert`, which is stripped under -O.
            if entree.tag != "entree":
                raise ValueError(
                    "{}: unexpected element {!r}, expected 'entree'".format(
                        fname, entree.tag
                    )
                )
            terme = entree.attrib["terme"]
            # The output format is one record per line, so newlines produced
            # by the stylesheet are removed.
            entree_text = str(xslt_entree(entree)).replace("\n", "")
            self.entrees.setdefault(terme, []).append(entree_text)

    def writeout(self, fname):
        """Write every collected term to ``fname``, one line per term.

        A term with a single entry is written as ``terme<TAB>entry``. A term
        with several entries has them wrapped in an ordered list so that
        they stay on the same line.

        Args:
            fname: Path of the output file; it is created or overwritten.
        """
        # Explicit UTF-8: the locale's default encoding may be unable to
        # represent the text, or may differ from one machine to the next.
        with open(fname, "w", encoding="utf-8") as f:
            for terme, entrees in self.entrees.items():
                if len(entrees) > 1:
                    # NOTE(review): the <ol>/<li> literals were reconstructed
                    # from an HTML-rendered copy of this script -- confirm
                    # them against the repository.
                    items = "".join(
                        "<li>" + entree + "</li>" for entree in entrees
                    )
                    definition = "<ol>" + items + "</ol>"
                else:
                    definition = entrees[0]
                f.write(terme + "\t" + definition + "\n")


def main():
    """Parse every single-character-named XML file and write the result."""
    p = Parser()
    # glob() returns paths in arbitrary order; sorting makes the order of
    # the output lines reproducible from one run to the next.
    for fname in sorted(glob("../xmlittre-data/?.xml")):
        p.parse_file(fname)
    p.writeout("XMLittre.tab_separated")


if __name__ == "__main__":
    main()