X-Git-Url: http://gitweb.fperrin.net/?p=gen-quickdic.git;a=blobdiff_plain;f=XMLittre_to_tab_separated;fp=XMLittre_to_tab_separated;h=71a42bab4465ca43141df3b347b8bdc8a2494603;hp=0c041d99e76727ce033511953eae791458a13cb3;hb=e701d98cb3befe8f5af017850c0575c0382a853c;hpb=028a6accf7d7d721797482d65dd4dc9d13611840
diff --git a/XMLittre_to_tab_separated b/XMLittre_to_tab_separated
index 0c041d9..71a42ba 100755
--- a/XMLittre_to_tab_separated
+++ b/XMLittre_to_tab_separated
@@ -2,8 +2,25 @@
from glob import glob
from lxml import etree
+import re
+from copy import deepcopy
-xslt_entree = etree.XSLT(etree.parse("XMLittre-entree.xslt"))
class MassageLink(etree.XSLTExtension):
    """XSLT extension element that rewrites a cross-reference as an ``<a>`` link."""

    def execute(self, context, self_node, input_node, output_parent):
        """Append an ``<a>`` element built from *input_node* to *output_parent*.

        The link target is the leading part of the node's ``ref`` attribute,
        up to the first ``#``, ``.`` or space, upper-cased and embedded in a
        ``q://d?&`` URL.  The visible label is the node's own text, except
        that the placeholder text ``"ce mot"`` (or a missing text) is
        replaced by the target term.  The label is always lower-cased.

        Raises:
            KeyError: if *input_node* has no ``ref`` attribute.
            ValueError: if ``ref`` does not start with a usable term.
        """
        ref = input_node.attrib["ref"]
        found = re.match("^[^#. ]+", ref)
        if found is None:
            # Previously surfaced as an opaque AttributeError on None.
            raise ValueError(f"cannot extract a term from ref {ref!r}")
        term = found.group(0)

        # ``.text`` is None when the element has no leading text; fall back
        # to the target term instead of failing on ``None.lower()``.
        label = input_node.text
        if label is None or label == "ce mot":
            label = term

        link = etree.Element("a")
        link.text = label.lower()
        # NOTE(review): assumes the small-caps style applies to every link,
        # not only to the non-"ce mot" branch — confirm against the original.
        link.attrib["style"] = "font-variant: small-caps"
        link.attrib["href"] = f"q://d?&{term.upper()}"
        output_parent.append(link)
+
# Compile the per-entry stylesheet once at import time, wiring the
# ("xmlittre", "massagelink") extension element to a MassageLink instance.
xslt_entree = etree.XSLT(
    etree.parse("XMLittre-entree.xslt"),
    extensions={("xmlittre", "massagelink"): MassageLink()},
)
class Parser:
def __init__(self):
@@ -15,32 +32,82 @@ class Parser:
for entree in root.getchildren():
assert entree.tag == "entree"
terme = entree.attrib["terme"]
+ termes, sous_titre = massage_terme(terme)
+
entree_html = xslt_entree(entree)
entree_text = str(entree_html)
- entree_text = entree_text.replace("\n", "")
- if terme not in self.entrees:
- self.entrees[terme] = []
- self.entrees[terme].append(entree_text)
+ # entree_text = entree_text.replace("\n", "")
+ entree_text = re.sub(r"\s+|\n", " ", entree_text)
+ assert entree_text.startswith("
") and entree_text.endswith("
")
+ entree_text = entree_text[5:]
+ entree_text = entree_text[:-6]
+
+ if termes not in self.entrees:
+ self.entrees[termes] = []
+
+ self.entrees[termes].append((sous_titre, entree_text))
def writeout(self, fname):
with open(fname, "w") as f:
- for terme in self.entrees:
- f.write(terme)
+ for termes in self.entrees:
+ f.write(", ".join(termes))
f.write("\t")
- if len(self.entrees[terme]) > 1:
- f.write("")
- for entree in self.entrees[terme]:
- f.write("- ")
+ if len(self.entrees[termes]) > 1:
+ for i, (sous_titre, entree) in enumerate(self.entrees[termes]):
+ if sous_titre:
+ f.write(f"
{i+1}. {sous_titre}
")
+ else:
+ f.write(f"{i+1}. {','.join(termes)}
")
f.write(entree)
- f.write(" ")
- f.write("
")
else:
- f.write(self.entrees[terme][0])
+ sous_titre, entree = self.entrees[termes][0]
+ if sous_titre:
+ f.write(f"{sous_titre}
")
+ f.write(entree)
f.write("\n")
def massage_terme(terme):
    """Split a raw headword string into lookup forms and an optional subtitle.

    Args:
        terme: the headword text of an entry, e.g. ``"CLÉ ou CLEF"`` or
            ``"DIVINIS (A)"``.

    Returns:
        A pair ``(formes, sous_terme)``.  ``formes`` is a tuple of the
        headwords found in *terme*.  ``sous_terme`` is ``None`` when *terme*
        is a single plain headword; otherwise it is the text to show as a
        subtitle (the original string, with ``X (Y)`` rewritten to ``Y X``
        unless a comma-separated form reset it to *terme*).
    """
    # Variants are separated by lower-case connective prose ("ou",
    # "ou mieux", "suivant le dictionnaire de l'Académie, ..."): one space
    # or comma followed by one or more lower-case words, each optionally
    # capitalised.  The character class must not contain whitespace,
    # otherwise ", " alone would count as a separator and "ÉREINTÉ, ÉE"
    # would be split into two headwords.
    pieces = re.split(r"[ ,](?:[A-Z]?[a-zéêèàç-]+[ ,']*)+", terme)
    vedettes = [piece for piece in pieces if piece]

    sous_terme = terme if len(vedettes) > 1 else None
    retval = []
    for v in vedettes:
        # "BORDEAUX (VIN DE)" -> "VIN DE BORDEAUX"; "HOMME (L')" -> "L'HOMME".
        m = re.match(r"^(.*) \((.*)\)$", v)
        if m:
            tete, prefixe = m.group(1), m.group(2)
            # No joining space after an elision, nor when the parenthetical
            # is empty (which used to raise IndexError on ``[-1]``).
            joint = "" if not prefixe or prefixe.endswith("'") else " "
            v = f"{prefixe}{joint}{tete}"
            if sous_terme:
                sous_terme = sous_terme.replace(m.group(0), v)
        # "ÉREINTÉ, ÉE": keep only the first form as the lookup key and show
        # the full original string as the subtitle.
        formes = v.split(",")
        if len(formes) > 1:
            v = formes[0]
            sous_terme = terme
        retval.append(v)
    return (tuple(retval), sous_terme)
+
def test_massage_terme():
    """Check massage_terme against representative headword shapes."""
    # Single plain headword: no subtitle.
    assert massage_terme("ÉTANCHE") == (("ÉTANCHE",), None)
    # Comma-separated feminine form: first form is the key, full text the subtitle.
    assert massage_terme("ÉREINTÉ, ÉE") == (("ÉREINTÉ",), "ÉREINTÉ, ÉE")
    # Variants joined by lower-case connective prose.
    assert massage_terme("CLÉ ou CLEF") == (("CLÉ", "CLEF"), "CLÉ ou CLEF")
    assert massage_terme("HÉMORRAGIE, suivant le dictionnaire de l'Académie, mais mieux HÉMORRHAGIE") == \
        (("HÉMORRAGIE", "HÉMORRHAGIE"), "HÉMORRAGIE, suivant le dictionnaire de l'Académie, mais mieux HÉMORRHAGIE")
    assert massage_terme("QUINDÉCEMVIRS ou mieux QUINDÉCIMVIRS puisque le latin est quindecimviri") \
        == (("QUINDÉCEMVIRS", "QUINDÉCIMVIRS"), "QUINDÉCEMVIRS ou mieux QUINDÉCIMVIRS puisque le latin est quindecimviri")
    assert massage_terme("DÉVOUEMENT, ou comme quelques-uns écrivent, dit l'Académie, DÉVOÛMENT") == \
        (("DÉVOUEMENT", "DÉVOÛMENT"), "DÉVOUEMENT, ou comme quelques-uns écrivent, dit l'Académie, DÉVOÛMENT")
    assert massage_terme("HÉMA- ou HÉMO- ou HÉMATO-") == (("HÉMA-", "HÉMO-", "HÉMATO-"), "HÉMA- ou HÉMO- ou HÉMATO-")
    # Trailing parenthetical is moved in front of the headword.
    assert massage_terme("DIVINIS (A)") == (("A DIVINIS",), None)
    assert massage_terme("BORDEAUX (VIN DE) ou vulgairement et en termes de commerce BORDEAUX") == \
        (("VIN DE BORDEAUX", "BORDEAUX"), "VIN DE BORDEAUX ou vulgairement et en termes de commerce BORDEAUX")
+
def main():
    """Parse every single-character-named XML data file and write the result.

    Files are processed in sorted order and each file name is printed
    before it is parsed.
    """
    parser = Parser()
    data_files = sorted(glob("../xmlittre-data/?.xml"))
    for path in data_files:
        print(f"Running on {path}")
        parser.parse_file(path)
    parser.writeout("XMLittre.tab_separated")