# ommlparser.py
  1. from xml.etree.cElementTree import Element
  2. from .utils import qn
  3. class OMMLParser:
  4. """
  5. Parser class for reading OMML and converting it into LaTeX.
  6. """
  7. FUNCTION_MAP = {
  8. "sin": "\\sin",
  9. "cos": "\\cos",
  10. "tan": "\\tan",
  11. "cot": "\\cot",
  12. "sec": "\\sec",
  13. "csc": "\\csc",
  14. "sinh": "\\sinh",
  15. "cosh": "\\cosh",
  16. "tanh": "\\tanh",
  17. "coth": "\\coth",
  18. "sech": "\\operatorname{sech}",
  19. "csch": "\\operatorname{csch}",
  20. "log": "\\log",
  21. "ln": "\\ln",
  22. "min": "\\min",
  23. "max": "\\max",
  24. "lim": "\\lim",
  25. }
  26. def _normalize_func_name(self, content: str) -> str:
  27. if not content:
  28. return content
  29. if content.startswith("\\"):
  30. return content
  31. key = content.strip()
  32. mapped = self.FUNCTION_MAP.get(key)
  33. return mapped if mapped else content
  34. def parse(self, root: Element) -> str:
  35. """
  36. Parses an m:oMath OMML tag into LaTeX.
  37. :param root: An m:oMath OMML tag
  38. :return: The LaTeX representation of the OMML input
  39. """
  40. text = ""
  41. try:
  42. if root.tag == qn("m:t"):
  43. return self.parse_t(root)
  44. for child in root:
  45. if child.tag in self.parsers:
  46. text += self.parsers[child.tag](self, child)
  47. except AttributeError:
  48. # In case of missing attributes on OMML tags,
  49. # we return an empty string (ref:issue_14)
  50. return ""
  51. return text
  52. def parse_e(self, root: Element) -> str:
  53. text = ""
  54. for child in root:
  55. text += self.parse(child)
  56. return text
  57. def parse_r(self, root: Element) -> str:
  58. # TODO: Add support for m:rPr and m:scr to support different character styles
  59. # For now, we just parse the text content of m:r
  60. text = ""
  61. for child in root:
  62. text += self.parse(child)
  63. return text
  64. def parse_t(self, root: Element):
  65. symbol_map = {
  66. "≜": "\\triangleq",
  67. "≝": "\\stackrel{\\tiny def}{=}",
  68. "≞": "\\stackrel{\\tiny m}{=}",
  69. }
  70. replacements = {
  71. "<": "\\lt ",
  72. ">": "\\gt ",
  73. "≤": "\\leq ",
  74. "≥": "\\geq ",
  75. "∞": "\\infty ",
  76. "<": "\\lt ",
  77. ">": "\\gt ",
  78. "≤": "\\leq ",
  79. "≥": "\\geq ",
  80. }
  81. text = root.text.split()
  82. if not text:
  83. return " "
  84. for i, t in enumerate(text):
  85. if t in symbol_map:
  86. text[i] = symbol_map[t]
  87. for key, value in replacements.items():
  88. for i, t in enumerate(text):
  89. text[i] = t.replace(key, value)
  90. return " ".join(text)
  91. def parse_acc(self, root: Element) -> str:
  92. character_map = {
  93. 768: "\\grave",
  94. 769: "\\acute",
  95. 770: "\\hat",
  96. 771: "\\tilde",
  97. 773: "\\bar",
  98. 774: "\\breve",
  99. 775: "\\dot",
  100. 776: "\\ddot",
  101. 780: "\\check",
  102. 831: "\\overline{\\overline",
  103. 8400: "\\overset\\leftharpoonup",
  104. 8401: "\\overset\\rightharpoonup",
  105. 8406: "\\overleftarrow",
  106. 8407: "\\overrightarrow",
  107. 8411: "\\dddot",
  108. 8417: "\\overset\\leftrightarrow",
  109. }
  110. text = ""
  111. accent = 770
  112. for child in root:
  113. if child.tag == qn("m:accPr"):
  114. for child2 in child:
  115. if child2.tag == qn("m:chr"):
  116. val = child2.attrib.get(qn("m:val"))
  117. if val:
  118. try:
  119. accent = ord(val)
  120. except TypeError:
  121. pass
  122. accent_cmd = character_map.get(accent)
  123. if accent_cmd is None:
  124. accent_cmd = character_map.get(770, "\\hat")
  125. text += accent_cmd + "{"
  126. for child in root:
  127. if child.tag == qn("m:e"):
  128. text += self.parse(child)
  129. text += "}"
  130. if accent == 831:
  131. text += "}"
  132. return text
  133. def parse_bar(self, root: Element) -> str:
  134. text = "\\overline{"
  135. for child in root:
  136. if child.tag == qn("m:barPr"):
  137. for child2 in child:
  138. if child2.tag == qn("m:pos"):
  139. if child2.attrib.get(qn("m:val")) == "bot":
  140. text = "\\underline{"
  141. for child in root:
  142. if child.tag == qn("m:e"):
  143. text += self.parse(child)
  144. text += "}"
  145. return text
  146. def parse_border_box(self, root: Element) -> str:
  147. text = "\\boxed{"
  148. for child in root:
  149. if child.tag == qn("m:e"):
  150. text += self.parse(child)
  151. text += "}"
  152. return text
  153. def parse_box(self, root: Element) -> str:
  154. text = ""
  155. for child in root:
  156. text += self.parse(child)
  157. return text
  158. def parse_group_chr(self, root: Element) -> str:
  159. character_map = {
  160. "←": "\\leftarrow",
  161. "→": "\\rightarrow",
  162. "↔": "\\leftrightarrow",
  163. "⇐": "\\Leftarrow",
  164. "⇒": "\\Rightarrow",
  165. "⇔": "\\Leftrightarrow",
  166. }
  167. text = "\\underbrace{"
  168. bottom = False
  169. for child in root:
  170. if child.tag == qn("m:groupChrPr"):
  171. for child2 in child:
  172. if child2.tag == qn("m:chr"):
  173. char = child2.attrib.get(qn("m:val"))
  174. if char in character_map:
  175. text = character_map[char]
  176. for child2 in child:
  177. if (
  178. child2.tag == qn("m:pos")
  179. and child2.attrib.get(qn("m:val")) == "top"
  180. ):
  181. # If m:pos is set to "top", the symbol is supposed to
  182. # be on top and the text is actually supposed to be under
  183. bottom = True
  184. content = ""
  185. for child in root:
  186. if child.tag == qn("m:e"):
  187. content = self.parse(child)
  188. if text == "\\underbrace{":
  189. if bottom:
  190. text = "\\overbrace{" + content + "}"
  191. else:
  192. text += content + "}"
  193. else:
  194. if not bottom:
  195. text = "\\overset{" + content + "}" + "{" + text + "}"
  196. else:
  197. text = "\\underset{" + content + "}" + "{" + text + "}"
  198. return text
  199. def parse_d(self, root: Element) -> str:
  200. bracket_map = {
  201. "(": "\\left(",
  202. ")": "\\right)",
  203. "[": "\\left[",
  204. "]": "\\right]",
  205. "{": "\\left{",
  206. "}": "\\right}",
  207. "〈": "\\left\\langle",
  208. "〉": "\\right\\rangle",
  209. "⟨": "\\left\\langle",
  210. "⟩": "\\right\\rangle",
  211. "⌊": "\\left\\lfloor",
  212. "⌋": "\\right\\rfloor",
  213. "⌈": "\\left\\lceil",
  214. "⌉": "\\right\\rceil",
  215. "|": "\\left|",
  216. "‖": "\\left\\|",
  217. "⟦": "[\\![",
  218. "⟧": "]\\!]",
  219. }
  220. text = ""
  221. start_bracket = "("
  222. end_bracket = ")"
  223. seperator = "|"
  224. is_matrix = False
  225. for child in root:
  226. for child2 in child:
  227. if child.tag == qn("m:dPr"):
  228. if child2.tag == qn("m:begChr"):
  229. start_bracket = child2.attrib.get(qn("m:val"))
  230. if child2.tag == qn("m:endChr"):
  231. end_bracket = child2.attrib.get(qn("m:val"))
  232. if child2.tag == qn("m:sepChr"):
  233. seperator = child2.attrib.get(qn("m:val"))
  234. if child2.tag == qn("m:m"):
  235. is_matrix = True
  236. for child in root:
  237. if child.tag == qn("m:e"):
  238. if text:
  239. text += seperator
  240. text += self.parse(child)
  241. end_bracket_replacements = {
  242. "|": "\\right|",
  243. "‖": "\\right\\|",
  244. "[": "\\right[",
  245. }
  246. start_bracket_replacements = {
  247. "]": "\\left]",
  248. }
  249. start = ""
  250. end = ""
  251. if start_bracket:
  252. if start_bracket in start_bracket_replacements:
  253. start = start_bracket_replacements[start_bracket] + " "
  254. elif start_bracket in bracket_map:
  255. start = bracket_map[start_bracket] + " "
  256. else:
  257. start = "\\left(" + " "
  258. if end_bracket:
  259. if end_bracket in end_bracket_replacements:
  260. end = " " + end_bracket_replacements[end_bracket]
  261. elif end_bracket in bracket_map:
  262. end = " " + bracket_map[end_bracket]
  263. else:
  264. end = " " + "\\right)"
  265. # If there is no end bracket and this tag contains an m:eqArr tag as a
  266. # child, we assume that the eqArr should be translated to a cases environment
  267. # instead of an eqnarray* environment.
  268. else:
  269. for child in root:
  270. if child.tag == qn("m:e"):
  271. for child2 in child:
  272. if child2.tag == qn("m:eqArr"):
  273. text = text.replace("\\begin{eqnarray*}", "")
  274. text = text.replace("\\end{eqnarray*}", "")
  275. return "\\begin{cases} " + text + " \\end{cases}"
  276. if is_matrix:
  277. if start_bracket == "(" and end_bracket == ")":
  278. return text.replace("{matrix}", "{pmatrix}")
  279. elif start_bracket == "|" and end_bracket == "|":
  280. return text.replace("{matrix}", "{vmatrix}")
  281. elif start_bracket == "‖" and end_bracket == "‖":
  282. return text.replace("{matrix}", "{Vmatrix}")
  283. else:
  284. return text.replace("{matrix}", "{bmatrix}")
  285. return start + text + end
  286. def parse_eq_arr(self, root: Element) -> str:
  287. text = "\\begin{eqnarray*}"
  288. for child in root:
  289. if child.tag == qn("m:e"):
  290. text += self.parse(child) + " \\\\"
  291. text += "\\end{eqnarray*}"
  292. return text
  293. def parse_f(self, root: Element) -> str:
  294. text = "\\frac{"
  295. num = ""
  296. den = ""
  297. is_binom = False
  298. for child in root:
  299. if child.tag == qn("m:fPr"):
  300. for child2 in child:
  301. if (
  302. child2.tag == qn("m:type")
  303. and child2.attrib.get(qn("m:val")) == "noBar"
  304. ):
  305. is_binom = True
  306. if child.tag == qn("m:num"):
  307. num = self.parse(child)
  308. if child.tag == qn("m:den"):
  309. den = self.parse(child)
  310. if is_binom:
  311. text = "\\genfrac{}{}{0pt}{}{"
  312. text += num + "}{" + den + "}"
  313. return text
  314. def parse_m(self, root: Element) -> str:
  315. text = "\\begin{matrix} "
  316. text += self.parse(root)[:-3] # Remove the last ' \\'
  317. text += "\\end{matrix}"
  318. return text
  319. def parse_mr(self, root: Element) -> str:
  320. text = ""
  321. for child in root:
  322. if child.tag == qn("m:e"):
  323. text += self.parse(child) + " & "
  324. return text[:-2] + "\\\\ " # Remove the last ' & '
  325. def parse_func(self, root: Element) -> str:
  326. subscript = ""
  327. superscript = ""
  328. text = ""
  329. func_name = "sin"
  330. for child in root:
  331. if child.tag == qn("m:fName"):
  332. for child2 in child:
  333. if child2.tag in [qn("m:sSup"), qn("m:sSub"), qn("m:r")]:
  334. for child3 in child2:
  335. if child3.tag == qn("m:sub"):
  336. subscript = self.parse(child3)
  337. if child3.tag == qn("m:sup"):
  338. superscript = self.parse(child3)
  339. if child3.tag == qn("m:t") or child3.tag == qn("m:e"):
  340. func_name = self.parse(child3)
  341. elif child2.tag == qn("m:limLow"):
  342. for child3 in child2:
  343. if child3.tag == qn("m:lim"):
  344. for child4 in child3:
  345. subscript += self.parse(child4)
  346. if child3.tag == qn("m:e"):
  347. func_name = self.parse(child3)
  348. if child.tag == qn("m:e"):
  349. text += self.parse(child)
  350. if func_name in ["lim", "max", "min"]:
  351. return f"\\{func_name}\\limits_{{{subscript}}}^{{{superscript}}}{{{text}}}"
  352. if func_name not in self.FUNCTION_MAP:
  353. return f"{{{func_name}}}^{{{superscript}}}_{{{subscript}}}{{{text}}}"
  354. return (
  355. self.FUNCTION_MAP[func_name]
  356. + f"_{{{subscript}}}^{{{superscript}}}{{{text}}}"
  357. )
  358. def parse_s_sup(self, root: Element) -> str:
  359. content = ""
  360. exp_content = ""
  361. for child in root:
  362. if child.tag == qn("m:e"):
  363. content = self.parse(child)
  364. if child.tag == qn("m:sup"):
  365. exp_content = self.parse(child)
  366. content = self._normalize_func_name(content)
  367. return f"{{{content}}}^{{{exp_content}}}"
  368. def parse_s_sub(self, root: Element) -> str:
  369. content = ""
  370. sub_content = ""
  371. for child in root:
  372. if child.tag == qn("m:e"):
  373. content = self.parse(child)
  374. if child.tag == qn("m:sub"):
  375. sub_content = self.parse(child)
  376. content = self._normalize_func_name(content)
  377. return f"{{{content}}}_{{{sub_content}}}"
  378. def parse_s_sub_sup(self, root: Element) -> str:
  379. content = ""
  380. sub_content = ""
  381. exp_content = ""
  382. for child in root:
  383. if child.tag == qn("m:e"):
  384. content = self.parse(child)
  385. if child.tag == qn("m:sub"):
  386. sub_content = self.parse(child)
  387. if child.tag == qn("m:sup"):
  388. exp_content = self.parse(child)
  389. content = self._normalize_func_name(content)
  390. return f"{{{content}}}_{{{sub_content}}}^{{{exp_content}}}"
  391. def parse_s_pre(self, root: Element) -> str:
  392. content = ""
  393. sub_content = ""
  394. exp_content = ""
  395. for child in root:
  396. if child.tag == qn("m:e"):
  397. content = self.parse(child)
  398. if child.tag == qn("m:sub"):
  399. sub_content = self.parse(child)
  400. if child.tag == qn("m:sup"):
  401. exp_content = self.parse(child)
  402. return "{}^{" + exp_content + "}_{" + sub_content + "}{" + content + "}"
  403. def parse_rad(self, root: Element) -> str:
  404. content = ""
  405. order = ""
  406. for child in root:
  407. if child.tag == qn("m:deg"):
  408. order = self.parse(child)
  409. if child.tag == qn("m:e"):
  410. content += self.parse(child)
  411. if order:
  412. return f"\\sqrt[{order}]{{{content}}}"
  413. return f"\\sqrt{{{content}}}"
  414. def parse_nary(self, root: Element) -> str:
  415. character_map = {
  416. 8719: "\\prod",
  417. 8720: "\\coprod",
  418. 8721: "\\sum",
  419. 8747: "\\int",
  420. 8748: "\\iint",
  421. 8749: "\\iiint",
  422. 8750: "\\oint",
  423. 8751: "\\oiint",
  424. 8752: "\\oiiint",
  425. 8896: "\\bigwedge",
  426. 8897: "\\bigvee",
  427. 8898: "\\bigcap",
  428. 8899: "\\bigcup",
  429. }
  430. char = 8747
  431. for child in root:
  432. if child.tag == qn("m:naryPr"):
  433. for child2 in child:
  434. if child2.tag == qn("m:chr"):
  435. val = child2.attrib.get(qn("m:val"))
  436. if val:
  437. try:
  438. char = ord(val)
  439. except TypeError:
  440. pass
  441. text = character_map.get(char, character_map[8721])
  442. sub = ""
  443. sup = ""
  444. content = ""
  445. for child in root:
  446. if child.tag == qn("m:sub"):
  447. sub = self.parse(child)
  448. if child.tag == qn("m:sup"):
  449. sup = self.parse(child)
  450. if child.tag == qn("m:e"):
  451. content = self.parse(child)
  452. if sub:
  453. text += f"_{{{sub}}}"
  454. if sup:
  455. text += f"^{{{sup}}}"
  456. text += "{" + content + "}"
  457. return text
  458. parsers = {
  459. qn("m:r"): parse_r,
  460. qn("m:acc"): parse_acc,
  461. qn("m:borderBox"): parse_border_box,
  462. qn("m:bar"): parse_bar,
  463. qn("m:box"): parse_box,
  464. qn("m:d"): parse_d,
  465. qn("m:e"): parse_e,
  466. qn("m:groupChr"): parse_group_chr,
  467. qn("m:f"): parse_f,
  468. qn("m:sSup"): parse_s_sup,
  469. qn("m:sSub"): parse_s_sub,
  470. qn("m:sSubSup"): parse_s_sub_sup,
  471. qn("m:sPre"): parse_s_pre,
  472. qn("m:t"): parse_t,
  473. qn("m:rad"): parse_rad,
  474. qn("m:nary"): parse_nary,
  475. qn("m:eqArr"): parse_eq_arr,
  476. qn("m:func"): parse_func,
  477. qn("m:m"): parse_m,
  478. qn("m:mr"): parse_mr,
  479. }