latex equation in markdown support added

This commit is contained in:
Nicolas 2025-04-14 00:19:09 +02:00
parent 08a6bea78d
commit c84dcf6642
2 changed files with 174 additions and 102 deletions

View File

@ -6,7 +6,8 @@ import re
import io import io
from . import latex from . import latex
import pkgutil import pkgutil
from html.parser import HTMLParser
from io import StringIO
HTML_OUTPUT = 0 HTML_OUTPUT = 0
LATEX_OUTPUT = 1 LATEX_OUTPUT = 1
@ -53,23 +54,8 @@ def _get_pkgutil_string(path: str) -> str:
def _markdown_to_html(text: str) -> str: def _markdown_to_html(text: str) -> str:
prep_text = re.sub(r'\u00A0', ' ', text) # non-breaking space prep_text = re.sub(r'\u00A0', ' ', text) # non-breaking space
html = markdown.markdown(prep_text, extensions=['tables', 'fenced_code', 'def_list', 'abbr', 'sane_lists']) html_text = markdown.markdown(prep_text, extensions=['tables', 'fenced_code', 'def_list', 'abbr', 'sane_lists'])
return html.replace('<hr />', '<hr>') return html_text
def escape_html(text: str) -> str:
    """
    Escapes special HTML characters in a given string.

    Args:
        text: The text to escape

    Returns:
        Escaped text safe for inserting into HTML code. Surrounding
        whitespace is stripped and line breaks are replaced by single spaces.
    """
    # Escape first: running html.escape() after inserting '&nbsp;' would turn
    # its '&' into '&amp;' and show a literal "&nbsp;" instead of a space.
    ret = html.escape(text)
    ret = ret.replace('\u00A0', '&nbsp;')  # non-breaking space
    return ' '.join(ret.strip().splitlines())
def _clean_svg(svg_text: str) -> str: def _clean_svg(svg_text: str) -> str:
@ -140,6 +126,21 @@ def _save_figure(fig: Figure, buff: io.BytesIO, figure_format: FFormat, font_fam
fig.set_size_inches(old_size, None, False) fig.set_size_inches(old_size, None, False)
def escape_html(text: str) -> str:
    """
    Escapes special HTML characters in a given string.

    Args:
        text: The text to escape

    Returns:
        Escaped text safe for inserting into HTML code. Surrounding
        whitespace is stripped and line breaks are replaced by single spaces.
    """
    # Escape first: running html.escape() after inserting '&nbsp;' would turn
    # its '&' into '&amp;' and show a literal "&nbsp;" instead of a space.
    ret = html.escape(text)
    ret = ret.replace('\u00A0', '&nbsp;')  # non-breaking space
    return ' '.join(ret.strip().splitlines())
def figure_to_string(fig: Figure, def figure_to_string(fig: Figure,
figure_format: FFormat = 'svg', figure_format: FFormat = 'svg',
font_family: str | None = None, font_family: str | None = None,
@ -305,6 +306,105 @@ class DocumentWriter():
self._item_count[ref_type] = current_index self._item_count[ref_type] = current_index
return caption_prefix.format(current_index) return caption_prefix.format(current_index)
def _equation_embedding_reescaping(self, text: str) -> str:
"""
Convert $$-escaping of LaTeX blocks and inline expressions
to a HTML-style format: <latex>...</latex>.
"""
block_pattern = re.compile(
r'(^|\n)\s*\$\$\s*\n' # start delimiter on a line on its own
r'(?P<content>.*?)' # capture block content non-greedily
r'\n\s*\$\$\s*(\n|$)', # end delimiter on a line on its own
re.DOTALL | re.MULTILINE
)
def block_repl(match: re.Match[str]) -> str:
content = match.group("content").strip()
latex_label: str = ''
label_pattern = re.compile(r'^\\label\{([^}]+)\}\s*\n?')
label_match = label_pattern.match(content)
if label_match:
latex_label = label_match.group(1)
# Remove the label command from the content.
content = content[label_match.end():].lstrip()
if latex_label and ':' in latex_label:
parts = latex_label.split(':')
ref_type = parts[0]
ref_id = parts[1]
caption = self._add_item(ref_id, ref_type, '({})')
return (f'\n<latex type="block" ref_type="{ref_type}"'
f' ref_id="{ref_id}" caption="{caption}">{content}</latex>\n')
else:
return f'\n<latex type="block">{content}</latex>\n'
result = block_pattern.sub(block_repl, text)
inline_pattern = re.compile(r'\$\$(.+?)\$\$')
def inline_repl(match: re.Match[str]) -> str:
content = match.group(1)
return f'<latex>{content}</latex>'
return inline_pattern.sub(inline_repl, result)
def _get_equation_html(self, latex_equation: str, caption: str, block: bool = False) -> str:
    """
    Renders a LaTeX equation as a figure and wraps it in HTML.

    Args:
        latex_equation: The LaTeX code of the equation
        caption: The text placed in the equation-number element; only used
            when block is True
        block: If True, the equation is wrapped in an equation container
            together with the caption, otherwise in an inline span

    Returns:
        The HTML code of the rendered equation
    """
    fig = latex_to_figure(latex_equation)
    try:
        rendered = figure_to_string(fig, self._figure_format, base64=self._base64_svgs)
    finally:
        # Close the figure even if rendering fails, so it is not leaked.
        plt.close(fig)

    if block:
        return ('<div class="equation-container">'
                '<div class="equation">%s</div>'
                '<div class="equation-number">%s</div></div>') % (rendered, caption)
    return '<span class="inline-equation">' + rendered + '</span>'
def _html_post_processing(self, html_code: str) -> str:
"""
"""
class HTMLPostProcessor(HTMLParser):
def __init__(self, document_writer: 'DocumentWriter') -> None:
super().__init__()
self.modified_html = StringIO()
self.in_latex: bool = False
self.eq_caption: str = ''
self.block: bool = False
self.dw = document_writer
def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
if tag == 'hr':
self.modified_html.write(f"<{tag}>")
elif tag == 'latex':
self.in_latex = True
attr_dict = {k: v if v else '' for k, v in attrs}
self.eq_caption = attr_dict.get('caption', '')
self.block = attr_dict.get('type') == 'block'
elif not self.in_latex:
tag_text = self.get_starttag_text()
if tag_text:
self.modified_html.write(tag_text)
def handle_data(self, data: str) -> None:
if self.in_latex:
self.modified_html.write(
self.dw._get_equation_html(data, self.eq_caption, self.block))
else:
self.modified_html.write(data)
def handle_endtag(self, tag: str) -> None:
if tag == 'latex':
self.in_latex = False
else:
self.modified_html.write(f"</{tag}>")
parser = HTMLPostProcessor(self)
parser.feed(html_code)
return parser.modified_html.getvalue()
def new_field(self, name: str) -> 'DocumentWriter': def new_field(self, name: str) -> 'DocumentWriter':
new_dwr = _create_document_writer() new_dwr = _create_document_writer()
self._fields[name] = new_dwr self._fields[name] = new_dwr
@ -318,7 +418,7 @@ class DocumentWriter():
centered: bool = True) -> None: centered: bool = True) -> None:
""" """
Adds a diagram to the document. Adds a diagram to the document.
Args: Args:
fig: The figure to add (matplotlib figure) fig: The figure to add (matplotlib figure)
caption: The caption for the figure caption: The caption for the figure
@ -329,14 +429,15 @@ class DocumentWriter():
has an individual numbering has an individual numbering
centered: Whether to center the figure in LaTeX output centered: Whether to center the figure in LaTeX output
""" """
caption_prefix = self._add_item(ref_id, ref_type, prefix_pattern)
def render_to_html() -> str: def render_to_html() -> str:
caption_prefix = self._add_item(ref_id, ref_type, prefix_pattern)
return '<div class="figure">%s%s</div>' % ( return '<div class="figure">%s%s</div>' % (
figure_to_string(fig, self._figure_format, base64=self._base64_svgs, scale=self._fig_scale), figure_to_string(fig, self._figure_format, base64=self._base64_svgs, scale=self._fig_scale),
'<br>' + caption_prefix + escape_html(caption) if caption else '') '<br>' + caption_prefix + escape_html(caption) if caption else '')
def render_to_latex() -> str: def render_to_latex() -> str:
self._add_item(ref_id, ref_type, prefix_pattern)
return '\\begin{figure}%s\n%s\n\\caption{%s}\n%s\\end{figure}' % ( return '\\begin{figure}%s\n%s\n\\caption{%s}\n%s\\end{figure}' % (
'\n\\centering' if centered else '', '\n\\centering' if centered else '',
figure_to_string(fig, 'pgf', self._font_family, scale=self._fig_scale), figure_to_string(fig, 'pgf', self._font_family, scale=self._fig_scale),
@ -361,25 +462,27 @@ class DocumentWriter():
centered: Whether to center the table in LaTeX output centered: Whether to center the table in LaTeX output
""" """
assert Table and isinstance(table, Table), 'Table has to be a pandas DataFrame oder DataFrame Styler' assert Table and isinstance(table, Table), 'Table has to be a pandas DataFrame oder DataFrame Styler'
caption_prefix = self._add_item(ref_id, ref_type, prefix_pattern)
styler = table if isinstance(table, Styler) else getattr(table, 'style', None) styler = table if isinstance(table, Styler) else getattr(table, 'style', None)
assert isinstance(styler, Styler), 'Jinja2 package is required for rendering tables' assert isinstance(styler, Styler), 'Jinja2 package is required for rendering tables'
def render_to_html() -> str: def render_to_html() -> str:
caption_prefix = self._add_item(ref_id, ref_type, prefix_pattern)
html_string = styler.to_html(table_uuid=ref_id, caption=caption_prefix + escape_html(caption)) html_string = styler.to_html(table_uuid=ref_id, caption=caption_prefix + escape_html(caption))
return re.sub(r'<style.*?>.*?</style>', '', html_string, flags=re.DOTALL) return re.sub(r'<style.*?>.*?</style>', '', html_string, flags=re.DOTALL)
def render_to_latex() -> str: def render_to_latex() -> str:
self._add_item(ref_id, ref_type, prefix_pattern)
ref_label = latex.normalize_label_text(ref_type + ':' + ref_id)
if self._table_renderer == 'pandas': if self._table_renderer == 'pandas':
return styler.to_latex( return styler.to_latex(
label=latex.normalize_label_text(ref_type + ':' + ref_id), label=ref_label,
hrules=True, hrules=True,
convert_css=True, convert_css=True,
siunitx=True, siunitx=True,
caption=latex.escape_text(caption), caption=latex.escape_text(caption),
position_float='centering' if centered else None) position_float='centering' if centered else None)
else: else:
return latex.render_pandas_styler_table(styler, caption, ref_type + ':' + ref_id, centered) return latex.render_pandas_styler_table(styler, caption, ref_label, centered)
self._doc.append([render_to_html, render_to_latex]) self._doc.append([render_to_html, render_to_latex])
@ -476,21 +579,14 @@ class DocumentWriter():
ref_id: If provided, the equation is displayed with ref_id: If provided, the equation is displayed with
a number and can be referenced by the ref_id a number and can be referenced by the ref_id
""" """
caption = self._add_item(ref_id, ref_type, '({})')
def render_to_html() -> str: def render_to_html() -> str:
fig = latex_to_figure(latex_equation) caption = self._add_item(ref_id, ref_type, '({})')
return ('<div class="equation-container"><div class="equation">%s</div>' return self._get_equation_html(latex_equation, caption)
'<div class="equation-number">%s</div></div>') % (
figure_to_string(fig, self._figure_format, base64=self._base64_svgs),
caption)
def render_to_latex() -> str: def render_to_latex() -> str:
if ref_id: self._add_item(ref_id, ref_type, '')
return '\\begin{equation}\\label{%s:%s}%s\\end{equation}' % ( return latex.get_equation_code(latex_equation, ref_type, ref_id)
ref_type, ref_id, latex_equation)
else:
return '\\[%s\\]' % latex_equation
self._doc.append([render_to_html, render_to_latex]) self._doc.append([render_to_html, render_to_latex])
@ -505,14 +601,16 @@ class DocumentWriter():
norm_text = _normalize_text_indent(str(text)) norm_text = _normalize_text_indent(str(text))
def render_to_html() -> str: def render_to_html() -> str:
html = _markdown_to_html(norm_text) html = self._html_post_processing(_markdown_to_html(self._equation_embedding_reescaping(norm_text)))
if section_class: if section_class:
return '<div class="' + section_class + '">' + html + '</div>' return '<div class="' + section_class + '">' + html + '</div>'
else: else:
return html return html
def render_to_latex() -> str: def render_to_latex() -> str:
return latex.from_html(render_to_html()) html = _markdown_to_html(
self._equation_embedding_reescaping(norm_text))
return latex.from_html(html)
self._doc.append([render_to_html, render_to_latex]) self._doc.append([render_to_html, render_to_latex])

View File

@ -1,6 +1,5 @@
import bs4
from html.parser import HTMLParser from html.parser import HTMLParser
from typing import Iterator, Generator, Any from typing import Generator, Any
from pandas.io.formats.style import Styler from pandas.io.formats.style import Styler
import re import re
import os import os
@ -72,7 +71,7 @@ def escape_text(text: str) -> str:
for m in re.finditer(regex_filter, text): for m in re.finditer(regex_filter, text):
s1, s2 = m.span() s1, s2 = m.span()
ret.append(text[last_s:s1]) ret.append(text[last_s:s1])
matches = [v for k, v in LaTeX_translation.items() if re.match(k, m.group())] matches = [v for k, v in latex_translation.items() if re.match(k, m.group())]
if m.group(1): if m.group(1):
ret.append(matches[0].replace(r'\g<1>', normalize_label_text(m.group(1)))) ret.append(matches[0].replace(r'\g<1>', normalize_label_text(m.group(1))))
else: else:
@ -83,6 +82,25 @@ def escape_text(text: str) -> str:
return ''.join(ret) return ''.join(ret)
def get_equation_code(equation: str, ref_id: str, ref_type: str, block: bool = False) -> str:
    """
    Wraps an equation string in the matching LaTeX math environment.

    Args:
        equation: The LaTeX equation string.
        ref_id: The reference ID for the equation. Only used for block
            equations; if empty, the block equation gets no label.
        ref_type: The type of reference (e.g., 'eq', 'fig', etc.).
        block: If True, a display equation is produced, otherwise an
            inline equation.

    Returns:
        The LaTeX code: inline math for non-block equations, an equation
        environment with a label for block equations with a ref_id, and
        unnumbered display math for block equations without one.
    """
    if not block:
        return '\\(%s\\)' % equation
    if not ref_id:
        return '\\[%s\\]' % equation

    label_type = normalize_label_text(ref_type)
    label_id = normalize_label_text(ref_id)
    return '\\begin{equation}\\label{%s:%s}%s\\end{equation}' % (
        label_type, label_id, equation)
def render_pandas_styler_table(df_style: Styler, caption: str = '', label: str = '', centering: bool = True) -> str: def render_pandas_styler_table(df_style: Styler, caption: str = '', label: str = '', centering: bool = True) -> str:
""" """
Converts a pandas Styler object to LaTeX table. Converts a pandas Styler object to LaTeX table.
@ -132,63 +150,6 @@ def render_pandas_styler_table(df_style: Styler, caption: str = '', label: str =
return ''.join(str_list) return ''.join(str_list)
def from_html_old(html_code: str) -> str:
    """
    Converts HTML code to LaTeX code.

    The HTML is parsed with BeautifulSoup and walked recursively. Known tags
    are replaced by a LaTeX prefix/suffix pair, tables are converted to a
    tabular environment and all text is escaped with escape_text. Unknown
    tags contribute only their content.

    Args:
        html_code: The HTML code to convert.

    Returns:
        The LaTeX code.
    """
    root = bs4.BeautifulSoup(html_code, 'html.parser')

    # Tag name -> (LaTeX code before the content, LaTeX code after it)
    html_to_latex = {
        'strong': ('\\textbf{', '}'),
        'b': ('\\textbf{', '}'),
        'em': ('\\emph{', '}'),
        'i': ('\\emph{', '}'),
        'p': ('', '\n\n'),
        'h1': ('\\section{', '}'),
        'h2': ('\\subsection{', '}'),
        'h3': ('\\subsubsection{', '}'),
        'ul': ('\\begin{itemize}', '\\end{itemize}'),
        'ol': ('\\begin{enumerate}', '\\end{enumerate}'),
        'li': ('\\item ', ''),
        'latex_eq': ('\\[', '\\]'),
    }

    def handle_table(table: bs4.element.Tag) -> str:
        rows = table.find_all('tr')
        if not rows:
            # No rows: emit nothing rather than an \end{tabular}
            # without a matching \begin{tabular}.
            return ''
        parts: list[str] = []
        for index, row in enumerate(rows):
            assert isinstance(row, bs4.element.Tag), 'HTML table not valid'
            cells = row.find_all(['th', 'td'])
            if index == 0:
                # The first row defines the number of columns.
                parts.append("\\begin{tabular}{|" + "|".join(['l'] * len(cells)) + "|}\\toprule\n")
            # Every row, including the first one, contributes its cells.
            parts.append(" & ".join(escape_text(cell.get_text(strip=True)) for cell in cells) + " \\\\\n")
        parts.append("\\bottomrule\n\\end{tabular}")
        return ''.join(parts)

    def parse_node(element: bs4.element.Tag) -> Iterator[str]:
        prefix, post = html_to_latex.get(element.name, ('', ''))
        yield prefix
        for c in element.children:
            if isinstance(c, bs4.element.Tag):
                if c.name == 'table':
                    yield handle_table(c)
                else:
                    yield from parse_node(c)
            else:
                yield escape_text(c.text)
        yield post

    return ''.join(parse_node(root))
def from_html(html_code: str) -> str: def from_html(html_code: str) -> str:
""" """
Converts HTML code to LaTeX code using HTMLParser. Converts HTML code to LaTeX code using HTMLParser.
@ -221,8 +182,11 @@ def from_html(html_code: str) -> str:
self.column_alignment = '' self.column_alignment = ''
self.midrule_flag = False self.midrule_flag = False
self.header_flag = False self.header_flag = False
self.attr_dict: dict[str, str] = {}
self.equation_flag = False
def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None: def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
self.attr_dict = {k: v if v else '' for k, v in attrs}
if tag in html_to_latex: if tag in html_to_latex:
prefix, _ = html_to_latex[tag] prefix, _ = html_to_latex[tag]
self.latex_code.append(prefix) self.latex_code.append(prefix)
@ -234,15 +198,18 @@ def from_html(html_code: str) -> str:
elif tag == 'tr': elif tag == 'tr':
self.column_alignment = '' self.column_alignment = ''
elif tag in ['th', 'td']: elif tag in ['th', 'td']:
style = [v for k, v in attrs if k == 'style'] if 'right' in self.attr_dict.get('style', ''):
if style and style[0] and 'right' in style[0]:
self.column_alignment += 'r' self.column_alignment += 'r'
else: else:
self.column_alignment += 'l' self.column_alignment += 'l'
elif tag == 'a': elif tag == 'a':
href = [v for k, v in attrs if k == 'href'] href = self.attr_dict.get('href')
assert href, 'Link href attribute is missing' assert href, 'Link href attribute is missing'
self.latex_code.append(f"\\href{{{href[0]}}}{{") self.latex_code.append(f"\\href{{{href}}}{{")
elif tag == 'hr':
self.latex_code.append("\n\n\\noindent\\rule[0.5ex]{\\linewidth}{1pt}\n\n")
elif tag == 'latex':
self.equation_flag = True
def handle_endtag(self, tag: str) -> None: def handle_endtag(self, tag: str) -> None:
if tag in html_to_latex: if tag in html_to_latex:
@ -266,9 +233,16 @@ def from_html(html_code: str) -> str:
self.latex_code.append(" & ") self.latex_code.append(" & ")
elif tag == 'a': elif tag == 'a':
self.latex_code.append("}") self.latex_code.append("}")
elif tag == 'latex':
self.equation_flag = False
def handle_data(self, data: str) -> None: def handle_data(self, data: str) -> None:
if data.strip(): if self.equation_flag:
block = self.attr_dict.get('type') == 'block'
ref_id = self.attr_dict.get('ref_id', '')
ref_type = self.attr_dict.get('ref_type', 'eq')
self.latex_code.append(get_equation_code(data, ref_id, ref_type, block))
elif data.strip():
self.latex_code.append(escape_text(data)) self.latex_code.append(escape_text(data))
parser = LaTeXHTMLParser() parser = LaTeXHTMLParser()