diff --git a/src/pyladoc/__init__.py b/src/pyladoc/__init__.py
index d01ab4b..0b64d25 100644
--- a/src/pyladoc/__init__.py
+++ b/src/pyladoc/__init__.py
@@ -6,7 +6,8 @@ import re
import io
from . import latex
import pkgutil
-
+from html.parser import HTMLParser
+from io import StringIO
HTML_OUTPUT = 0
LATEX_OUTPUT = 1
@@ -53,23 +54,8 @@ def _get_pkgutil_string(path: str) -> str:
def _markdown_to_html(text: str) -> str:
prep_text = re.sub(r'\u00A0', ' ', text) # non-breaking space
- html = markdown.markdown(prep_text, extensions=['tables', 'fenced_code', 'def_list', 'abbr', 'sane_lists'])
- return html.replace('
', '
')
-
-
-def escape_html(text: str) -> str:
- """
- Escapes special HTML characters in a given string.
-
- Args:
- text: The text to escape
-
- Returns:
- Escaped text save for inserting into HTML code
- """
- ret = re.sub(r'\u00A0', ' ', text) # non-breaking space
- ret = html.escape(ret)
- return ' '.join(ret.strip().splitlines())
+ html_text = markdown.markdown(prep_text, extensions=['tables', 'fenced_code', 'def_list', 'abbr', 'sane_lists'])
+ return html_text
def _clean_svg(svg_text: str) -> str:
@@ -140,6 +126,21 @@ def _save_figure(fig: Figure, buff: io.BytesIO, figure_format: FFormat, font_fam
fig.set_size_inches(old_size, None, False)
+def escape_html(text: str) -> str:
+ """
+ Escapes special HTML characters in a given string.
+
+ Args:
+ text: The text to escape
+
+ Returns:
+ Escaped text safe for inserting into HTML code
+ """
+ ret = re.sub(r'\u00A0', ' ', text) # non-breaking space
+ ret = html.escape(ret)
+ return ' '.join(ret.strip().splitlines()) # strip the ends and join all lines into a single line
+
+
def figure_to_string(fig: Figure,
figure_format: FFormat = 'svg',
font_family: str | None = None,
@@ -305,6 +306,105 @@ class DocumentWriter():
self._item_count[ref_type] = current_index
return caption_prefix.format(current_index)
+ def _equation_embedding_reescaping(self, text: str) -> str:
+     """
+     Convert $$-escaping of LaTeX blocks and inline expressions
+     to a HTML-style format: <latex>...</latex>.
+
+     A block equation has ``$$`` on a line of its own before and after the
+     content and becomes ``<latex type="block">``. If the block content
+     starts with ``\\label{type:id}``, the item is registered through
+     ``self._add_item`` and the tag additionally carries ``ref_type``,
+     ``ref_id`` and ``caption`` attributes. Every remaining single-line
+     ``$$...$$`` becomes an inline ``<latex>`` element.
+
+     Args:
+         text: Text that may contain $$-delimited equations
+
+     Returns:
+         The text with every $$-delimited equation wrapped in a latex tag
+     """
+     block_pattern = re.compile(
+         r'(^|\n)\s*\$\$\s*\n'  # start delimiter on a line on its own
+         r'(?P<content>.*?)'  # capture block content non-greedily
+         r'\n\s*\$\$\s*(\n|$)',  # end delimiter on a line on its own
+         re.DOTALL | re.MULTILINE
+     )
+
+     def block_repl(match: re.Match[str]) -> str:
+         content = match.group("content").strip()
+         latex_label: str = ''
+
+         label_pattern = re.compile(r'^\\label\{([^}]+)\}\s*\n?')
+         label_match = label_pattern.match(content)
+         if label_match:
+             latex_label = label_match.group(1)
+             # Remove the label command from the content.
+             content = content[label_match.end():].lstrip()
+
+         if latex_label and ':' in latex_label:
+             # Split on the first colon only, so an ID that itself contains
+             # a colon is not silently truncated.
+             ref_type, ref_id = latex_label.split(':', 1)
+             caption = self._add_item(ref_id, ref_type, '({})')
+             return (f'\n<latex type="block" ref_type="{ref_type}" ref_id="{ref_id}" '
+                     f'caption="{caption}">{content}</latex>\n')
+         else:
+             return f'\n<latex type="block">{content}</latex>\n'
+
+     # Blocks first: afterwards only inline $$...$$ pairs are left.
+     result = block_pattern.sub(block_repl, text)
+
+     inline_pattern = re.compile(r'\$\$(.+?)\$\$')
+
+     def inline_repl(match: re.Match[str]) -> str:
+         content = match.group(1)
+         return f'<latex>{content}</latex>'
+
+     return inline_pattern.sub(inline_repl, result)
+
+ def _get_equation_html(self, latex_equation: str, caption: str, block: bool = False) -> str:
+ """
+ Render a LaTeX equation to a figure and return it embedded in HTML code.
+
+ Args:
+ latex_equation: The equation source, passed to latex_to_figure
+ caption: Text formatted into the result together with the figure (block variant only)
+ block: Selects the block variant instead of the inline one
+
+ Returns:
+ HTML code containing the output of figure_to_string
+ """
+ fig = latex_to_figure(latex_equation)
+ if block:
+ # NOTE(review): the format string below is empty in this view although two values are
+ # supplied to it; the markup was presumably lost -- confirm against the repository.
+ ret = ('') % (
+ figure_to_string(fig, self._figure_format, base64=self._base64_svgs),
+ caption)
+ else:
+ # NOTE(review): the wrapper strings around the figure are empty in this view -- confirm.
+ ret = '' + figure_to_string(fig, self._figure_format, base64=self._base64_svgs) + ''
+
+ plt.close(fig) # close the figure once it has been converted to a string
+ return ret
+
+ def _html_post_processing(self, html_code: str) -> str:
+     """
+     Re-serialize HTML code and replace <latex> elements by rendered equations.
+
+     The code is run through an HTMLParser subclass that writes tags and
+     text back out. The text of a <latex> element is handed to
+     self._get_equation_html together with the element's 'caption'
+     attribute and whether its 'type' attribute equals 'block'; the <latex>
+     tags themselves are dropped. <hr> is always written as '<hr>'.
+
+     Args:
+         html_code: HTML code that may contain <latex> elements
+
+     Returns:
+         The processed HTML code
+     """
+     # Imported locally so that re-escaping text does not depend on a
+     # module-level name.
+     from html import escape as escape_text_node
+
+     class HTMLPostProcessor(HTMLParser):
+         def __init__(self, document_writer: 'DocumentWriter') -> None:
+             super().__init__()
+             self.modified_html = StringIO()
+             self.in_latex: bool = False
+             self.in_raw_text: bool = False  # inside <script> or <style>
+             self.eq_caption: str = ''
+             self.block: bool = False
+             self.dw = document_writer
+
+         def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
+             if tag == 'hr':
+                 self.modified_html.write(f"<{tag}>")
+             elif tag == 'latex':
+                 self.in_latex = True
+                 attr_dict = {k: v if v else '' for k, v in attrs}
+                 self.eq_caption = attr_dict.get('caption', '')
+                 self.block = attr_dict.get('type') == 'block'
+             elif not self.in_latex:
+                 # The parser does not decode character references inside
+                 # these elements, so their text must not be re-escaped.
+                 self.in_raw_text = tag in ('script', 'style')
+                 tag_text = self.get_starttag_text()
+                 if tag_text:
+                     self.modified_html.write(tag_text)
+
+         def handle_startendtag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
+             # The inherited implementation calls handle_starttag and then
+             # handle_endtag, which would append a bogus '</br>' or '</hr>'
+             # after a self-closing tag. Write such tags exactly once.
+             if tag == 'hr':
+                 self.modified_html.write(f"<{tag}>")
+             elif tag != 'latex' and not self.in_latex:
+                 tag_text = self.get_starttag_text()
+                 if tag_text:
+                     self.modified_html.write(tag_text)
+
+         def handle_data(self, data: str) -> None:
+             if self.in_latex:
+                 self.modified_html.write(
+                     self.dw._get_equation_html(data, self.eq_caption, self.block))
+             elif self.in_raw_text:
+                 self.modified_html.write(data)
+             else:
+                 # convert_charrefs has already turned '&lt;' / '&amp;' into
+                 # '<' / '&'; escape again so the output stays valid HTML.
+                 self.modified_html.write(escape_text_node(data, quote=False))
+
+         def handle_endtag(self, tag: str) -> None:
+             if tag == 'latex':
+                 self.in_latex = False
+             elif not self.in_latex:
+                 # Start tags inside <latex> are suppressed, so their end
+                 # tags are suppressed as well.
+                 self.in_raw_text = False
+                 self.modified_html.write(f"</{tag}>")
+
+     parser = HTMLPostProcessor(self)
+     parser.feed(html_code)
+     # Flush text the parser may still hold back, e.g. after a trailing '&'.
+     parser.close()
+     return parser.modified_html.getvalue()
+
def new_field(self, name: str) -> 'DocumentWriter':
new_dwr = _create_document_writer()
self._fields[name] = new_dwr
@@ -318,7 +418,7 @@ class DocumentWriter():
centered: bool = True) -> None:
"""
Adds a diagram to the document.
-
+
Args:
fig: The figure to add (matplotlib figure)
caption: The caption for the figure
@@ -329,14 +429,15 @@ class DocumentWriter():
has an individual numbering
centered: Whether to center the figure in LaTeX output
"""
- caption_prefix = self._add_item(ref_id, ref_type, prefix_pattern)
-
+
def render_to_html() -> str:
+ caption_prefix = self._add_item(ref_id, ref_type, prefix_pattern)
return '%s%s
' % (
figure_to_string(fig, self._figure_format, base64=self._base64_svgs, scale=self._fig_scale),
'
' + caption_prefix + escape_html(caption) if caption else '')
def render_to_latex() -> str:
+ self._add_item(ref_id, ref_type, prefix_pattern)
return '\\begin{figure}%s\n%s\n\\caption{%s}\n%s\\end{figure}' % (
'\n\\centering' if centered else '',
figure_to_string(fig, 'pgf', self._font_family, scale=self._fig_scale),
@@ -361,25 +462,27 @@ class DocumentWriter():
centered: Whether to center the table in LaTeX output
"""
assert Table and isinstance(table, Table), 'Table has to be a pandas DataFrame oder DataFrame Styler'
- caption_prefix = self._add_item(ref_id, ref_type, prefix_pattern)
styler = table if isinstance(table, Styler) else getattr(table, 'style', None)
assert isinstance(styler, Styler), 'Jinja2 package is required for rendering tables'
def render_to_html() -> str:
+ caption_prefix = self._add_item(ref_id, ref_type, prefix_pattern)
html_string = styler.to_html(table_uuid=ref_id, caption=caption_prefix + escape_html(caption))
return re.sub(r'.*?', '', html_string, flags=re.DOTALL)
def render_to_latex() -> str:
+ self._add_item(ref_id, ref_type, prefix_pattern)
+ ref_label = latex.normalize_label_text(ref_type + ':' + ref_id)
if self._table_renderer == 'pandas':
return styler.to_latex(
- label=latex.normalize_label_text(ref_type + ':' + ref_id),
+ label=ref_label,
hrules=True,
convert_css=True,
siunitx=True,
caption=latex.escape_text(caption),
position_float='centering' if centered else None)
else:
- return latex.render_pandas_styler_table(styler, caption, ref_type + ':' + ref_id, centered)
+ return latex.render_pandas_styler_table(styler, caption, ref_label, centered)
self._doc.append([render_to_html, render_to_latex])
@@ -476,21 +579,14 @@ class DocumentWriter():
ref_id: If provided, the equation is displayed with
a number and can be referenced by the ref_id
"""
- caption = self._add_item(ref_id, ref_type, '({})')
def render_to_html() -> str:
- fig = latex_to_figure(latex_equation)
- return ('') % (
- figure_to_string(fig, self._figure_format, base64=self._base64_svgs),
- caption)
+ caption = self._add_item(ref_id, ref_type, '({})')
+ return self._get_equation_html(latex_equation, caption, block=True)  # block form with caption, as the removed code produced
def render_to_latex() -> str:
- if ref_id:
- return '\\begin{equation}\\label{%s:%s}%s\\end{equation}' % (
- ref_type, ref_id, latex_equation)
- else:
- return '\\[%s\\]' % latex_equation
+ self._add_item(ref_id, ref_type, '')
+ return latex.get_equation_code(latex_equation, ref_id=ref_id, ref_type=ref_type, block=True)  # keywords: the helper takes ref_id before ref_type
self._doc.append([render_to_html, render_to_latex])
@@ -505,14 +601,16 @@ class DocumentWriter():
norm_text = _normalize_text_indent(str(text))
def render_to_html() -> str:
- html = _markdown_to_html(norm_text)
+ html = self._html_post_processing(_markdown_to_html(self._equation_embedding_reescaping(norm_text)))
if section_class:
return '' + html + '
'
else:
return html
def render_to_latex() -> str:
- return latex.from_html(render_to_html())
+ html = _markdown_to_html(
+ self._equation_embedding_reescaping(norm_text))
+ return latex.from_html(html)
self._doc.append([render_to_html, render_to_latex])
diff --git a/src/pyladoc/latex.py b/src/pyladoc/latex.py
index 62ef193..ae9f369 100644
--- a/src/pyladoc/latex.py
+++ b/src/pyladoc/latex.py
@@ -1,6 +1,5 @@
-import bs4
from html.parser import HTMLParser
-from typing import Iterator, Generator, Any
+from typing import Generator, Any
from pandas.io.formats.style import Styler
import re
import os
@@ -72,7 +71,7 @@ def escape_text(text: str) -> str:
for m in re.finditer(regex_filter, text):
s1, s2 = m.span()
ret.append(text[last_s:s1])
- matches = [v for k, v in LaTeX_translation.items() if re.match(k, m.group())]
+ matches = [v for k, v in latex_translation.items() if re.match(k, m.group())]
if m.group(1):
ret.append(matches[0].replace(r'\g<1>', normalize_label_text(m.group(1))))
else:
@@ -83,6 +82,25 @@ def escape_text(text: str) -> str:
return ''.join(ret)
+def get_equation_code(equation: str, ref_id: str, ref_type: str, block: bool = False) -> str:
+    """
+    Wrap a LaTeX equation string in the matching LaTeX math delimiters.
+
+    Args:
+        equation: The LaTeX equation string.
+        ref_id: The reference ID; if empty, a block equation gets no label.
+        ref_type: The type of reference (e.g., 'eq', 'fig', etc.).
+        block: True for a display equation, False for inline math.
+    """
+    if not block:
+        return '\\(%s\\)' % equation
+    if not ref_id:
+        return '\\[%s\\]' % equation
+    label_type = normalize_label_text(ref_type)
+    label_id = normalize_label_text(ref_id)
+    return '\\begin{equation}\\label{%s:%s}%s\\end{equation}' % (label_type, label_id, equation)
+
+
def render_pandas_styler_table(df_style: Styler, caption: str = '', label: str = '', centering: bool = True) -> str:
"""
Converts a pandas Styler object to LaTeX table.
@@ -132,63 +150,6 @@ def render_pandas_styler_table(df_style: Styler, caption: str = '', label: str =
return ''.join(str_list)
-def from_html_old(html_code: str) -> str:
- """
- Converts HTML code to LaTeX code.
-
- Args:
- html_code: The HTML code to convert.
-
- Returns:
- The LaTeX code.
- """
- root = bs4.BeautifulSoup(html_code, 'html.parser')
-
- html_to_latex = {
- 'strong': ('\\textbf{', '}'),
- 'b': ('\\textbf{', '}'),
- 'em': ('\\emph{', '}'),
- 'i': ('\\emph{', '}'),
- 'p': ('', '\n\n'),
- 'h1': ('\\section{', '}'),
- 'h2': ('\\subsection{', '}'),
- 'h3': ('\\subsubsection{', '}'),
- 'ul': ('\\begin{itemize}', '\\end{itemize}'),
- 'ol': ('\\begin{enumerate}', '\\end{enumerate}'),
- 'li': ('\\item ', ''),
- 'latex_eq': ('\\[', '\\]'),
- }
-
- def handle_table(table: bs4.element.Tag) -> str:
- rows = table.find_all('tr')
- latex_table: str = ''
- for row in rows:
- assert isinstance(row, bs4.element.Tag), 'HTML table not valid'
- cells = row.find_all(['th', 'td'])
- if not latex_table:
- latex_table = "\\begin{tabular}{|" + "|".join(['l'] * len(cells)) + "|}\\toprule\n"
- else:
- latex_table += " & ".join(escape_text(cell.get_text(strip=True)) for cell in cells) + " \\\\\n"
- latex_table += "\\bottomrule\n\\end{tabular}"
- return latex_table
-
- def parse_node(element: bs4.element.Tag) -> Iterator[str]:
- prefix, post = html_to_latex.get(element.name, ('', ''))
- yield prefix
-
- for c in element.children:
- if isinstance(c, bs4.element.Tag):
- if c.name == 'table':
- yield handle_table(c)
- else:
- yield from parse_node(c)
- else:
- yield escape_text(c.text)
- yield post
-
- return ''.join(parse_node(root))
-
-
def from_html(html_code: str) -> str:
"""
Converts HTML code to LaTeX code using HTMLParser.
@@ -221,8 +182,11 @@ def from_html(html_code: str) -> str:
self.column_alignment = ''
self.midrule_flag = False
self.header_flag = False
+ self.attr_dict: dict[str, str] = {}
+ self.equation_flag = False
def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
+ self.attr_dict = {k: v if v else '' for k, v in attrs}
if tag in html_to_latex:
prefix, _ = html_to_latex[tag]
self.latex_code.append(prefix)
@@ -234,15 +198,18 @@ def from_html(html_code: str) -> str:
elif tag == 'tr':
self.column_alignment = ''
elif tag in ['th', 'td']:
- style = [v for k, v in attrs if k == 'style']
- if style and style[0] and 'right' in style[0]:
+ if 'right' in self.attr_dict.get('style', ''):
self.column_alignment += 'r'
else:
self.column_alignment += 'l'
elif tag == 'a':
- href = [v for k, v in attrs if k == 'href']
+ href = self.attr_dict.get('href')
assert href, 'Link href attribute is missing'
- self.latex_code.append(f"\\href{{{href[0]}}}{{")
+ self.latex_code.append(f"\\href{{{href}}}{{")
+ elif tag == 'hr':
+ self.latex_code.append("\n\n\\noindent\\rule[0.5ex]{\\linewidth}{1pt}\n\n")
+ elif tag == 'latex':
+ self.equation_flag = True
def handle_endtag(self, tag: str) -> None:
if tag in html_to_latex:
@@ -266,9 +233,16 @@ def from_html(html_code: str) -> str:
self.latex_code.append(" & ")
elif tag == 'a':
self.latex_code.append("}")
+ elif tag == 'latex':
+ self.equation_flag = False
def handle_data(self, data: str) -> None:
- if data.strip():
+ if self.equation_flag:  # set by a 'latex' start tag, cleared by its end tag
+ block = self.attr_dict.get('type') == 'block'  # attr_dict holds the attributes of the latest start tag
+ ref_id = self.attr_dict.get('ref_id', '')
+ ref_type = self.attr_dict.get('ref_type', 'eq')
+ self.latex_code.append(get_equation_code(data, ref_id, ref_type, block))  # equation text is not passed through escape_text
+ elif data.strip():  # whitespace-only text is skipped
self.latex_code.append(escape_text(data))
parser = LaTeXHTMLParser()