diff --git a/src/pyladoc/__init__.py b/src/pyladoc/__init__.py index d01ab4b..0b64d25 100644 --- a/src/pyladoc/__init__.py +++ b/src/pyladoc/__init__.py @@ -6,7 +6,8 @@ import re import io from . import latex import pkgutil - +from html.parser import HTMLParser +from io import StringIO HTML_OUTPUT = 0 LATEX_OUTPUT = 1 @@ -53,23 +54,8 @@ def _get_pkgutil_string(path: str) -> str: def _markdown_to_html(text: str) -> str: prep_text = re.sub(r'\u00A0', ' ', text) # non-breaking space - html = markdown.markdown(prep_text, extensions=['tables', 'fenced_code', 'def_list', 'abbr', 'sane_lists']) - return html.replace('
', '
') - - -def escape_html(text: str) -> str: - """ - Escapes special HTML characters in a given string. - - Args: - text: The text to escape - - Returns: - Escaped text save for inserting into HTML code - """ - ret = re.sub(r'\u00A0', ' ', text) # non-breaking space - ret = html.escape(ret) - return ' '.join(ret.strip().splitlines()) + html_text = markdown.markdown(prep_text, extensions=['tables', 'fenced_code', 'def_list', 'abbr', 'sane_lists']) + return html_text def _clean_svg(svg_text: str) -> str: @@ -140,6 +126,21 @@ def _save_figure(fig: Figure, buff: io.BytesIO, figure_format: FFormat, font_fam fig.set_size_inches(old_size, None, False) +def escape_html(text: str) -> str: + """ + Escapes special HTML characters in a given string. + + Args: + text: The text to escape + + Returns: + Escaped text save for inserting into HTML code + """ + ret = re.sub(r'\u00A0', ' ', text) # non-breaking space + ret = html.escape(ret) + return ' '.join(ret.strip().splitlines()) + + def figure_to_string(fig: Figure, figure_format: FFormat = 'svg', font_family: str | None = None, @@ -305,6 +306,105 @@ class DocumentWriter(): self._item_count[ref_type] = current_index return caption_prefix.format(current_index) + def _equation_embedding_reescaping(self, text: str) -> str: + """ + Convert $$-escaping of LaTeX blocks and inline expressions + to a HTML-style format: .... + """ + block_pattern = re.compile( + r'(^|\n)\s*\$\$\s*\n' # start delimiter on a line on its own + r'(?P.*?)' # capture block content non-greedily + r'\n\s*\$\$\s*(\n|$)', # end delimiter on a line on its own + re.DOTALL | re.MULTILINE + ) + + def block_repl(match: re.Match[str]) -> str: + content = match.group("content").strip() + latex_label: str = '' + + label_pattern = re.compile(r'^\\label\{([^}]+)\}\s*\n?') + label_match = label_pattern.match(content) + if label_match: + latex_label = label_match.group(1) + # Remove the label command from the content. + content = content[label_match.end():].lstrip() + + if latex_label and ':' in latex_label: + parts = latex_label.split(':') + ref_type = parts[0] + ref_id = parts[1] + caption = self._add_item(ref_id, ref_type, '({})') + return (f'\n{content}\n') + else: + return f'\n{content}\n' + + result = block_pattern.sub(block_repl, text) + + inline_pattern = re.compile(r'\$\$(.+?)\$\$') + + def inline_repl(match: re.Match[str]) -> str: + content = match.group(1) + return f'{content}' + + return inline_pattern.sub(inline_repl, result) + + def _get_equation_html(self, latex_equation: str, caption: str, block: bool = False) -> str: + fig = latex_to_figure(latex_equation) + if block: + ret = ('
' + '
%s
' + '
%s
') % ( + figure_to_string(fig, self._figure_format, base64=self._base64_svgs), + caption) + else: + ret = '' + figure_to_string(fig, self._figure_format, base64=self._base64_svgs) + '' + + plt.close(fig) + return ret + + def _html_post_processing(self, html_code: str) -> str: + """ + """ + class HTMLPostProcessor(HTMLParser): + def __init__(self, document_writer: 'DocumentWriter') -> None: + super().__init__() + self.modified_html = StringIO() + self.in_latex: bool = False + self.eq_caption: str = '' + self.block: bool = False + self.dw = document_writer + + def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None: + if tag == 'hr': + self.modified_html.write(f"<{tag}>") + elif tag == 'latex': + self.in_latex = True + attr_dict = {k: v if v else '' for k, v in attrs} + self.eq_caption = attr_dict.get('caption', '') + self.block = attr_dict.get('type') == 'block' + elif not self.in_latex: + tag_text = self.get_starttag_text() + if tag_text: + self.modified_html.write(tag_text) + + def handle_data(self, data: str) -> None: + if self.in_latex: + self.modified_html.write( + self.dw._get_equation_html(data, self.eq_caption, self.block)) + else: + self.modified_html.write(data) + + def handle_endtag(self, tag: str) -> None: + if tag == 'latex': + self.in_latex = False + else: + self.modified_html.write(f"") + + parser = HTMLPostProcessor(self) + parser.feed(html_code) + return parser.modified_html.getvalue() + def new_field(self, name: str) -> 'DocumentWriter': new_dwr = _create_document_writer() self._fields[name] = new_dwr @@ -318,7 +418,7 @@ class DocumentWriter(): centered: bool = True) -> None: """ Adds a diagram to the document. - + Args: fig: The figure to add (matplotlib figure) caption: The caption for the figure @@ -329,14 +429,15 @@ class DocumentWriter(): has an individual numbering centered: Whether to center the figure in LaTeX output """ - caption_prefix = self._add_item(ref_id, ref_type, prefix_pattern) - + def render_to_html() -> str: + caption_prefix = self._add_item(ref_id, ref_type, prefix_pattern) return '
%s%s
' % ( figure_to_string(fig, self._figure_format, base64=self._base64_svgs, scale=self._fig_scale), '
' + caption_prefix + escape_html(caption) if caption else '') def render_to_latex() -> str: + self._add_item(ref_id, ref_type, prefix_pattern) return '\\begin{figure}%s\n%s\n\\caption{%s}\n%s\\end{figure}' % ( '\n\\centering' if centered else '', figure_to_string(fig, 'pgf', self._font_family, scale=self._fig_scale), @@ -361,25 +462,27 @@ class DocumentWriter(): centered: Whether to center the table in LaTeX output """ assert Table and isinstance(table, Table), 'Table has to be a pandas DataFrame oder DataFrame Styler' - caption_prefix = self._add_item(ref_id, ref_type, prefix_pattern) styler = table if isinstance(table, Styler) else getattr(table, 'style', None) assert isinstance(styler, Styler), 'Jinja2 package is required for rendering tables' def render_to_html() -> str: + caption_prefix = self._add_item(ref_id, ref_type, prefix_pattern) html_string = styler.to_html(table_uuid=ref_id, caption=caption_prefix + escape_html(caption)) return re.sub(r'.*?', '', html_string, flags=re.DOTALL) def render_to_latex() -> str: + self._add_item(ref_id, ref_type, prefix_pattern) + ref_label = latex.normalize_label_text(ref_type + ':' + ref_id) if self._table_renderer == 'pandas': return styler.to_latex( - label=latex.normalize_label_text(ref_type + ':' + ref_id), + label=ref_label, hrules=True, convert_css=True, siunitx=True, caption=latex.escape_text(caption), position_float='centering' if centered else None) else: - return latex.render_pandas_styler_table(styler, caption, ref_type + ':' + ref_id, centered) + return latex.render_pandas_styler_table(styler, caption, ref_label, centered) self._doc.append([render_to_html, render_to_latex]) @@ -476,21 +579,14 @@ class DocumentWriter(): ref_id: If provided, the equation is displayed with a number and can be referenced by the ref_id """ - caption = self._add_item(ref_id, ref_type, '({})') def render_to_html() -> str: - fig = latex_to_figure(latex_equation) - return ('
%s
' - '
%s
') % ( - figure_to_string(fig, self._figure_format, base64=self._base64_svgs), - caption) + caption = self._add_item(ref_id, ref_type, '({})') + return self._get_equation_html(latex_equation, caption) def render_to_latex() -> str: - if ref_id: - return '\\begin{equation}\\label{%s:%s}%s\\end{equation}' % ( - ref_type, ref_id, latex_equation) - else: - return '\\[%s\\]' % latex_equation + self._add_item(ref_id, ref_type, '') + return latex.get_equation_code(latex_equation, ref_type, ref_id) self._doc.append([render_to_html, render_to_latex]) @@ -505,14 +601,16 @@ class DocumentWriter(): norm_text = _normalize_text_indent(str(text)) def render_to_html() -> str: - html = _markdown_to_html(norm_text) + html = self._html_post_processing(_markdown_to_html(self._equation_embedding_reescaping(norm_text))) if section_class: return '
' + html + '
' else: return html def render_to_latex() -> str: - return latex.from_html(render_to_html()) + html = _markdown_to_html( + self._equation_embedding_reescaping(norm_text)) + return latex.from_html(html) self._doc.append([render_to_html, render_to_latex]) diff --git a/src/pyladoc/latex.py b/src/pyladoc/latex.py index 62ef193..ae9f369 100644 --- a/src/pyladoc/latex.py +++ b/src/pyladoc/latex.py @@ -1,6 +1,5 @@ -import bs4 from html.parser import HTMLParser -from typing import Iterator, Generator, Any +from typing import Generator, Any from pandas.io.formats.style import Styler import re import os @@ -72,7 +71,7 @@ def escape_text(text: str) -> str: for m in re.finditer(regex_filter, text): s1, s2 = m.span() ret.append(text[last_s:s1]) - matches = [v for k, v in LaTeX_translation.items() if re.match(k, m.group())] + matches = [v for k, v in latex_translation.items() if re.match(k, m.group())] if m.group(1): ret.append(matches[0].replace(r'\g<1>', normalize_label_text(m.group(1)))) else: @@ -83,6 +82,25 @@ def escape_text(text: str) -> str: return ''.join(ret) +def get_equation_code(equation: str, ref_id: str, ref_type: str, block: bool = False) -> str: + """ + Converts an equation string to LaTeX code. + + Args: + equation: The LaTeX equation string. + ref_id: The reference ID for the equation. + ref_type: The type of reference (e.g., 'eq', 'fig', etc.). + """ + if block: + if ref_id: + return '\\begin{equation}\\label{%s:%s}%s\\end{equation}' % ( + normalize_label_text(ref_type), normalize_label_text(ref_id), equation) + else: + return '\\[%s\\]' % equation + else: + return '\\(%s\\)' % equation + + def render_pandas_styler_table(df_style: Styler, caption: str = '', label: str = '', centering: bool = True) -> str: """ Converts a pandas Styler object to LaTeX table. @@ -132,63 +150,6 @@ def render_pandas_styler_table(df_style: Styler, caption: str = '', label: str = return ''.join(str_list) -def from_html_old(html_code: str) -> str: - """ - Converts HTML code to LaTeX code. - - Args: - html_code: The HTML code to convert. - - Returns: - The LaTeX code. - """ - root = bs4.BeautifulSoup(html_code, 'html.parser') - - html_to_latex = { - 'strong': ('\\textbf{', '}'), - 'b': ('\\textbf{', '}'), - 'em': ('\\emph{', '}'), - 'i': ('\\emph{', '}'), - 'p': ('', '\n\n'), - 'h1': ('\\section{', '}'), - 'h2': ('\\subsection{', '}'), - 'h3': ('\\subsubsection{', '}'), - 'ul': ('\\begin{itemize}', '\\end{itemize}'), - 'ol': ('\\begin{enumerate}', '\\end{enumerate}'), - 'li': ('\\item ', ''), - 'latex_eq': ('\\[', '\\]'), - } - - def handle_table(table: bs4.element.Tag) -> str: - rows = table.find_all('tr') - latex_table: str = '' - for row in rows: - assert isinstance(row, bs4.element.Tag), 'HTML table not valid' - cells = row.find_all(['th', 'td']) - if not latex_table: - latex_table = "\\begin{tabular}{|" + "|".join(['l'] * len(cells)) + "|}\\toprule\n" - else: - latex_table += " & ".join(escape_text(cell.get_text(strip=True)) for cell in cells) + " \\\\\n" - latex_table += "\\bottomrule\n\\end{tabular}" - return latex_table - - def parse_node(element: bs4.element.Tag) -> Iterator[str]: - prefix, post = html_to_latex.get(element.name, ('', '')) - yield prefix - - for c in element.children: - if isinstance(c, bs4.element.Tag): - if c.name == 'table': - yield handle_table(c) - else: - yield from parse_node(c) - else: - yield escape_text(c.text) - yield post - - return ''.join(parse_node(root)) - - def from_html(html_code: str) -> str: """ Converts HTML code to LaTeX code using HTMLParser. @@ -221,8 +182,11 @@ def from_html(html_code: str) -> str: self.column_alignment = '' self.midrule_flag = False self.header_flag = False + self.attr_dict: dict[str, str] = {} + self.equation_flag = False def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None: + self.attr_dict = {k: v if v else '' for k, v in attrs} if tag in html_to_latex: prefix, _ = html_to_latex[tag] self.latex_code.append(prefix) @@ -234,15 +198,18 @@ def from_html(html_code: str) -> str: elif tag == 'tr': self.column_alignment = '' elif tag in ['th', 'td']: - style = [v for k, v in attrs if k == 'style'] - if style and style[0] and 'right' in style[0]: + if 'right' in self.attr_dict.get('style', ''): self.column_alignment += 'r' else: self.column_alignment += 'l' elif tag == 'a': - href = [v for k, v in attrs if k == 'href'] + href = self.attr_dict.get('href') assert href, 'Link href attribute is missing' - self.latex_code.append(f"\\href{{{href[0]}}}{{") + self.latex_code.append(f"\\href{{{href}}}{{") + elif tag == 'hr': + self.latex_code.append("\n\n\\noindent\\rule[0.5ex]{\\linewidth}{1pt}\n\n") + elif tag == 'latex': + self.equation_flag = True def handle_endtag(self, tag: str) -> None: if tag in html_to_latex: @@ -266,9 +233,16 @@ def from_html(html_code: str) -> str: self.latex_code.append(" & ") elif tag == 'a': self.latex_code.append("}") + elif tag == 'latex': + self.equation_flag = False def handle_data(self, data: str) -> None: - if data.strip(): + if self.equation_flag: + block = self.attr_dict.get('type') == 'block' + ref_id = self.attr_dict.get('ref_id', '') + ref_type = self.attr_dict.get('ref_type', 'eq') + self.latex_code.append(get_equation_code(data, ref_id, ref_type, block)) + elif data.strip(): self.latex_code.append(escape_text(data)) parser = LaTeXHTMLParser()