From 44e192f275cb3cd13a9990d973da73b534ee5e91 Mon Sep 17 00:00:00 2001 From: Nicolas Kruse Date: Mon, 19 May 2025 11:15:42 +0200 Subject: [PATCH] renumbering of SVG-ids added to match HTML specification --- src/pyladoc/__init__.py | 59 +++++++++++++++++++++--------------- src/pyladoc/svg_tools.py | 38 +++++++++++++++++++++++ tests/document_validation.py | 7 ++--- 3 files changed, 75 insertions(+), 29 deletions(-) create mode 100644 src/pyladoc/svg_tools.py diff --git a/src/pyladoc/__init__.py b/src/pyladoc/__init__.py index 3f3c658..e30d379 100644 --- a/src/pyladoc/__init__.py +++ b/src/pyladoc/__init__.py @@ -8,6 +8,7 @@ from . import latex import pkgutil from html.parser import HTMLParser from io import StringIO +from . import svg_tools HTML_OUTPUT = 0 LATEX_OUTPUT = 1 @@ -58,17 +59,6 @@ def _markdown_to_html(text: str) -> str: return html_text -def _clean_svg(svg_text: str) -> str: - # remove all tags not alllowd for inline svg from metadata: - svg_text = re.sub(r'.*?', '', svg_text, flags=re.DOTALL) - - # remove illegal path-tags without d attribute: - return re.sub(r']*\sd=)\s.*?/>', '', svg_text, flags=re.DOTALL) - -# def _get_templ_vars(template: str) -> list[str]: -# return re.findall(".*?", template, re.DOTALL) - - def _drop_indent(text: str, amount: int) -> str: """ Drops a specific number of indentation spaces from a multiline text. @@ -142,6 +132,7 @@ def escape_html(text: str) -> str: def figure_to_string(fig: Figure, + unique_id: str, figure_format: FFormat = 'svg', font_family: str | None = None, scale: float = 1, @@ -175,7 +166,7 @@ def figure_to_string(fig: Figure, elif figure_format == 'svg' and not base64: i = buff.read(2028).find(b'{content}\n') + return (f'{content}') else: - return f'\n{content}\n' + return f'{content}' result = block_pattern.sub(block_repl, text) @@ -345,19 +336,19 @@ class DocumentWriter(): def inline_repl(match: re.Match[str]) -> str: content = match.group(1) return f'{content}' - + return inline_pattern.sub(inline_repl, result) def _get_equation_html(self, latex_equation: str, caption: str, reference: str, block: bool = False) -> str: fig = latex_to_figure(latex_equation) if block: - fig_str = figure_to_string(fig, self._figure_format, base64=self._base64_svgs) + fig_str = figure_to_string(fig, reference, self._figure_format, base64=self._base64_svgs) ret = ('
' f'
{fig_str}
' f'
{caption}
') else: - ret = '' + figure_to_string(fig, self._figure_format, base64=self._base64_svgs) + '' + ret = '' + figure_to_string(fig, reference, self._figure_format, base64=self._base64_svgs) + '' plt.close(fig) return ret @@ -373,34 +364,54 @@ class DocumentWriter(): self.eq_caption: str = '' self.reference: str = '' self.block: bool = False + self.p_tags: int = 0 self.dw = document_writer + self.latex_count = 0 + self.self_closing = False def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None: if tag == 'hr': self.modified_html.write(f"<{tag}>") + self.self_closing = True elif tag == 'latex': self.in_latex = True attr_dict = {k: v if v else '' for k, v in attrs} self.eq_caption = attr_dict.get('caption', '') - self.reference = attr_dict.get('reference', '') + if 'reference' in attr_dict: + self.reference = attr_dict['reference'] + else: + self.latex_count += 1 + self.reference = f"auto_id_{self.latex_count}" self.block = attr_dict.get('type') == 'block' elif not self.in_latex: tag_text = self.get_starttag_text() + self.self_closing = tag_text.endswith('/>') if tag_text: self.modified_html.write(tag_text) + if tag == 'p': + self.p_tags += 1 def handle_data(self, data: str) -> None: if self.in_latex: - self.modified_html.write( - self.dw._get_equation_html(data, self.eq_caption, self.reference, self.block)) + eq_html = self.dw._get_equation_html(data, self.eq_caption, self.reference, self.block) + if self.p_tags > 0 and self.block: + # If a block equation (with divs) is inside a p tag: close and reopen it + self.modified_html.write(f"

{eq_html}

") + else: + self.modified_html.write(eq_html) + else: self.modified_html.write(data) def handle_endtag(self, tag: str) -> None: if tag == 'latex': self.in_latex = False + elif self.self_closing: + self.self_closing = False else: self.modified_html.write(f"") + if tag == 'p' and self.p_tags > 0: + self.p_tags -= 1 parser = HTMLPostProcessor(self) parser.feed(html_code) @@ -435,14 +446,14 @@ class DocumentWriter(): caption_prefix, reference = self._add_item(ref_id, ref_type, prefix_pattern) return '

%s%s
' % ( reference, - figure_to_string(fig, self._figure_format, base64=self._base64_svgs, scale=self._fig_scale), + figure_to_string(fig, reference, self._figure_format, base64=self._base64_svgs, scale=self._fig_scale), '
' + caption_prefix + escape_html(caption) if caption else '') def render_to_latex() -> str: _, reference = self._add_item(ref_id, ref_type, prefix_pattern) return '\\begin{figure}%s\n%s\n\\caption{%s}\n%s\\end{figure}' % ( '\n\\centering' if centered else '', - figure_to_string(fig, 'pgf', self._font_family, scale=self._fig_scale), + figure_to_string(fig, reference, 'pgf', self._font_family, scale=self._fig_scale), latex.escape_text(caption), '\\label{%s}\n' % latex.normalize_label_text(reference) if ref_id else '') @@ -603,7 +614,7 @@ class DocumentWriter(): norm_text = _normalize_text_indent(str(text)) def render_to_html() -> str: - html = self._html_post_processing(_markdown_to_html(self._equation_embedding_reescaping(norm_text))) + html = _markdown_to_html(self._equation_embedding_reescaping(norm_text)) if section_class: return '
' + html + '
' else: @@ -637,7 +648,7 @@ class DocumentWriter(): self._base64_svgs = base64_svgs self._fig_scale = figure_scale - return _fillin_reference_names(self._render_doc(HTML_OUTPUT), self._item_index) + return self._html_post_processing(_fillin_reference_names(self._render_doc(HTML_OUTPUT), self._item_index)) def to_latex(self, font_family: Literal[None, 'serif', 'sans-serif'] = None, table_renderer: TRenderer = 'simple', figure_scale: float = 1) -> str: diff --git a/src/pyladoc/svg_tools.py b/src/pyladoc/svg_tools.py new file mode 100644 index 0000000..35eda24 --- /dev/null +++ b/src/pyladoc/svg_tools.py @@ -0,0 +1,38 @@ +import xml.etree.ElementTree as ET +import re +from re import Match + +def update_svg_ids(input_svg: str, unique_id: str) -> str: + """Add a unique ID part to all svg IDs and update references ti these IDs""" + id_mapping: dict[str, str] = {} + + def update_ids(match: Match[str]) -> str: + old_id = match.group(1) + new_id = f"svg-{unique_id}-{old_id}" + id_mapping[old_id] = new_id + return f' id="{new_id}"' + + def update_references(match: Match[str]) -> str: + old_ref = match.group(1) + new_ref = id_mapping.get(old_ref, old_ref) + if match.group(0).startswith('xlink:href'): + return f'xlink:href="#{new_ref}"' + else: + return f'url(#{new_ref})' + + # Update IDs + svg_string = re.sub(r'\sid="(.*?)"', update_ids, input_svg) + + # Update references to IDs + svg_string = re.sub(r'url\(#([^\)]+)\)', update_references, svg_string) + svg_string = re.sub(r'xlink:href="#([^\"]+)"', update_references, svg_string) + + return svg_string + + +def clean_svg(svg_text: str) -> str: + # remove all tags not alllowd for inline svg from metadata: + svg_text = re.sub(r'.*?', '', svg_text, flags=re.DOTALL) + + # remove illegal path-tags without d attribute: + return re.sub(r']*\sd=)\s.*?/>', '', svg_text, flags=re.DOTALL) \ No newline at end of file diff --git a/tests/document_validation.py b/tests/document_validation.py index fe9eb80..01ba2a2 100644 --- a/tests/document_validation.py +++ b/tests/document_validation.py @@ -2,10 +2,7 @@ from typing import Generator, Any from lxml import etree from lxml.etree import _Element as EElement # type: ignore import requests - - -with open('src/pyladoc/templates/test_template.html', mode='rt', encoding='utf-8') as f: - html_test_template = f.read() +import pyladoc def add_line_numbers(multiline_string: str) -> str: @@ -53,7 +50,7 @@ def validate_html(html_string: str, validate_online: bool = False, check_for: li assert tag_type in tags, f"Tag {tag_type} not found in the html code" if validate_online: - test_page = html_test_template.replace('', html_string) + test_page = pyladoc.inject_to_template(html_string, internal_template='templates/test_template.html') validation_result = validate_html_with_w3c(test_page) assert 'messages' in validation_result, 'Validate request failed' if validation_result['messages']: