commit 67c48776acb4d010162677ef829670d6967bb640 Author: Nicolas Date: Fri Mar 28 13:30:08 2025 +0100 first commit diff --git a/.flake8 b/.flake8 new file mode 100644 index 0000000..eb1eb4f --- /dev/null +++ b/.flake8 @@ -0,0 +1,21 @@ +[flake8] +# Specify the maximum allowed line length +max-line-length = 88 + +# Ignore specific rules +# For example, E501: Line too long, W503: Line break before binary operator +ignore = E501, W503, W504 + +# Exclude specific files or directories +exclude = + .git, + __pycache__, + build, + dist, + .conda + .venv + venv + +# Enable specific plugins or options +# Example: Enabling flake8-docstrings +select = C,E,F,W,D \ No newline at end of file diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..373d2df --- /dev/null +++ b/.gitignore @@ -0,0 +1,134 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +.pdm.toml + +# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +.venv/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +pyModbusTCP_old/ +test.py +test_*.ipynb +settings.json \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..68e35a8 --- /dev/null +++ b/README.md @@ -0,0 +1,111 @@ +# Pyladoc + +## Description +Pyladoc is a Python package for programmatically generating HTML and +PDF/LaTeX output. This package specifically targets applications where reports +or results with Pandas tables and Matplotlib figures are generated programmatically +to be displayed as a website and as a PDF document without any manual formatting +steps. + +This package focuses on the "Document in Code" approach for cases +where a lot of calculation and data handling is done but not a lot of +document text needs to be displayed. + +LaTeX is used as the backend for PDF generation. There are excellent engines for +rendering HTML to PDF available, but even when there is no requirement for +accurate typesetting, programmatically placing content of variable +composition and size on fixed-size pages without manual intervention +is a hard problem that LaTeX handles very well. 
+ +### Supported primitives +- Text (can be Markdown or HTML formatted) +- Headings +- Tables (Pandas, Markdown or HTML) +- Matplotlib figures +- LaTeX equations +- Named references for figures, tables and equations + +### Key Features +- HTML and PDF/LaTeX rendering of the same document +- Single file output including figures +- Figure and equation embedding in HTML by inline SVG, SVG in Base64 or PNG in Base64 +- Figure embedding in LaTeX as PGF/TikZ + +### Usage Scenarios +- Web services +- Report generation for lab equipment + +## Installation +It can be installed with pip: + +```bash +pip install pyladoc +``` + +## Usage +It is easy to use as the following example code shows: + +```python +import pyladoc + +doc = pyladoc.DocumentWriter() + +doc.add_markdown(""" + # Example + This is an example. The @table:pandas_example shows some random data. + """) + +some_data = { + 'Row1': ["Line1", "Line2", "Line3"], + 'Row2': [120, 100, 110], + 'Row3': ['12 g/km', '> 150 g/km', '110 g/km'] +} +df = pd.DataFrame(some_data) +doc.add_table(df, 'This is a pandas example table', 'pandas_example') + +html_code = doc.to_html() + +doc.to_pdf('test.pdf') +``` + +## Example outputs +The following documents are generated by tests/test_rendering_example_doc.py: + +- HTML: [test_html_render.html](tests/out/test_html_render.html) +- PDF: [test_latex_render.pdf](tests/out/test_latex_render.pdf) + +## Contributing +Contributions are welcome; please open an issue or submit a pull request on GitHub. + +## Developer Guide +To get started with developing the `pyladoc` package, follow these steps. 
+ +First, clone the repository to your local machine using Git: + +```bash +git clone https://github.com/Nonannet/pyladoc.git +cd pyladoc +``` + +It's recommended to set up a venv: + +```bash +python -m venv venv +source venv/bin/activate # On Windows use `venv\Scripts\activate` +``` + +Install the package and dev-dependencies while keeping files in the +current directory: + +```bash +pip install -e .[dev] +``` + +Ensure that everything is set up correctly by running the tests: + +```bash +pytest +``` + +## License +This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details. \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..38b877d --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,58 @@ +[project] +name = "pyladoc" +version = "1.0.0" +authors = [ + { name="Nicolas Kruse", email="nicolas.kruse@nonan.net" }, +] +description = "Package for generating HTML and PDF/latex from python code" +readme = "README.md" +requires-python = ">=3.8" +classifiers = [ + "Programming Language :: Python :: 3", + "License :: OSI Approved :: MIT License", + "Operating System :: OS Independent", +] +dependencies = [ + "markdown>=3.3.0", + "beautifulsoup4>=4.9.1" +] + +[project.optional-dependencies] +dev = [ + "pytest", "flake8", "mypy", + "lxml", "types-lxml", + "requests", + "matplotlib>=3.1.1", + "pandas>=2.0.0", "Jinja2", +] + +[project.urls] +Homepage = "https://github.com/Nonannet/pyladoc" +Repository = "https://github.com/Nonannet/pyladoc" +Issues = "https://github.com/Nonannet/pyladoc/issues" + +[build-system] +requires = ["setuptools>=61.0", "wheel"] +build-backend = "setuptools.build_meta" + +[tool.setuptools.packages.find] +where = ["src"] + +[tool.setuptools.package-data] +pyladoc = ["templates/*"] + + +[tool.mypy] +files = ["src"] +strict = true +warn_return_any = true +warn_unused_configs = true +check_untyped_defs = true +no_implicit_optional = true +show_error_codes = true + 
+[tool.pytest.ini_options] +minversion = "6.0" +addopts = "-ra -q" +testpaths = ["tests"] +pythonpath = ["src"] \ No newline at end of file diff --git a/src/pyladoc/__init__.py b/src/pyladoc/__init__.py new file mode 100644 index 0000000..6b7d44a --- /dev/null +++ b/src/pyladoc/__init__.py @@ -0,0 +1,570 @@ +from typing import Callable, Generator, Self, Literal, TYPE_CHECKING +import html +import markdown +from base64 import b64encode +import re +import io +from . import latex +import pkgutil + + +HTML_OUTPUT = 0 +LATEX_OUTPUT = 1 + +if TYPE_CHECKING: + from pandas import DataFrame + from pandas.io.formats.style import Styler + import matplotlib.pyplot as plt + from matplotlib.figure import Figure + from matplotlib.legend import Legend as Mpl_Legend + from matplotlib.text import Text as Mpl_Text + + Table = DataFrame | Styler +else: + try: + from pandas import DataFrame + except ImportError: + DataFrame = None + + try: + from pandas.io.formats.style import Styler + Table = DataFrame | Styler + except ImportError: + Table = DataFrame + + try: + import matplotlib.pyplot as plt + from matplotlib.figure import Figure + from matplotlib.legend import Legend as Mpl_Legend + from matplotlib.text import Text as Mpl_Text + except ImportError: + Figure = None + + +TRenderer = Literal['pandas', 'simple'] +FFormat = Literal['svg', 'png', 'pgf'] + + +def _get_pkgutil_string(path: str) -> str: + data = pkgutil.get_data(__name__, path) + assert data is not None + return data.decode() + + +def _markdown_to_html(text: str) -> str: + prep_text = re.sub(r'\u00A0', ' ', text) # non-breaking space + html = markdown.markdown(prep_text, extensions=['tables', 'fenced_code', 'def_list', 'abbr', 'sane_lists']) + return html.replace('
', '
') + + +def escape_html(text: str) -> str: + """ + Escapes special HTML characters in a given string. + + Args: + text: The text to escape + + Returns: + Escaped text save for inserting into HTML code + """ + ret = re.sub(r'\u00A0', ' ', text) # non-breaking space + ret = html.escape(ret) + return ' '.join(ret.strip().splitlines()) + + +def _clean_svg(svg_text: str) -> str: + # remove all tags not alllowd for inline svg from metadata: + svg_text = re.sub(r'.*?', '', svg_text, flags=re.DOTALL) + + # remove illegal path-tags without d attribute: + return re.sub(r']*\sd=)\s.*?/>', '', svg_text, flags=re.DOTALL) + +# def _get_templ_vars(template: str) -> list[str]: +# return re.findall(".*?", template, re.DOTALL) + + +def _drop_indent(text: str, amount: int) -> str: + """ + Drops a specific number of indentation spaces from a multiline text. + + Args: + text: The text to drop indentation from + amount: The number of indentation space characters to drop + + Returns: + The text with the specified amount of indentation removed + """ + return ''.join(' ' * amount + line for line in text.splitlines(True)) + + +def _save_figure(fig: Figure, buff: io.BytesIO, figure_format: FFormat, font_family: str | None, scale: float) -> None: + """ + Saves a matplotlib figure to a file-like object. 
+ + Args: + fig: The figure to save + buff: The file-like object to save the figure to + figure_format: The format to save the figure in (svg, png or pgf) + font_family: The font family to use for the figure + """ + def get_all_elements() -> Generator[Mpl_Text, None, None]: + for ax in fig.get_axes(): + yield ax.title + yield ax.xaxis.label + yield ax.yaxis.label + yield from ax.get_xticklabels() + ax.get_yticklabels() + legend: Mpl_Legend = ax.get_legend() + if legend: + yield from legend.get_texts() + + # Store current figure settings + old_state = ((e, e.get_fontfamily()) for e in get_all_elements()) + old_size: tuple[float, float] = tuple(fig.get_size_inches()) # type: ignore[unused-ignore] + + # Adjust figure settings + if font_family: + for e, _ in old_state: + e.set_fontfamily(font_family) + + fig.set_size_inches(old_size[0] * scale, old_size[1] * scale, False) + + # Render figure + backends = {'png': 'AGG', 'svg': 'SVG', 'pgf': 'PGF'} + assert figure_format in backends, 'Figure format can be pgf (vector), svg (vector) or png (raster)' + fig.savefig(buff, format=figure_format, backend=backends[figure_format]) # type: ignore[unused-ignore] + + # Reset figure setting + for e, s in old_state: + e.set_fontfamily(s) + + fig.set_size_inches(old_size, None, False) + + +def figure_to_string(fig: Figure, + figure_format: FFormat = 'svg', + font_family: str | None = None, + scale: float = 1, + alt_text: str = '', + base64: bool = False) -> str: + """ + Converts a matplotlib figure to a ascii-string. For png base64 encoding is + used in general, for svg base64 encoding can be enabled. For base64 encoded + figures a img-tag is included in the output. 
+ + Args: + fig: The figure to convert + figure_format: The format to save the figure in (svg, png or pgf) + font_family: The font family to use for the figure + scale: Scaling factor for the figure size + alt_text: The alt text for the figure + base64: If the format is svg this determine if the image is encode in base64 + + Returns: + The figure as ascii-string + """ + assert fig and isinstance(fig, Figure), 'fig parameter must be a matplotlib figure' + with io.BytesIO() as buff: + _save_figure(fig, buff, figure_format, font_family, scale) + buff.seek(0) + if figure_format == 'pgf': + i = buff.read(2028).find(b'\\begingroup%') # skip comments + buff.seek(max(i, 0)) + return latex.to_ascii(buff.read().decode('utf-8')) + + elif figure_format == 'svg' and not base64: + i = buff.read(2028).find(b'' % \ + (escape_html(alt_text), + image_mime[figure_format], + b64encode(buff.read()).decode('ascii')) # base64 assures (7-bit) ascii + + +def latex_to_figure(latex_code: str) -> Figure: + assert Figure, 'Matplotlib is required for rendering LaTex expressions for HTML output.' 
# type:ignore[truthy-function] + fig, ax = plt.subplots() + ax.set_xticks([]) + ax.set_yticks([]) + ax.axis('off') + text = plt.text(0.5, 0.5, f'${latex_code}$', horizontalalignment='center', + verticalalignment='center', transform=ax.transAxes) + fig.draw_without_rendering() + bbox = text.get_window_extent() + fig.set_size_inches(bbox.width / fig.dpi * 1.2, bbox.height / fig.dpi * 1.2) + return fig + + +def _fillin_fields(template: str, fields: dict[str, str]) -> str: + html_out = template + for variable_name, value in fields.items(): + # Find indentation depths: + ret = re.search(f"^(.*?).*?", html_out, flags=re.MULTILINE) + if ret: + indent_depths = len(ret.group(1)) + html_out = html_out[:ret.start(0)] + _drop_indent(value, indent_depths) + html_out[ret.end(0):] + return html_out + + +def _fillin_reference_names(input_string: str, item_index: dict[str, int]) -> str: + replacements = [(*m.span(), m.group()) for m in re.finditer(r'(?<=@)\w+:[\w\_\-]+', input_string)] + ret: list[str] = [] + current_pos = 0 + for start, end, ref in replacements: + assert ref in item_index, f"Reference {ref} does not exist in the document" + ret.append(input_string[current_pos:start - 1]) + ret.append(str(item_index[ref])) + current_pos = end + return ''.join(ret) + input_string[current_pos:] + + +def _check_latex_references(input_string: str, item_index: dict[str, int]) -> str: + replacements = [m.group() for m in re.finditer(r'(?<=\\ref\{)\w+:[\w\_\\\-]+(?=\})', input_string)] + escaped_items = set(latex.normalize_label_text(item) for item in item_index) + for ref in replacements: + assert ref in escaped_items, f"Reference {ref} does not exist in the document" + return input_string + + +def _normalize_text_indent(text: str) -> str: + text_lines = text.splitlines() + if len(text_lines) > 1 and not text_lines[0].strip(): + text_lines = text_lines[1:] + + if not text_lines: + return '' + + if len(text_lines) > 1 and text_lines[0] and text_lines[0][0] != ' ': + indent_amount = 
len(text_lines[1]) - len(text_lines[1].lstrip()) + else: + indent_amount = len(text_lines[0]) - len(text_lines[0].lstrip()) + + return '\n'.join( + [' ' * max(0, len(line) - len(line.strip()) - indent_amount) + line.strip() + for line in text_lines]) + + +def _create_document_writer() -> 'DocumentWriter': + new_dwr = DocumentWriter() + return new_dwr + + +def inject_to_template(content: str, template_path: str = '', internal_template: str = '') -> str: + """ + injects a content string into a template. The placeholder + will be replaced by the content. If the placeholder is prefixed with a + '%' comment character, this character will be replaced as well. + + Args: + template_path: Path to a template file + internal_template: Path to a internal default template + + Returns: + Template with included content + """ + if template_path: + with open(template_path, 'r') as f: + template = f.read() + elif internal_template: + template = _get_pkgutil_string(internal_template) + else: + raise Exception('No template provided') + + assert '' in template, 'No expression in template located' + prep_template = re.sub(r"\%?\s*", '', template) + return prep_template.replace('', content) + + +class DocumentWriter(): + """ + A class to create a document for exporting to HTML or LaTeX. 
+ """ + def __init__(self) -> None: + self._doc: list[list[Callable[[], str]]] = [] + self._fields: dict[str, DocumentWriter] = dict() + self._base64_svgs: bool = False + self._figure_format: FFormat = 'svg' + self._table_renderer: TRenderer = 'simple' + self._font_family: str | None = None + self._item_count: dict[str, int] = {} + self._item_index: dict[str, int] = {} + self._fig_scale: float = 1 + + def _add_item(self, ref_id: str, ref_type: str, caption_prefix: str) -> str: + current_index = self._item_count.get(ref_type, 0) + 1 + if not ref_id: + ref_id = str(current_index) + self._item_index[f"{ref_type}:{ref_id}"] = current_index + self._item_count[ref_type] = current_index + return caption_prefix.format(current_index) + + def new_field(self, name: str) -> 'DocumentWriter': + new_dwr = _create_document_writer() + self._fields[name] = new_dwr + return new_dwr + + def add_document(self, doc: Self) -> None: + self._doc += doc._doc + + def add_diagram(self, fig: Figure, caption: str = '', ref_id: str = '', + prefix_pattern: str = 'Figure {}: ', ref_type: str = 'fig', + centered: bool = True) -> None: + caption_prefix = self._add_item(ref_id, ref_type, prefix_pattern) + + def render_to_html() -> str: + return '
%s%s
' % ( + figure_to_string(fig, self._figure_format, base64=self._base64_svgs, scale=self._fig_scale), + '
' + caption_prefix + escape_html(caption) if caption else '') + + def render_to_latex() -> str: + return '\\begin{figure}%s\n%s\n\\caption{%s}\n%s\\end{figure}' % ( + '\n\\centering' if centered else '', + figure_to_string(fig, 'pgf', self._font_family, scale=self._fig_scale), + latex.escape_text(caption), + '\\label{%s}\n' % latex.normalize_label_text(ref_type + ':' + ref_id) if ref_id else '') + + self._doc.append([render_to_html, render_to_latex]) + + def add_table(self, table: Table, caption: str = '', ref_id: str = '', + prefix_pattern: str = 'Table {}: ', ref_type: str = 'table', centered: bool = True) -> None: + assert Table and isinstance(table, Table), 'Table has to be a pandas DataFrame oder DataFrame Styler' + caption_prefix = self._add_item(ref_id, ref_type, prefix_pattern) + styler = table if isinstance(table, Styler) else getattr(table, 'style', None) + assert isinstance(styler, Styler), 'Jinja2 package is required for rendering tables' + + def render_to_html() -> str: + html_string = styler.to_html(table_uuid=ref_id, caption=caption_prefix + escape_html(caption)) + return re.sub(r'.*?', '', html_string, flags=re.DOTALL) + + def render_to_latex() -> str: + if self._table_renderer == 'pandas': + return styler.to_latex( + label=latex.normalize_label_text(ref_type + ':' + ref_id), + hrules=True, + convert_css=True, + siunitx=True, + caption=latex.escape_text(caption), + position_float='centering' if centered else None) + else: + return latex.render_pandas_styler_table(styler, caption, ref_type + ':' + ref_id, centered) + + self._doc.append([render_to_html, render_to_latex]) + + def add_text(self, text: str, section_class: str = '') -> None: + """ + Adds a text paragraph to the document. + + Args: + text: The text to add + section_class: The class for the paragraph + """ + norm_text = _normalize_text_indent(text) + + def render_to_html() -> str: + html = '

' + escape_html(norm_text) + '

' + if section_class: + return '
' + html + '
' + else: + return html + + def render_to_latex() -> str: + return latex.from_html(render_to_html()) + + self._doc.append([render_to_html, render_to_latex]) + + def add_html(self, text: str) -> None: + """ + Adds HTML formatted text to the document. For the LaTeX + export only basic HTML for text formatting and tables + is supported. + + Args: + text: The HTML to add to the document + """ + def render_to_html() -> str: + return text + + def render_to_latex() -> str: + return latex.from_html(text) + + self._doc.append([render_to_html, render_to_latex]) + + def add_h1(self, text: str) -> None: + """ + Adds a h1 heading to the document. + + Args: + text: The text of the heading + """ + def render_to_html() -> str: + return '

' + escape_html(text) + '

' + + def render_to_latex() -> str: + return '\\section{' + latex.escape_text(text) + '}\n' + + self._doc.append([render_to_html, render_to_latex]) + + def add_h2(self, text: str) -> None: + """ + Adds a h2 heading to the document. + + Args: + text: The text of the heading + """ + def render_to_html() -> str: + return '

' + escape_html(text) + '

' + + def render_to_latex() -> str: + return '\\subsection{' + latex.escape_text(text) + '}\n' + + self._doc.append([render_to_html, render_to_latex]) + + def add_h3(self, text: str) -> None: + """ + Adds a h3 heading to the document. + + Args: + text: The text of the heading + """ + def render_to_html() -> str: + return '

' + escape_html(text) + '

' + + def render_to_latex() -> str: + return '\\subsubsection{' + latex.escape_text(text) + '}\n' + + self._doc.append([render_to_html, render_to_latex]) + + def add_equation(self, latex_equation: str, ref_id: str = '', ref_type: str = 'eq') -> None: + """ + Adds a LaTeX equation to the document. + + Args: + latex_equation: LaTeX formatted equation + ref_id: If provided, the equation is displayed with + a number and can be referenced by the ref_id + """ + caption = self._add_item(ref_id, ref_type, '({})') + + def render_to_html() -> str: + fig = latex_to_figure(latex_equation) + return ('
%s
' + '
%s
') % ( + figure_to_string(fig, self._figure_format, base64=self._base64_svgs), + caption) + + def render_to_latex() -> str: + if ref_id: + return '\\begin{equation}\\label{%s:%s}%s\\end{equation}' % ( + ref_type, ref_id, latex_equation) + else: + return '\\[%s\\]' % latex_equation + + self._doc.append([render_to_html, render_to_latex]) + + def add_markdown(self, text: str, section_class: str = '') -> None: + """ + Adds a markdown formatted text to the document. + + Args: + text: The markdown text to add + section_class: The class for the text section + """ + norm_text = _normalize_text_indent(str(text)) + + def render_to_html() -> str: + html = _markdown_to_html(norm_text) + if section_class: + return '
' + html + '
' + else: + return html + + def render_to_latex() -> str: + return latex.from_html(render_to_html()) + + self._doc.append([render_to_html, render_to_latex]) + + def _render_doc(self, doc_type: int) -> str: + fields = {k: f.to_html() for k, f in self._fields.items()} + return _fillin_fields(''.join(el[doc_type]() for el in self._doc), fields) + + def to_html(self, figure_format: FFormat = 'svg', + base64_svgs: bool = False, figure_scale: float = 1) -> str: + """ + Export the document to HTML. Figures will bew embedded in the HTML code. + The format can be selected between png in base64, inline svg or svg in base64. + + Args: + figure_format: The format for embedding the figures in the HTML code (svg or png) + base64_svgs: Whether to encode svg images in base64 + + Returns: + The HTML code + """ + self._figure_format = figure_format + self._base64_svgs = base64_svgs + self._fig_scale = figure_scale + + return _fillin_reference_names(self._render_doc(HTML_OUTPUT), self._item_index) + + def to_latex(self, font_family: Literal[None, 'serif', 'sans-serif'] = None, + table_renderer: TRenderer = 'simple', figure_scale: float = 1) -> str: + """ + Export the document to LaTeX. Figures will be embedded as pgf graphics. 
+ + Args: + font_family: Overwrites the front family for figures + table_renderer: The renderer for tables (simple: renderer with column type + guessing for text and numbers; pandas: using the internal pandas LaTeX renderer) + + Returns: + The LaTeX code + """ + self._font_family = font_family + assert table_renderer in ['simple', 'pandas'], "table_renderer must be 'simple' or 'pandas'" + self._table_renderer = table_renderer + self._fig_scale = figure_scale + + return _check_latex_references(self._render_doc(LATEX_OUTPUT), self._item_index) + + def to_pdf(self, file_path: str, + font_family: Literal[None, 'serif', 'sans-serif'] = None, + table_renderer: TRenderer = 'simple', + latex_template_path: str = '') -> bool: + """ + Export the document to a PDF file using LaTeX. + + Args: + file_path: The path to save the PDF file to + font_family: Overwrites the front family for figures and the template + latex_template_path: Path to a LaTeX template file. The + expression will be replaced by the generated content. + If no path is provided a default template is used. 
+ + Returns: + True if the PDF was successfully created + """ + latex_code = inject_to_template(self.to_latex(font_family, table_renderer), + latex_template_path, + 'templates/default_template.tex') + + if font_family == 'sans-serif': + latex_code = latex.inject_latex_command(latex_code, '\\renewcommand{\\familydefault}{\\sfdefault}') + success, errors, warnings = latex.compile(latex_code, file_path) + + if not success: + print('Errors:') + print('\n'.join(errors)) + print('Warnings:') + print('\n'.join(warnings)) + + return success + + def _repr_html_(self) -> str: + return self.to_html() + + def __repr__(self) -> str: + return self.to_html() diff --git a/src/pyladoc/latex.py b/src/pyladoc/latex.py new file mode 100644 index 0000000..c025332 --- /dev/null +++ b/src/pyladoc/latex.py @@ -0,0 +1,346 @@ +import bs4 +from html.parser import HTMLParser +from typing import Iterator, Generator, Any +from pandas.io.formats.style import Styler +import re +import os +import shutil +import subprocess +import tempfile +from .latex_escaping import unicode_to_latex_dict, latex_escape_dict + + +def basic_formatter(value: Any) -> str: + return escape_text(str(value)) + + +def to_ascii(text: str) -> str: + """ + Replaces/escapes often used unicode characters in latex code or text + with its LaTex ascii equivalents. + + Args: + text: The text to convert. + + Returns: + The escaped text. + """ + regex_filter = ('|'.join(unicode_to_latex_dict)) + + last_s = 0 + ret: list[str] = [] + for m in re.finditer(regex_filter, text): + s1, s2 = m.span() + ret.append(text[last_s:s1]) + ret.append(unicode_to_latex_dict[m.group()]) + last_s = s2 + ret.append(text[last_s:]) + + return ''.join(ret) + + +def normalize_label_text(text: str) -> str: + """ + Replace any special non-allowed character in the lable text. 
+ + Args: + text: Input text + + Returns: + Normalized text + """ + return re.sub(r"[^a-zA-Z0-9.:]", '-', text) + + +def escape_text(text: str) -> str: + """ + Escapes special LaTeX characters and often used unicode characters in a given string. + + Args: + text: The text to escape + + Returns: + Escaped text + """ + + latex_translation = latex_escape_dict | unicode_to_latex_dict + + regex_filter = '|'.join(latex_translation) + + last_s = 0 + ret: list[str] = [] + for m in re.finditer(regex_filter, text): + s1, s2 = m.span() + ret.append(text[last_s:s1]) + matches = [v for k, v in latex_translation.items() if re.match(k, m.group())] + if m.group(1): + ret.append(matches[0].replace(r'\g<1>', normalize_label_text(m.group(1)))) + else: + ret.append(matches[0]) + last_s = s2 + ret.append(text[last_s:]) + + return ''.join(ret) + + +def render_pandas_styler_table(df_style: Styler, caption: str = '', label: str = '', centering: bool = True) -> str: + """ + Converts a pandas Styler object to LaTeX table. + + Args: + df_style: The pandas Styler object to convert. + caption: The caption for the table. + label: Label for referencing the table. + centering: Whether to center the table. + + Returns: + The LaTeX code. 
+ """ + def iter_table(table: dict[str, Any]) -> Generator[str, None, None]: + yield '\\begin{table}\n' + if centering: + yield '\\centering\n' + + # Guess column type + numeric = re.compile(r'^[<>]?\s*(?:\d+,?)+(?:\.\d+)?(?:\s\D.*)?$') + formats = ['S' if all( + (numeric.match(line[ci]['display_value'].strip()) for line in table['body']) + ) else 'l' for ci in range(len(table['body'][0])) if table['body'][0][ci]['is_visible']] + + if caption: + yield f"\\caption{{{escape_text(caption)}}}\n" + if label: + yield f"\\label{{{normalize_label_text(label)}}}\n" + yield f"\\begin{{tabular}}{{{''.join(formats)}}}\n\\toprule\n" + + for head in table['head']: + yield (' & '.join(f"\\text{{{escape_text(c['display_value'].strip())}}}" + for c in head if c['is_visible'])) + yield ' \\\\\n' + + yield '\\midrule\n' + + for body in table['body']: + yield (' & '.join(escape_text(c['display_value'].strip()) + for c in body if c['is_visible'])) + yield ' \\\\\n' + + yield '\\bottomrule\n\\end{tabular}\n\\end{table}' + + str_list = iter_table(df_style._translate(False, False, blank='')) # type: ignore[attr-defined] + + return ''.join(str_list) + + +def from_html_old(html_code: str) -> str: + """ + Converts HTML code to LaTeX code. + + Args: + html_code: The HTML code to convert. + + Returns: + The LaTeX code. 
+ """ + root = bs4.BeautifulSoup(html_code, 'html.parser') + + html_to_latex = { + 'strong': ('\\textbf{', '}'), + 'b': ('\\textbf{', '}'), + 'em': ('\\emph{', '}'), + 'i': ('\\emph{', '}'), + 'p': ('', '\n\n'), + 'h1': ('\\section{', '}'), + 'h2': ('\\subsection{', '}'), + 'h3': ('\\subsubsection{', '}'), + 'ul': ('\\begin{itemize}', '\\end{itemize}'), + 'ol': ('\\begin{enumerate}', '\\end{enumerate}'), + 'li': ('\\item ', ''), + 'latex_eq': ('\\[', '\\]'), + } + + def handle_table(table: bs4.element.Tag) -> str: + rows = table.find_all('tr') + latex_table: str = '' + for row in rows: + assert isinstance(row, bs4.element.Tag), 'HTML table not valid' + cells = row.find_all(['th', 'td']) + if not latex_table: + latex_table = "\\begin{tabular}{|" + "|".join(['l'] * len(cells)) + "|}\\toprule\n" + else: + latex_table += " & ".join(escape_text(cell.get_text(strip=True)) for cell in cells) + " \\\\\n" + latex_table += "\\bottomrule\n\\end{tabular}" + return latex_table + + def parse_node(element: bs4.element.Tag) -> Iterator[str]: + prefix, post = html_to_latex.get(element.name, ('', '')) + yield prefix + + for c in element.children: + if isinstance(c, bs4.element.Tag): + if c.name == 'table': + yield handle_table(c) + else: + yield from parse_node(c) + else: + yield escape_text(c.text) + yield post + + return ''.join(parse_node(root)) + + +def from_html(html_code: str) -> str: + """ + Converts HTML code to LaTeX code using HTMLParser. + + Args: + html_code: The HTML code to convert. + + Returns: + The LaTeX code. 
+ """ + html_to_latex = { + 'strong': ('\\textbf{', '}'), + 'b': ('\\textbf{', '}'), + 'em': ('\\emph{', '}'), + 'i': ('\\emph{', '}'), + 'p': ('', '\n\n'), + 'h1': ('\\section{', '}\n'), + 'h2': ('\\subsection{', '}\n'), + 'h3': ('\\subsubsection{', '}\n'), + 'ul': ('\\begin{itemize}\n', '\\end{itemize}\n'), + 'ol': ('\\begin{enumerate}\n', '\\end{enumerate}\n'), + 'li': ('\\item ', '\n') + } + + class LaTeXHTMLParser(HTMLParser): + def __init__(self) -> None: + super().__init__() + self.latex_code: list[str] = [] + self.header_index: int = -1 + self.column_alignment = '' + self.midrule_flag = False + self.header_flag = False + + def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None: + if tag in html_to_latex: + prefix, _ = html_to_latex[tag] + self.latex_code.append(prefix) + elif tag == 'table': + self.header_index = len(self.latex_code) + self.latex_code.append('') # Placeholder for column header + self.midrule_flag = False + self.header_flag = False + elif tag == 'tr': + self.column_alignment = '' + elif tag in ['th', 'td']: + style = [v for k, v in attrs if k == 'style'] + if style and style[0] and 'right' in style[0]: + self.column_alignment += 'r' + else: + self.column_alignment += 'l' + elif tag == 'a': + href = [v for k, v in attrs if k == 'href'] + assert href, 'Link href attribute is missing' + self.latex_code.append(f"\\href{{{href[0]}}}{{") + + def handle_endtag(self, tag: str) -> None: + if tag in html_to_latex: + _, postfix = html_to_latex[tag] + self.latex_code.append(postfix) + elif tag == 'table': + self.latex_code.append("\\bottomrule\n\\end{tabular}\n") + elif tag == 'tr': + self.latex_code.pop() # Remove column separator after last entry + if self.header_index >= 0: + self.latex_code[self.header_index] = f"\\begin{{tabular}}{{{self.column_alignment}}}\\toprule\n" + self.header_index = -1 + self.latex_code.append(' \\\\\n') + if self.header_flag and not self.midrule_flag: + self.latex_code.append("\\midrule\n") + 
self.midrule_flag = True + elif tag == 'th': + self.latex_code.append(" & ") + self.header_flag = True + elif tag == 'td': + self.latex_code.append(" & ") + elif tag == 'a': + self.latex_code.append("}") + + def handle_data(self, data: str) -> None: + if data.strip(): + self.latex_code.append(escape_text(data)) + + parser = LaTeXHTMLParser() + parser.feed(html_code) + return ''.join(parser.latex_code) + + +def compile(latex_code: str, output_file: str = '', encoding: str = 'utf-8') -> tuple[bool, list[str], list[str]]: + """ + Compiles LaTeX code to a PDF file. + + Args: + latex_code: The LaTeX code to compile. + output_file: The output file path. + encoding: The encoding of the LaTeX code. + + Returns: + A tuple with three elements: + - A boolean indicating whether the compilation was successful. + - A list of errors. + - A list of warnings. + """ + + with tempfile.TemporaryDirectory() as tmp_path: + command = ['pdflatex', '-halt-on-error', '--output-directory', tmp_path] + + errors: list[str] = [] + warnings: list[str] = [] + + for i in range(1, 4): + rerun_flag = False + error_flag = False + process = subprocess.Popen(command, stdin=subprocess.PIPE, stdout=subprocess.PIPE) + output, error = process.communicate(input=latex_code.encode(encoding)) + + assert not error, 'Compilation error: ' + output.decode(encoding) + + for line in output.decode(encoding).split('\n'): + if 'Warning' in line: + warnings.append(f"Run {i}: " + line) + if 'reference' in line: + rerun_flag = True + if line.startswith('!') or line.startswith('*!'): + error_flag = True + + if error_flag: + errors.append(line) + + if not rerun_flag or errors: + break + + # Copy pdf file + file_list = [f for f in os.listdir(tmp_path) if f.lower().endswith('.pdf')] + if file_list: + pdf_file = os.path.join(tmp_path, file_list[0]) + if output_file: + shutil.copyfile(pdf_file, output_file) + + return not errors, errors, warnings + + +def inject_latex_command(text: str, command: str) -> str: + lines = 
text.splitlines() + + last_package_index = -1 + for i, line in enumerate(lines): + if line.strip().startswith("\\usepackage"): + last_package_index = i + + if last_package_index != -1: + lines.insert(last_package_index + 1, f"\n{command}\n") + else: + lines.append(f"\n{command}\n") + + return '\n'.join(lines) diff --git a/src/pyladoc/latex_escaping.py b/src/pyladoc/latex_escaping.py new file mode 100644 index 0000000..f932561 --- /dev/null +++ b/src/pyladoc/latex_escaping.py @@ -0,0 +1,89 @@ +unicode_to_latex_dict = { + # Unicode numeric subscripts + '₀': r'\textsubscript{0}', '₁': r'\textsubscript{1}', '₂': r'\textsubscript{2}', '₃': r'\textsubscript{3}', + '₄': r'\textsubscript{4}', '₅': r'\textsubscript{5}', '₆': r'\textsubscript{6}', '₇': r'\textsubscript{7}', + '₈': r'\textsubscript{8}', '₉': r'\textsubscript{9}', + # Unicode numeric superscripts + '⁰': r'\textsuperscript{0}', '¹': r'\textsuperscript{1}', '²': r'\textsuperscript{2}', '³': r'\textsuperscript{3}', + '⁴': r'\textsuperscript{4}', '⁵': r'\textsuperscript{5}', '⁶': r'\textsuperscript{6}', '⁷': r'\textsuperscript{7}', + '⁸': r'\textsuperscript{8}', '⁹': r'\textsuperscript{9}', '⁺': r'\textsuperscript{+}', '⁻': r'\textsuperscript{-}', + # Often used European non-ascii-characters + 'ä': r'{\"a}', + 'ö': r'{\"o}', + 'ü': r'{\"u}', + 'Ä': r'{\"A}', + 'Ö': r'{\"O}', + 'Ü': r'{\"U}', + 'ß': r'{\ss}', + 'é': r"{\'e}", + 'è': r"{\`e}", + 'ê': r"{\^e}", + 'à': r"{\`a}", + 'â': r"{\^a}", + 'ç': r"{\c{c}}", + 'É': r"{\'E}", + 'È': r"{\`E}", + 'Ê': r"{\^E}", + 'À': r"{\`A}", + 'Â': r"{\^A}", + 'Ç': r"{\c{C}}", + 'ó': r"{\'o}", + 'ò': r"{\`o}", + 'ô': r"{\^o}", + 'Ó': r"{\'O}", + 'Ò': r"{\`O}", + 'Ô': r"{\^O}", + 'í': r"{\'i}", + 'ì': r"{\`i}", + 'î': r"{\^i}", + 'Í': r"{\'I}", + 'Ì': r"{\`I}", + 'Î': r"{\^I}", + 'ú': r"{\'u}", + 'ù': r"{\`u}", + 'û': r"{\^u}", + 'Ú': r"{\'U}", + 'Ù': r"{\`U}", + 'Û': r"{\^U}", + 'å': r"{\r{a}}", + 'Å': r"{\r{A}}", + 'ø': r"{\o}", + 'Ø': r"{\O}", + 'æ': r"{\ae}", + 'Æ': r"{\AE}", 
+ 'œ': r"{\oe}", + 'Œ': r"{\OE}", + # Other unicode + '°': r'{\textdegree}', + 'µ': r'{\textmu}', + 'π': r'$\pi$', + '≈': r'$\approx$', + '±': r'$\pm$', + '≠': r'$\neq$', + '∆': r'$\Delta$', + 'Ω': r'$\Omega$', + 'Λ': r'$\Lambda$', + 'Σ': r'$\Sigma$', + # '€': r'{\euro}', + '£': r'{\pounds}', + '¥': r'{\yen}', + '\u00A0': r'~', # Non-breaking space + '\u2007': ' ' # Figure space +} + +latex_escape_dict = { + '&': r'\&', + '%': r'\%', + r'\$': r'\$', + '#': r'\#', + '_': r'\_', + '{': r'\{', + '}': r'\}', + '<': r'{\textless}', + '>': r'{\textgreater}', + '~': r'\textasciitilde{}', + r'\^': r'\textasciicircum{}', + r'\\': r'\textbackslash{}', + # References: + r'@(\w+:[\w\_\-]+)': r'\ref{\g<1>}' +} diff --git a/src/pyladoc/templates/default_template.tex b/src/pyladoc/templates/default_template.tex new file mode 100644 index 0000000..a37d64a --- /dev/null +++ b/src/pyladoc/templates/default_template.tex @@ -0,0 +1,38 @@ +\documentclass[a4paper,12pt]{article} + +% Packages +\usepackage[utf8]{inputenc} +\usepackage[T1]{fontenc} +\usepackage{lmodern} % Load Latin Modern font +\usepackage{graphicx} % For including images +\usepackage{amsmath} % For mathematical symbols +\usepackage{amssymb} % For additional symbols +\usepackage{hyperref} % For hyperlinks +\usepackage{caption} % For customizing captions +\usepackage{geometry} % To set margins +\usepackage{natbib} % For citations +\usepackage{float} % For fixing figure positions +\usepackage{siunitx} % For scientific units +\usepackage{booktabs} % For professional-looking tables +\usepackage{pgf} % For using pgf grafics +\usepackage{textcomp, gensymb} % provides \degree symbol + +\sisetup{ + table-align-text-post = false +} + +% Geometry Settings +\geometry{margin=1in} % 1-inch margins + +% Title and Author Information +% \title{Report Title} +% \author{Your Name \\ Department of XYZ \\ \texttt{email@example.com}} +% \date{\today} + +\begin{document} + +% Title Page +% # \maketitle + +% +\end{document} \ No newline at end 
of file diff --git a/src/pyladoc/templates/test_template.html b/src/pyladoc/templates/test_template.html new file mode 100644 index 0000000..9cafb1d --- /dev/null +++ b/src/pyladoc/templates/test_template.html @@ -0,0 +1,114 @@ + + + + + + Test template + + + +
+ +
+ \ No newline at end of file diff --git a/tests/document_validation.py b/tests/document_validation.py new file mode 100644 index 0000000..fe9eb80 --- /dev/null +++ b/tests/document_validation.py @@ -0,0 +1,64 @@ +from typing import Generator, Any +from lxml import etree +from lxml.etree import _Element as EElement # type: ignore +import requests + + +with open('src/pyladoc/templates/test_template.html', mode='rt', encoding='utf-8') as f: + html_test_template = f.read() + + +def add_line_numbers(multiline_string: str) -> str: + lines = multiline_string.splitlines() + numbered_lines = [f"{i + 1}: {line}" for i, line in enumerate(lines)] + return "\n".join(numbered_lines) + + +def validate_html_with_w3c(html_string: str) -> dict[str, Any]: + validator_url = "https://validator.w3.org/nu/" + + # Parameters for the POST request + headers = { + "Content-Type": "text/html; charset=utf-8", + "User-Agent": "Python HTML Validator"} + + try: + response = requests.post(validator_url, headers=headers, data=html_string, params={"out": "json"}) + + if response.status_code == 200: + return response.json() + else: + return { + "error": f"Failed to validate HTML. 
Status code: {response.status_code}", + "details": response.text + } + + except requests.RequestException as e: + return {"error": f"An error occurred while connecting to the W3C Validator: {str(e)}"} + + +def validate_html(html_string: str, validate_online: bool = False, check_for: list['str'] = ['table', 'svg', 'div']): + root = etree.fromstring(html_string, parser=etree.HTMLParser(recover=True)) + + def recursive_search(element: EElement) -> Generator[str, None, None]: + if isinstance(element.tag, str): + yield element.tag + + for child in element: + yield from recursive_search(child) + + tags = set(recursive_search(root)) + + for tag_type in check_for: + assert tag_type in tags, f"Tag {tag_type} not found in the html code" + + if validate_online: + test_page = html_test_template.replace('', html_string) + validation_result = validate_html_with_w3c(test_page) + assert 'messages' in validation_result, 'Validate request failed' + if validation_result['messages']: + print(add_line_numbers(test_page)) + for verr in validation_result['messages']: + print(f"- {verr['type']}: {verr['message']} (line: {verr['lastLine']})") + + assert len(validation_result['messages']) == 0, f'{len(validation_result["messages"])} validation error, first error: {validation_result["messages"][0]["message"]}' diff --git a/tests/out/test_html_render.html b/tests/out/test_html_render.html new file mode 100644 index 0000000..6ad2b1a --- /dev/null +++ b/tests/out/test_html_render.html @@ -0,0 +1,1525 @@ + + + + + + Test template + + + +

Special characters

+

ö ä ü Ö Ä Ü ß @ ∆

+

π ≈ ± ∆ Σ

+

£ ¥ $ €

+

Œ

+

Link

+

This is a hyperlink: nonan.net

+

Table

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Anz.TypBeschreibung
12BK9050Buskoppler
2KL11044 Digitaleingänge
2KL24044 Digitalausgänge (0,5 A)
3KL24244 Digitalausgänge (2 A)
2KL40044 Analogausgänge
1KL40022 Analogausgänge
22KL9188Potenzialverteilungsklemme
1KL9100Potenzialeinspeiseklemme
3KL30544 Analogeingänge
5KL3214PT100 4 Temperatureingänge (3-Leiter)
3KL3202PT100 2 Temperatureingänge (3-Leiter)
1KL24044 Digitalausgänge
2KL9010Endklemme
+
+

Equations

+

This line represents a reference to the equation 1.

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
(1)
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Figure 1: Bar chart with individual bar colors
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Table 1: This is a example table
Row1Row2Row3Row4Row5Row6Row7
Line112012 g/km5 stars3.5000001850 kg600 Nm
Line295 km/h> 150 g/km4 stars7.8000001500 kg250 Nm
Line3110110 g/km5 stars8.5000001400 kg280 Nm
Line4105 km/h1140 g/km4.5 stars6.9000001600 kg320 Nm
Line513013.05 g/km5 stars4.2000001700 kg450 Nm
+ +
+ \ No newline at end of file diff --git a/tests/out/test_latex_render.pdf b/tests/out/test_latex_render.pdf new file mode 100644 index 0000000..39dbd38 Binary files /dev/null and b/tests/out/test_latex_render.pdf differ diff --git a/tests/out/test_markdown_characters.html b/tests/out/test_markdown_characters.html new file mode 100644 index 0000000..29796ca --- /dev/null +++ b/tests/out/test_markdown_characters.html @@ -0,0 +1,5 @@ +

Special caracters

+

Umlaute: ÖÄÜ öäü

+

Other: ß, €, @, $, %, ~, µ

+

Units: m³, cm²

+

Controll characters: <, >, ", ', &, |, /, \

\ No newline at end of file diff --git a/tests/out/test_markdown_equations.html b/tests/out/test_markdown_equations.html new file mode 100644 index 0000000..6286051 --- /dev/null +++ b/tests/out/test_markdown_equations.html @@ -0,0 +1,39 @@ +

Source Equations

+
    +
  1. $4(3x + 2) - 5(x - 1) = 3x + 14$
  2. +
  3. $ +rac{2y + 5}{4} + +rac{3y - 1}{2} = 5$
  4. +
  5. $ +rac{5}{x + 2} + +rac{2}{x - 2} = 3$
  6. +
  7. $8(3b - 5) + 4(b + 2) = 60$
  8. +
  9. $2c^2 - 3c - 5 = 0$
  10. +
  11. $4(2d - 1) + 5(3d + 2) = 7d + 28$
  12. +
  13. $q^2 + 6q + 9 = 16$
  14. +
+

Result Equations

+
    +
  1. $x = +rac{1}{4}$
  2. +
  3. $y = +rac{17}{8}$
  4. +
  5. $z = +rac{7}{3}$
  6. +
  7. $x = 1$ or $x = -6$
  8. +
  9. $a = +rac{1}{3}$ or $a = 2$
  10. +
  11. $x = - +rac{2}{3}$ or $x = 3$
  12. +
  13. $b = +rac{23}{7}$
  14. +
+

Step by Step

+
    +
  1. Distribute: $12x + 8 - 5x + 5 = 3x + 14$
  2. +
  3. Combine like terms: $7x + 13 = 3x + 14$
  4. +
  5. Subtract $3x$: $4x + 13 = 14$
  6. +
  7. Subtract $13$: $4x = 1$
  8. +
  9. Divide by $4$: $x = +rac{1}{4}$
  10. +
\ No newline at end of file diff --git a/tests/out/test_markdown_style.html b/tests/out/test_markdown_style.html new file mode 100644 index 0000000..2afc94c --- /dev/null +++ b/tests/out/test_markdown_style.html @@ -0,0 +1,44 @@ +

Below is an in-depth explanation of the AArch64 (ARM64) +unconditional branch instruction—often simply called the +“B” instruction—and how its 26‐bit immediate field (imm26) +is laid out and later relocated during linking.

+
+

Instruction Layout

+

The unconditional branch in AArch64 is encoded in a 32‑bit +instruction. Its layout is as follows:

+
Bits:  31         26 25                           0
+        +-------------+------------------------------+
+        |  Opcode     |          imm26               |
+        +-------------+------------------------------+
+
+
    +
  • Opcode (bits 31:26):
  • +
  • For a plain branch (B), the opcode is 000101.
  • +
  • +

    For a branch with link (BL), which saves the return +address (i.e., a call), the opcode is 100101. +These 6 bits determine the instruction type.

    +
  • +
  • +

    Immediate Field (imm26, bits 25:0):

    +
  • +
  • This 26‑bit field holds a signed immediate value.
  • +
  • +

    Offset Calculation: At runtime, the processor:

    +
      +
    1. Shifts the 26‑bit immediate left by 2 bits. +(Because instructions are 4-byte aligned, +the two least-significant bits are always zero.)
    2. +
    3. Sign-extends the resulting 28‑bit value to +the full register width (typically 64 bits).
    4. +
    5. Adds this value to the program counter +(PC) to obtain the branch target.
    6. +
    +
  • +
  • +

    Reach:

    +
  • +
  • With a 26‑bit signed field that’s effectively 28 bits + after the shift, the branch can cover a range + of approximately ±128 MB from the current instruction.
  • +
\ No newline at end of file diff --git a/tests/out/test_markdown_table.html b/tests/out/test_markdown_table.html new file mode 100644 index 0000000..3499734 --- /dev/null +++ b/tests/out/test_markdown_table.html @@ -0,0 +1,77 @@ +

Klemmen

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Anz.TypBeschreibung
12BK9050Buskoppler
2KL11044 Digitaleingänge
2KL24044 Digitalausgänge (0,5 A)
3KL24244 Digitalausgänge (2 A)
2KL40044 Analogausgänge
1KL40022 Analogausgänge
22KL9188Potenzialverteilungsklemme
1KL9100Potenzialeinspeiseklemme
3KL30544 Analogeingänge
5KL3214PT100 4 Temperatureingänge (3-Leiter)
3KL3202PT100 2 Temperatureingänge (3-Leiter)
1KL24044 Digitalausgänge
2KL9010Endklemme
\ No newline at end of file diff --git a/tests/test_latex_tools.py b/tests/test_latex_tools.py new file mode 100644 index 0000000..89a6edd --- /dev/null +++ b/tests/test_latex_tools.py @@ -0,0 +1,151 @@ +import pyladoc.latex + + +def normalize_latex_code(latex_code: str) -> str: + return '\n'.join(line.strip() for line in latex_code.splitlines() if line) + + +def check_only_ascii(latex_code: str) -> bool: + return all(ord(c) < 128 for c in latex_code) + + +def test_latex_from_html(): + html_code = """ +

Test

+

This is are Umlautes: Ä,Ö and Ü

+

This is a test.

+

And this is another test.

+

And this is a third test.

+

And this is a fourth test.

+

This is a LaTeX command: \\textbf{test}

+

This are typical control characters: {, }, <, >, ", ', &, |, /, \\

+
    +
  • Item 1
  • +
  • Item 2
  • +
+ + + + + + + + + +
Header 1Header 2
Cell 1Cell 2
+ """ + + latex_code = pyladoc.latex.from_html(html_code) + + ref_latex_code = r""" + \section{Test} + This is are Umlautes: {\"A},{\"O} and {\"U} + This is a \textbf{test}. + And this is another \emph{test}. + And this is a \textbf{third} test. + And this is a \emph{fourth} test. + This is a LaTeX command: \textbackslash{}textbf\{test\} + This are typical control characters: \{, \}, {\textless}, {\textgreater}, ", ', \&, |, /, \textbackslash{} + \begin{itemize} + \item Item 1 + \item Item 2 + \end{itemize} + \begin{tabular}{ll}\toprule + Header 1 & Header 2 \\ + \midrule + Cell 1 & Cell 2 \\ + \bottomrule + \end{tabular}""" + + print(latex_code) + + print('--') + + # print(pyladoc.latex.escape_text(html_code)) + + assert check_only_ascii(latex_code), 'Some characters are not ASCII' + assert normalize_latex_code(ref_latex_code) == normalize_latex_code(latex_code) + + +def test_latex_from_markdown(): + markdown_code = """ + ## Test1 + + | Anz.| Typ | Beschreibung + |----:|----------|------------------------------------ + | 12 | BK9050 | Buskoppler + | 2 | KL1104 | 4 Digitaleingänge + | 2 | KL2404 | 4 Digitalausgänge (0,5 A) + | 3 | KL2424 | 4 Digitalausgänge (2 A) + | 2 | KL4004 | 4 Analogausgänge + | 1 | KL4002 | 2 Analogausgänge + | 22 | KL9188 | Potenzialverteilungsklemme + | 1 | KL9100 | Potenzialeinspeiseklemme + | 3 | KL3054 | 4 Analogeingänge + | 5 | KL3214 | PT100 4 Temperatureingänge (3-Leiter) + | 3 | KL3202 | PT100 2 Temperatureingänge (3-Leiter) + | 1 | KL2404 | 4 Digitalausgänge + | 2 | KL9010 | Endklemme + + This is a **test**. 
+ + ## Test2 + + | Anz.| Beschreibung + |----:|------------------------------------ + | 12 | Buskoppler + | 2 | 4 Digitaleingänge + | 2 | 4 Digitalausgänge (0,5 A) + | 3 | 4 Digitalausgänge (2 A) + | 2 | 4 Analogausgänge + | 1 | 2 Analogausgänge + """ + + pyla = pyladoc.DocumentWriter() + pyla.add_markdown(markdown_code) + latex_code = pyladoc.latex.from_html(pyla.to_html()) + + ref_latex_code = r""" + \subsection{Test1} + \begin{tabular}{rll}\toprule + Anz. & Typ & Beschreibung \\ + \midrule + 12 & BK9050 & Buskoppler \\ + 2 & KL1104 & 4 Digitaleing{\"a}nge \\ + 2 & KL2404 & 4 Digitalausg{\"a}nge (0,5 A) \\ + 3 & KL2424 & 4 Digitalausg{\"a}nge (2 A) \\ + 2 & KL4004 & 4 Analogausg{\"a}nge \\ + 1 & KL4002 & 2 Analogausg{\"a}nge \\ + 22 & KL9188 & Potenzialverteilungsklemme \\ + 1 & KL9100 & Potenzialeinspeiseklemme \\ + 3 & KL3054 & 4 Analogeing{\"a}nge \\ + 5 & KL3214 & PT100 4 Temperatureing{\"a}nge (3-Leiter) \\ + 3 & KL3202 & PT100 2 Temperatureing{\"a}nge (3-Leiter) \\ + 1 & KL2404 & 4 Digitalausg{\"a}nge \\ + 2 & KL9010 & Endklemme \\ + \bottomrule + \end{tabular} + This is a \textbf{test}. + + \subsection{Test2} + \begin{tabular}{rl}\toprule + Anz. 
& Beschreibung \\ + \midrule + 12 & Buskoppler \\ + 2 & 4 Digitaleing{\"a}nge \\ + 2 & 4 Digitalausg{\"a}nge (0,5 A) \\ + 3 & 4 Digitalausg{\"a}nge (2 A) \\ + 2 & 4 Analogausg{\"a}nge \\ + 1 & 2 Analogausg{\"a}nge \\ + \bottomrule + \end{tabular}""" + + print(latex_code) + + assert check_only_ascii(latex_code), 'Some characters are not ASCII' + assert normalize_latex_code(ref_latex_code) == normalize_latex_code(latex_code) + + +if __name__ == '__main__': + test_latex_from_html() + test_latex_from_markdown() diff --git a/tests/test_rendering_example_doc.py b/tests/test_rendering_example_doc.py new file mode 100644 index 0000000..24ae886 --- /dev/null +++ b/tests/test_rendering_example_doc.py @@ -0,0 +1,108 @@ +import pyladoc +import matplotlib.pyplot as plt +import pandas as pd +import document_validation + +VALIDATE_HTML_CODE_ONLINE = False +WRITE_RESULT_FILES = True + + +def make_document(): + dw = pyladoc.DocumentWriter() + + dw.add_markdown(""" + # Special characters + + ö ä ü Ö Ä Ü ß @ ∆ + + π ≈ ± ∆ Σ + + £ ¥ $ € + + Œ + + # Link + + This is a hyperlink: [nonan.net](https://www.nonan.net) + + # Table + + | Anz.| Typ | Beschreibung + |----:|----------|------------------------------------ + | 12 | BK9050 | Buskoppler + | 2 | KL1104 | 4 Digitaleingänge + | 2 | KL2404 | 4 Digitalausgänge (0,5 A) + | 3 | KL2424 | 4 Digitalausgänge (2 A) + | 2 | KL4004 | 4 Analogausgänge + | 1 | KL4002 | 2 Analogausgänge + | 22 | KL9188 | Potenzialverteilungsklemme + | 1 | KL9100 | Potenzialeinspeiseklemme + | 3 | KL3054 | 4 Analogeingänge + | 5 | KL3214 | PT100 4 Temperatureingänge (3-Leiter) + | 3 | KL3202 | PT100 2 Temperatureingänge (3-Leiter) + | 1 | KL2404 | 4 Digitalausgänge + | 2 | KL9010 | Endklemme + + --- + + # Equations + + This line represents a reference to the equation @eq:test1. 
+ """) + + dw.add_equation(r'y = a + b * \sum_{i=0}^{\infty} a_i x^i', 'test1') + + # Figure + fig, ax = plt.subplots() + + fruits = ['apple', 'blueberry', 'cherry', 'orange'] + counts = [40, 100, 30, 55] + bar_labels = ['red', 'blue', '_red', 'orange'] + bar_colors = ['tab:red', 'tab:blue', 'tab:red', 'tab:orange'] + + ax.bar(fruits, counts, label=bar_labels, color=bar_colors) + ax.set_ylabel('fruit supply') + ax.set_title('Fruit supply by kind and color') + ax.legend(title='Fruit color') + + dw.add_diagram(fig, 'Bar chart with individual bar colors') + + # Table + mydataset = { + 'Row1': ["Line1", "Line2", "Line3", "Line4", "Line5"], + 'Row2': [120, '95 km/h', 110, '105 km/h', 130], + 'Row3': ['12 g/km', '> 150 g/km', '110 g/km', '1140 g/km', '13.05 g/km'], + 'Row4': ['5 stars', '4 stars', '5 stars', '4.5 stars', '5 stars'], + 'Row5': [3.5, 7.8, 8.5, 6.9, 4.2], + 'Row6': ['1850 kg', '1500 kg', '1400 kg', '1600 kg', '1700 kg'], + 'Row7': ['600 Nm', '250 Nm', '280 Nm', '320 Nm', '450 Nm'] + } + df = pd.DataFrame(mydataset) + + dw.add_table(df.style.hide(axis="index"), 'This is a example table', 'example1') + + return dw + + +def test_html_render(): + doc = make_document() + html_code = doc.to_html() + + document_validation.validate_html(html_code, VALIDATE_HTML_CODE_ONLINE) + + if WRITE_RESULT_FILES: + with open('tests/out/test_html_render.html', 'w', encoding='utf-8') as f: + f.write(pyladoc.inject_to_template(html_code, internal_template='templates/test_template.html')) + + +def test_latex_render(): + doc = make_document() + + # print(doc.to_latex()) + + assert doc.to_pdf('tests/out/test_latex_render.pdf', font_family='serif') + + +if __name__ == '__main__': + test_html_render() + test_latex_render() diff --git a/tests/test_rendering_markdown.py b/tests/test_rendering_markdown.py new file mode 100644 index 0000000..a602582 --- /dev/null +++ b/tests/test_rendering_markdown.py @@ -0,0 +1,152 @@ +import pyladoc +import document_validation + 
+VALIDATE_HTML_CODE_ONLINE = False +WRITE_RESULT_FILES = True + + +def test_markdown_styling(): + pyla = pyladoc.DocumentWriter() + pyla.add_markdown( + """ + Below is an in-depth explanation of the AArch64 (ARM64) + unconditional branch instruction—often simply called the + “B” instruction—and how its 26‐bit immediate field (imm26) + is laid out and later relocated during linking. + + --- + + ## Instruction Layout + + The unconditional branch in AArch64 is encoded in a 32‑bit + instruction. Its layout is as follows: + + ``` + Bits: 31 26 25 0 + +-------------+------------------------------+ + | Opcode | imm26 | + +-------------+------------------------------+ + ``` + + - **Opcode (bits 31:26):** + - For a plain branch (`B`), the opcode is `000101`. + - For a branch with link (`BL`), which saves the return + address (i.e., a call), the opcode is `100101`. + These 6 bits determine the instruction type. + + - **Immediate Field (imm26, bits 25:0):** + - This 26‑bit field holds a signed immediate value. + - **Offset Calculation:** At runtime, the processor: + 1. **Shifts** the 26‑bit immediate left by 2 bits. + (Because instructions are 4-byte aligned, + the two least-significant bits are always zero.) + 2. **Sign-extends** the resulting 28‑bit value to + the full register width (typically 64 bits). + 3. **Adds** this value to the program counter + (PC) to obtain the branch target. + + - **Reach:** + - With a 26‑bit signed field that’s effectively 28 bits + after the shift, the branch can cover a range + of approximately ±128 MB from the current instruction. 
+ """) + + html_code = pyla.to_html() + document_validation.validate_html(html_code, check_for=['strong', 'ol', 'li', 'code', 'hr']) + + if WRITE_RESULT_FILES: + with open('tests/out/test_markdown_style.html', 'w', encoding='utf-8') as f: + f.write(html_code) + + +def test_markdown_table(): + pyla = pyladoc.DocumentWriter() + pyla.add_markdown( + """ + ## Klemmen + + | Anz.| Typ | Beschreibung + |----:|----------|------------------------------------ + | 12 | BK9050 | Buskoppler + | 2 | KL1104 | 4 Digitaleingänge + | 2 | KL2404 | 4 Digitalausgänge (0,5 A) + | 3 | KL2424 | 4 Digitalausgänge (2 A) + | 2 | KL4004 | 4 Analogausgänge + | 1 | KL4002 | 2 Analogausgänge + | 22 | KL9188 | Potenzialverteilungsklemme + | 1 | KL9100 | Potenzialeinspeiseklemme + | 3 | KL3054 | 4 Analogeingänge + | 5 | KL3214 | PT100 4 Temperatureingänge (3-Leiter) + | 3 | KL3202 | PT100 2 Temperatureingänge (3-Leiter) + | 1 | KL2404 | 4 Digitalausgänge + | 2 | KL9010 | Endklemme + """) + + html_code = pyla.to_html() + document_validation.validate_html(html_code, check_for=['table']) + + if WRITE_RESULT_FILES: + with open('tests/out/test_markdown_table.html', 'w', encoding='utf-8') as f: + f.write(html_code) + + +def test_markdown_equations(): + pyla = pyladoc.DocumentWriter() + pyla.add_markdown( + """ + # Source Equations + 1. $4(3x + 2) - 5(x - 1) = 3x + 14$ + 2. $\frac{2y + 5}{4} + \frac{3y - 1}{2} = 5$ + 3. $\frac{5}{x + 2} + \frac{2}{x - 2} = 3$ + 4. $8(3b - 5) + 4(b + 2) = 60$ + 5. $2c^2 - 3c - 5 = 0$ + 6. $4(2d - 1) + 5(3d + 2) = 7d + 28$ + 7. $q^2 + 6q + 9 = 16$ + + # Result Equations + 1. $x = \frac{1}{4}$ + 2. $y = \frac{17}{8}$ + 3. $z = \frac{7}{3}$ + 4. $x = 1$ or $x = -6$ + 5. $a = \frac{1}{3}$ or $a = 2$ + 6. $x = -\frac{2}{3}$ or $x = 3$ + 7. $b = \frac{23}{7}$ + + # Step by Step + 1. Distribute: $12x + 8 - 5x + 5 = 3x + 14$ + 2. Combine like terms: $7x + 13 = 3x + 14$ + 3. Subtract $3x$: $4x + 13 = 14$ + 4. Subtract $13$: $4x = 1$ + 5. 
Divide by $4$: $x = \frac{1}{4}$ + """) + + html_code = pyla.to_html() + document_validation.validate_html(html_code, check_for=['h1']) + + if WRITE_RESULT_FILES: + with open('tests/out/test_markdown_equations.html', 'w', encoding='utf-8') as f: + f.write(html_code) + + +def test_markdown_characters(): + pyla = pyladoc.DocumentWriter() + pyla.add_markdown( + """ + # Special caracters + + Umlaute: ÖÄÜ öäü + + Other: ß, €, @, $, %, ~, µ + + Units: m³, cm² + + Controll characters: <, >, ", ', &, |, /, \\ + + """) + + html_code = pyla.to_html() + document_validation.validate_html(html_code, check_for=['h1']) + + if WRITE_RESULT_FILES: + with open('tests/out/test_markdown_characters.html', 'w', encoding='utf-8') as f: + f.write(html_code)