diff --git a/.pre-commit-hooks.yaml b/.pre-commit-hooks.yaml
index 44a8648..efad792 100644
--- a/.pre-commit-hooks.yaml
+++ b/.pre-commit-hooks.yaml
@@ -64,6 +64,12 @@
     entry: pretty-format-json
     language: python
     types: [json]
+-   id: check-html
+    name: check html
+    description: checks html files for parseable syntax.
+    entry: check-html
+    language: python
+    types: [html]
 -   id: check-merge-conflict
     name: check for merge conflicts
     description: checks for files that contain merge conflict strings.
diff --git a/README.md b/README.md
index 97bfba6..0db3bf0 100644
--- a/README.md
+++ b/README.md
@@ -51,6 +51,9 @@ Checks for a common error of placing code before the docstring.
 #### `check-executables-have-shebangs`
 Checks that non-binary executables have a proper shebang.
 
+#### `check-html`
+Attempts to load all HTML files to verify syntax.
+
 #### `check-illegal-windows-names`
 Check for files that cannot be created on Windows.
 
diff --git a/pre_commit_hooks/check_html.py b/pre_commit_hooks/check_html.py
new file mode 100644
index 0000000..9801a34
--- /dev/null
+++ b/pre_commit_hooks/check_html.py
@@ -0,0 +1,107 @@
+from __future__ import annotations
+
+import argparse
+import collections
+from html.parser import HTMLParser
+from typing import Sequence
+
+# Elements that never take a closing tag in HTML, so they are not
+# tracked on the open-element stack.
+VOID_ELEMENTS = frozenset((
+    'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
+    'link', 'meta', 'source', 'track', 'wbr',
+))
+
+
+class ValidationException(Exception):
+    """Raised when the tags of an HTML document are not balanced."""
+
+
+class ValidatingHTMLParser(HTMLParser):
+    """HTML parser that checks every opened element is closed in order.
+
+    Open elements are tracked on a stack; a mismatched or surplus closing
+    tag, or an element still open at EOF, raises ``ValidationException``.
+    """
+
+    def __init__(self) -> None:
+        # Zero-argument super() so that HTMLParser.__init__ itself runs;
+        # super(HTMLParser, self) would skip it and leave the parser
+        # state that feed() relies on uninitialised.
+        super().__init__()
+        self.stack: collections.deque[str] = collections.deque()
+
+    def handle_starttag(
+            self, tag: str,
+            attrs: list[tuple[str, str | None]],
+    ) -> None:
+        """Record ``tag`` as open unless it is a void element."""
+        if tag not in VOID_ELEMENTS:
+            self.stack.append(tag)
+
+    def handle_endtag(self, tag: str) -> None:
+        """Close the innermost open element, which must be ``tag``.
+
+        Raises:
+            ValidationException: if nothing is open or the innermost
+                open element is not ``tag``.
+        """
+        if tag in VOID_ELEMENTS:
+            # void elements are never on the stack (e.g. a stray </br>)
+            return
+        if not self.stack:
+            raise ValidationException(f"no opening tag for </{tag}>")
+        opening_tag = self.stack.pop()
+        if opening_tag != tag:
+            stack = '/'.join(self.stack)
+            raise ValidationException(
+                f"attempt to close {opening_tag} with {tag} at /{stack}",
+            )
+
+    def handle_startendtag(
+            self, tag: str,
+            attrs: list[tuple[str, str | None]],
+    ) -> None:
+        """Ignore self-closing tags; they open and close at once."""
+
+    def close(self) -> None:
+        """Finish parsing and fail if any element is still open.
+
+        Raises:
+            ValidationException: if an element is still open at EOF.
+        """
+        super().close()
+        if self.stack:
+            opening_tag = self.stack.pop()
+            stack = '/'.join(self.stack)
+            raise ValidationException(
+                f"EOF reached while {opening_tag} at /{stack} not closed",
+            )
+
+
+def main(argv: Sequence[str] | None = None) -> int:
+    """Check each given file; return 1 if any has unbalanced tags, else 0."""
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        'filenames',
+        nargs='*',
+        help='HTML filenames to check.',
+    )
+    args = parser.parse_args(argv)
+
+    retval = 0
+    for filename in args.filenames:
+        try:
+            with open(filename, 'rb') as html_file:
+                html_parser = ValidatingHTMLParser()
+                # non-ASCII bytes are dropped; only tag nesting is checked
+                html_parser.feed(html_file.read().decode('ascii', 'ignore'))
+                html_parser.close()
+        except ValidationException as exc:
+            print(f'{filename}: Failed to parse HTML: ({exc})')
+            retval = 1
+    return retval
+
+
+if __name__ == '__main__':
+    raise SystemExit(main())
diff --git a/setup.cfg b/setup.cfg
index 82a5457..0813555 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -37,6 +37,7 @@ console_scripts =
     check-case-conflict = pre_commit_hooks.check_case_conflict:main
     check-docstring-first = pre_commit_hooks.check_docstring_first:main
     check-executables-have-shebangs = pre_commit_hooks.check_executables_have_shebangs:main
+    check-html = pre_commit_hooks.check_html:main
     check-json = pre_commit_hooks.check_json:main
     check-merge-conflict = pre_commit_hooks.check_merge_conflict:main
     check-shebang-scripts-are-executable = pre_commit_hooks.check_shebang_scripts_are_executable:main
diff --git a/testing/resources/bad_html_not_closed.html b/testing/resources/bad_html_not_closed.html
new file mode 100644
index 0000000..9044464
--- /dev/null
+++ b/testing/resources/bad_html_not_closed.html
@@ -0,0 +1 @@
+
This is my page
+
+
diff --git a/tests/check_html_test.py b/tests/check_html_test.py
new file mode 100644
index 0000000..719782c
--- /dev/null
+++ b/tests/check_html_test.py
@@ -0,0 +1,22 @@
+from __future__ import annotations
+
+import pytest
+
+from pre_commit_hooks.check_html import main
+from testing.util import get_resource_path
+
+
+@pytest.mark.parametrize(
+    ('filename', 'expected_retval'),
+    [
+        ('bad_html_not_closed.html', 1),
+        ('bad_html_too_many_close.html', 1),
+        ('bad_html_wrong_close.html', 1),
+        ('ok_html_fragment.html', 0),
+        ('ok_html_page.html', 0),
+    ],
+)
+def test_main(filename, expected_retval):
+    """main() returns the expected exit status for each resource file."""
+    resource = get_resource_path(filename)
+    assert main([resource]) == expected_retval