Add check for unicode replacement characters

This commit is contained in:
Ville Skyttä 2020-02-06 18:43:41 +02:00
parent a18c5af5d4
commit 9e417077fc
5 changed files with 55 additions and 0 deletions

View file

@ -22,6 +22,12 @@
entry: check-byte-order-marker
language: python
types: [text]
- id: check-unicode-replacement-char
name: Check for Unicode replacement character
description: Forbid files which have a UTF-8 Unicode replacement character
entry: check-unicode-replacement-char
language: python
types: [text]
- id: check-builtin-literals
name: Check builtin type constructor use
description: Require literal syntax when initializing empty or zero Python builtin types.

View file

@ -62,6 +62,9 @@ Checks for symlinks which do not point to anything.
#### `check-toml`
Attempts to load all TOML files to verify syntax.
#### `check-unicode-replacement-char`
Forbids files which contain a UTF-8 encoded Unicode replacement character (U+FFFD).
#### `check-vcs-permalinks`
Ensures that links to vcs websites are permalinks.

View file

@ -0,0 +1,32 @@
import argparse
import fileinput
from typing import Optional
from typing import Sequence
def main(argv: Optional[Sequence[str]] = None) -> int:
    """Report lines that contain a UTF-8 encoded U+FFFD replacement character.

    Files are scanned as raw bytes, line by line.  For every line holding the
    byte sequence ``EF BF BD`` one message is printed, giving the file name,
    the line number within that file and the byte offset of the first
    occurrence on the line.

    Args:
        argv: Command line arguments (file names); ``None`` lets argparse
            read ``sys.argv[1:]``.

    Returns:
        1 if any file contains the replacement character, otherwise 0.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('filenames', nargs='*', help='Filenames to check')
    args = parser.parse_args(argv)

    # fileinput silently falls back to reading stdin when it is given an
    # empty file list; with nothing to check the hook should just succeed.
    if not args.filenames:
        return 0

    retv = 0
    # The context manager closes the current file even if reading fails.
    with fileinput.input(files=args.filenames, mode='rb') as lines:
        for line in lines:
            col = line.find(b'\xEF\xBF\xBD')
            if col == -1:
                continue
            retv = 1
            # Not saying filename:line:col: because that kind of format is
            # usually used for character offsets, and we have a byte offset
            # which might be different, emphasize that.
            # filelineno() restarts at 1 for each file, whereas lineno() keeps
            # counting across all files and would misreport every file but
            # the first.
            print(
                f'{lines.filename()}:{lines.filelineno()}: '
                f'UTF-8 Unicode replacement character at byte {col}'
            )
    return retv
if __name__ == '__main__':
    # The bare exit() helper is injected by the site module for interactive
    # use and is absent under `python -S`; raising SystemExit always works.
    raise SystemExit(main())

View file

@ -34,6 +34,7 @@ console_scripts =
check-ast = pre_commit_hooks.check_ast:main
check-builtin-literals = pre_commit_hooks.check_builtin_literals:main
check-byte-order-marker = pre_commit_hooks.check_byte_order_marker:main
check-unicode-replacement-char = pre_commit_hooks.check_unicode_replacement_char:main
check-case-conflict = pre_commit_hooks.check_case_conflict:main
check-docstring-first = pre_commit_hooks.check_docstring_first:main
check-executables-have-shebangs = pre_commit_hooks.check_executables_have_shebangs:main

View file

@ -0,0 +1,13 @@
from pre_commit_hooks import check_unicode_replacement_char
def test_failure(tmpdir):
    """A file containing U+FFFD makes the hook return 1."""
    path = tmpdir.join('f.txt')
    # Decoding the invalid byte 0x80 with errors='replace' yields U+FFFD.
    contents = str(b'\x80abc', errors='replace')
    path.write_text(contents, encoding='utf-8')
    status = check_unicode_replacement_char.main((path.strpath,))
    assert status == 1
def test_success(tmpdir):
    """A file without U+FFFD makes the hook return 0."""
    path = tmpdir.join('f.txt')
    # errors='backslashreplace' turns the invalid byte into the literal text
    # "\x80", so no replacement character ends up in the file.
    contents = str(b'\x80abc', errors='backslashreplace')
    path.write_text(contents, encoding='utf-8')
    status = check_unicode_replacement_char.main((path.strpath,))
    assert status == 0