Check strings in jupyter notebooks

cyyc1 2022-10-05 02:42:08 -07:00
parent 6b03546fc3
commit cf898e2ce2
7 changed files with 528 additions and 42 deletions

pre_commit_hooks/string_fixer.py

@@ -1,57 +1,17 @@
 from __future__ import annotations

 import argparse
-import io
-import re
-import tokenize
 from typing import Sequence

-START_QUOTE_RE = re.compile('^[a-zA-Z]*"')
-
-
-def handle_match(token_text: str) -> str:
-    if '"""' in token_text or "'''" in token_text:
-        return token_text
-
-    match = START_QUOTE_RE.match(token_text)
-    if match is not None:
-        meat = token_text[match.end():-1]
-        if '"' in meat or "'" in meat:
-            return token_text
-        else:
-            return match.group().replace('"', "'") + meat + "'"
-    else:
-        return token_text
-
-
-def get_line_offsets_by_line_no(src: str) -> list[int]:
-    # Padded so we can index with line number
-    offsets = [-1, 0]
-    for line in src.splitlines(True):
-        offsets.append(offsets[-1] + len(line))
-    return offsets
+from pre_commit_hooks.util_string_fixer import fix_strings_in_file_contents


 def fix_strings(filename: str) -> int:
     with open(filename, encoding='UTF-8', newline='') as f:
         contents = f.read()
-    line_offsets = get_line_offsets_by_line_no(contents)
-
-    # Basically a mutable string
-    splitcontents = list(contents)
+    new_contents = fix_strings_in_file_contents(contents)
-
-    # Iterate in reverse so the offsets are always correct
-    tokens_l = list(tokenize.generate_tokens(io.StringIO(contents).readline))
-    tokens = reversed(tokens_l)
-    for token_type, token_text, (srow, scol), (erow, ecol), _ in tokens:
-        if token_type == tokenize.STRING:
-            new_text = handle_match(token_text)
-            splitcontents[
-                line_offsets[srow] + scol:
-                line_offsets[erow] + ecol
-            ] = new_text
-
-    new_contents = ''.join(splitcontents)
     if contents != new_contents:
         with open(filename, 'w', encoding='UTF-8', newline='') as f:
             f.write(new_contents)
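The diff above moves the rewriting logic into a shared fix_strings_in_file_contents helper so that this hook and the new notebook hook use the same code path. A minimal sketch of the helper's observable behavior (the sample inputs are illustrative, not from the commit):

from pre_commit_hooks.util_string_fixer import fix_strings_in_file_contents

# Double-quoted literals are rewritten to single quotes...
assert fix_strings_in_file_contents('x = "hi"\n') == "x = 'hi'\n"
# ...strings that already contain a quote character are left alone...
assert fix_strings_in_file_contents('x = "don\'t"\n') == 'x = "don\'t"\n'
# ...and triple-quoted strings are never touched.
assert fix_strings_in_file_contents('s = """hi"""\n') == 's = """hi"""\n'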

@@ -0,0 +1,51 @@
from __future__ import annotations

import argparse
import json
from typing import Sequence

from pre_commit_hooks.util_string_fixer import fix_strings_in_file_contents


def fix_strings(filename: str) -> int:
    with open(filename) as f:
        notebook_contents = json.load(f)

    cells = notebook_contents['cells']
    return_value = 0
    for cell in cells:
        if cell['cell_type'] == 'code':
            source_in_1_line = ''.join(cell['source'])
            fixed = fix_strings_in_file_contents(source_in_1_line)
            if fixed != source_in_1_line:
                fixed_lines = fixed.split('\n')
                cell['source'] = [_ + '\n' for _ in fixed_lines[:-1]] + [fixed_lines[-1]]
                return_value = 1

    if return_value == 1:
        notebook_contents['cells'] = cells
        with open(filename, 'w') as f:
            json.dump(notebook_contents, f, indent=1)
            f.write('\n')  # json.dump() does not write a trailing newline

    return return_value


def main(argv: Sequence[str] | None = None) -> int:
    parser = argparse.ArgumentParser()
    parser.add_argument('filenames', nargs='*', help='Filenames to fix')
    args = parser.parse_args(argv)

    retv = 0
    for filename in args.filenames:
        return_value = fix_strings(filename)
        if return_value != 0:
            print(f'Fixing strings in {filename}')
        retv |= return_value
    return retv


if __name__ == '__main__':
    raise SystemExit(main())
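The split/join dance in fix_strings exists because the .ipynb format stores a cell's source as a list of lines, each line except the last keeping its trailing '\n'. A small illustration of that round-trip invariant (the cell contents are hypothetical, and .replace stands in for the real fixer):

source = ['s = "a"\n', 's = "b"']   # a two-line code cell as stored in notebook JSON
joined = ''.join(source)            # 's = "a"\ns = "b"'
fixed = joined.replace('"', "'")    # stand-in for fix_strings_in_file_contents
lines = fixed.split('\n')
rebuilt = [line + '\n' for line in lines[:-1]] + [lines[-1]]
assert rebuilt == ["s = 'a'\n", "s = 'b'"]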

pre_commit_hooks/util_string_fixer.py

@@ -0,0 +1,52 @@
from __future__ import annotations

import io
import re
import tokenize

START_QUOTE_RE = re.compile('^[a-zA-Z]*"')


def handle_match(token_text: str) -> str:
    if '"""' in token_text or "'''" in token_text:
        return token_text

    match = START_QUOTE_RE.match(token_text)
    if match is not None:
        meat = token_text[match.end():-1]
        if '"' in meat or "'" in meat:
            return token_text
        else:
            return match.group().replace('"', "'") + meat + "'"
    else:
        return token_text


def get_line_offsets_by_line_no(src: str) -> list[int]:
    # Padded so we can index with line number
    offsets = [-1, 0]
    for line in src.splitlines(True):
        offsets.append(offsets[-1] + len(line))
    return offsets


def fix_strings_in_file_contents(contents: str) -> str:
    line_offsets = get_line_offsets_by_line_no(contents)

    # Basically a mutable string
    splitcontents = list(contents)

    # Iterate in reverse so the offsets are always correct
    tokens_l = list(tokenize.generate_tokens(io.StringIO(contents).readline))
    tokens = reversed(tokens_l)
    for token_type, token_text, (srow, scol), (erow, ecol), _ in tokens:
        if token_type == tokenize.STRING:
            new_text = handle_match(token_text)
            splitcontents[
                line_offsets[srow] + scol:
                line_offsets[erow] + ecol
            ] = new_text

    new_contents = ''.join(splitcontents)
    return new_contents
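START_QUOTE_RE allows an alphabetic prefix before the opening double quote, so b-, r-, f- and u-strings are rewritten with their prefix preserved. A few spot checks of handle_match (the inputs are illustrative, not from the commit's tests):

assert handle_match('"abc"') == "'abc'"
assert handle_match('b"abc"') == "b'abc'"        # prefix kept, quotes swapped
assert handle_match('f"{x}"') == "f'{x}'"
assert handle_match('"it\'s"') == '"it\'s"'      # inner quote: left unchanged
assert handle_match('"""doc"""') == '"""doc"""'  # triple quotes: left unchanged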