Check strings in jupyter notebooks

cyyc1 2022-10-05 02:42:08 -07:00
parent 6b03546fc3
commit cf898e2ce2
7 changed files with 528 additions and 42 deletions

pre_commit_hooks/string_fixer.py

@@ -1,57 +1,17 @@
 from __future__ import annotations

 import argparse
-import io
-import re
-import tokenize
 from typing import Sequence

-START_QUOTE_RE = re.compile('^[a-zA-Z]*"')
-
-
-def handle_match(token_text: str) -> str:
-    if '"""' in token_text or "'''" in token_text:
-        return token_text
-
-    match = START_QUOTE_RE.match(token_text)
-    if match is not None:
-        meat = token_text[match.end():-1]
-        if '"' in meat or "'" in meat:
-            return token_text
-        else:
-            return match.group().replace('"', "'") + meat + "'"
-    else:
-        return token_text
-
-
-def get_line_offsets_by_line_no(src: str) -> list[int]:
-    # Padded so we can index with line number
-    offsets = [-1, 0]
-    for line in src.splitlines(True):
-        offsets.append(offsets[-1] + len(line))
-    return offsets
+from pre_commit_hooks.util_string_fixer import fix_strings_in_file_contents


 def fix_strings(filename: str) -> int:
     with open(filename, encoding='UTF-8', newline='') as f:
         contents = f.read()
-    line_offsets = get_line_offsets_by_line_no(contents)
-
-    # Basically a mutable string
-    splitcontents = list(contents)
+    new_contents = fix_strings_in_file_contents(contents)
-
-    # Iterate in reverse so the offsets are always correct
-    tokens_l = list(tokenize.generate_tokens(io.StringIO(contents).readline))
-    tokens = reversed(tokens_l)
-    for token_type, token_text, (srow, scol), (erow, ecol), _ in tokens:
-        if token_type == tokenize.STRING:
-            new_text = handle_match(token_text)
-            splitcontents[
-                line_offsets[srow] + scol:
-                line_offsets[erow] + ecol
-            ] = new_text
-
-    new_contents = ''.join(splitcontents)
     if contents != new_contents:
         with open(filename, 'w', encoding='UTF-8', newline='') as f:
             f.write(new_contents)
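The diff above moves the rewriting logic into a shared fix_strings_in_file_contents helper so that this hook and the new notebook hook use the same code path. A minimal sketch of the helper's observable behavior (the sample inputs are illustrative, not from the commit):

from pre_commit_hooks.util_string_fixer import fix_strings_in_file_contents

# Double-quoted literals are rewritten to single quotes...
assert fix_strings_in_file_contents('x = "hi"\n') == "x = 'hi'\n"
# ...strings that already contain a quote character are left alone...
assert fix_strings_in_file_contents('x = "don\'t"\n') == 'x = "don\'t"\n'
# ...and triple-quoted strings are never touched.
assert fix_strings_in_file_contents('s = """hi"""\n') == 's = """hi"""\n'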

@@ -0,0 +1,51 @@
from __future__ import annotations

import argparse
import json
from typing import Sequence

from pre_commit_hooks.util_string_fixer import fix_strings_in_file_contents


def fix_strings(filename: str) -> int:
    with open(filename) as f:
        notebook_contents = json.load(f)

    cells = notebook_contents['cells']
    return_value = 0
    for cell in cells:
        if cell['cell_type'] == 'code':
            source_in_1_line = ''.join(cell['source'])
            fixed = fix_strings_in_file_contents(source_in_1_line)
            if fixed != source_in_1_line:
                fixed_lines = fixed.split('\n')
                cell['source'] = [_ + '\n' for _ in fixed_lines[:-1]] + [fixed_lines[-1]]
                return_value = 1

    if return_value == 1:
        notebook_contents['cells'] = cells
        with open(filename, 'w') as f:
            json.dump(notebook_contents, f, indent=1)
            f.write('\n')  # json.dump() does not write a trailing newline

    return return_value


def main(argv: Sequence[str] | None = None) -> int:
    parser = argparse.ArgumentParser()
    parser.add_argument('filenames', nargs='*', help='Filenames to fix')
    args = parser.parse_args(argv)

    retv = 0
    for filename in args.filenames:
        return_value = fix_strings(filename)
        if return_value != 0:
            print(f'Fixing strings in {filename}')
        retv |= return_value
    return retv


if __name__ == '__main__':
    raise SystemExit(main())
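The split/join dance in fix_strings exists because the .ipynb format stores a cell's source as a list of lines, each line except the last keeping its trailing '\n'. A small illustration of that round-trip invariant (the cell contents are hypothetical, and .replace stands in for the real fixer):

source = ['s = "a"\n', 's = "b"']   # a two-line code cell as stored in notebook JSON
joined = ''.join(source)            # 's = "a"\ns = "b"'
fixed = joined.replace('"', "'")    # stand-in for fix_strings_in_file_contents
lines = fixed.split('\n')
rebuilt = [line + '\n' for line in lines[:-1]] + [lines[-1]]
assert rebuilt == ["s = 'a'\n", "s = 'b'"]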

pre_commit_hooks/util_string_fixer.py

@@ -0,0 +1,52 @@
from __future__ import annotations

import io
import re
import tokenize

START_QUOTE_RE = re.compile('^[a-zA-Z]*"')


def handle_match(token_text: str) -> str:
    if '"""' in token_text or "'''" in token_text:
        return token_text

    match = START_QUOTE_RE.match(token_text)
    if match is not None:
        meat = token_text[match.end():-1]
        if '"' in meat or "'" in meat:
            return token_text
        else:
            return match.group().replace('"', "'") + meat + "'"
    else:
        return token_text


def get_line_offsets_by_line_no(src: str) -> list[int]:
    # Padded so we can index with line number
    offsets = [-1, 0]
    for line in src.splitlines(True):
        offsets.append(offsets[-1] + len(line))
    return offsets


def fix_strings_in_file_contents(contents: str) -> str:
    line_offsets = get_line_offsets_by_line_no(contents)

    # Basically a mutable string
    splitcontents = list(contents)

    # Iterate in reverse so the offsets are always correct
    tokens_l = list(tokenize.generate_tokens(io.StringIO(contents).readline))
    tokens = reversed(tokens_l)
    for token_type, token_text, (srow, scol), (erow, ecol), _ in tokens:
        if token_type == tokenize.STRING:
            new_text = handle_match(token_text)
            splitcontents[
                line_offsets[srow] + scol:
                line_offsets[erow] + ecol
            ] = new_text

    new_contents = ''.join(splitcontents)
    return new_contents
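START_QUOTE_RE allows an alphabetic prefix before the opening double quote, so b-, r-, f- and u-strings are rewritten with their prefix preserved. A few spot checks of handle_match (the inputs are illustrative, not from the commit's tests):

assert handle_match('"abc"') == "'abc'"
assert handle_match('b"abc"') == "b'abc'"        # prefix kept, quotes swapped
assert handle_match('f"{x}"') == "f'{x}'"
assert handle_match('"it\'s"') == '"it\'s"'      # inner quote: left unchanged
assert handle_match('"""doc"""') == '"""doc"""'  # triple quotes: left unchanged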