From 5ee061b810caf03025927f2e8fc7fb15fdcabd58 Mon Sep 17 00:00:00 2001
From: Ian Cordasco <graffatcolmingov@gmail.com>
Date: Tue, 23 Feb 2016 11:17:11 -0600
Subject: [PATCH] Add line splitting and file reading

Add some tests around reading lines and stripping UTF BOMs
---
 flake8/checker.py               | 68 +++++++++++++++++++++++++--------
 tests/unit/test_file_checker.py | 26 +++++++++++++
 2 files changed, 79 insertions(+), 15 deletions(-)
 create mode 100644 tests/unit/test_file_checker.py

diff --git a/flake8/checker.py b/flake8/checker.py
index f400f1a..33e58d4 100644
--- a/flake8/checker.py
+++ b/flake8/checker.py
@@ -187,27 +187,65 @@ class FileChecker(object):
             return self.read_lines_from_stdin()
         return self.read_lines_from_filename()
 
-    def read_lines_from_stdin(self):
-        """Read the lines from standard in."""
-        return utils.stdin_get_value().splitlines(True)
+    def _readlines_py2(self):
+        with open(self.filename, 'rU') as fd:
+            return fd.readlines()
+
+    def _readlines_py3(self):
+        try:
+            with open(self.filename, 'rb') as fd:
+                (coding, lines) = tokenize.detect_encoding(fd.readline)
+                textfd = io.TextIOWrapper(fd, coding, line_buffering=True)
+                return ([l.decode(coding) for l in lines] +
+                        textfd.readlines())
+        except (LookupError, SyntaxError, UnicodeError):
+            # If we can't detect the codec with tokenize.detect_encoding, or
+            # the detected encoding is incorrect, just fall back to latin-1.
+            with open(self.filename, encoding='latin-1') as fd:
+                return fd.readlines()
 
     def read_lines_from_filename(self):
         """Read the lines for a file."""
         if (2, 6) <= sys.version_info < (3, 0):
-            with open(self.filename, 'rU') as fd:
-                return fd.readlines()
-
+            readlines = self._readlines_py2
         elif (3, 0) <= sys.version_info < (4, 0):
-            try:
-                with open(self.filename, 'rb') as fd:
-                    (coding, lines) = tokenize.detect_encoding(fd.readline)
-                    textfd = io.TextIOWrapper(fd, coding, line_buffering=True)
-                    return ([l.decode(coding) for l in lines] +
-                            textfd.readlines())
-            except (LookupError, SyntaxError, UnicodeError):
-                with open(self.filename, encoding='latin-1') as fd:
-                    return fd.readlines()
+            readlines = self._readlines_py3
+
+        try:
+            return readlines()
+        except IOError:
+            # If we cannot read the file due to an IOError (e.g., the file
+            # does not exist or we do not have the permissions to open it)
+            # then we need to format that exception for the user.
+            # NOTE(sigmavirus24): Historically, pep8 has always reported this
+            # as an E902. We probably *want* a better error code for this
+            # going forward.
+            (exc_type, exception) = sys.exc_info()[:2]
+            message = '{0}: {1}'.format(exc_type.__name__, exception)
+            self.results.append('E902', self.filename, 0, 0, message)
+            return []
+
+    def read_lines_from_stdin(self):
+        """Read the lines from standard in."""
+        return utils.stdin_get_value().splitlines(True)
 
     def run_checks(self):
         """Run checks against the file."""
         self.lines = self.read_lines()
+        self.strip_utf_bom()
+
+    def strip_utf_bom(self):
+        """Strip the UTF bom from the lines of the file."""
+        if not self.lines:
+            # If we have nothing to analyze quit early
+            return
+
+        first_byte = ord(self.lines[0][0])
+        if first_byte not in (0xEF, 0xFEFF):
+            return
+
+        # Strip the BOM: one U+FEFF character or three raw UTF-8 bytes
+        if first_byte == 0xFEFF:
+            self.lines[0] = self.lines[0][1:]
+        elif self.lines[0][:3] == '\xEF\xBB\xBF':
+            self.lines[0] = self.lines[0][3:]
diff --git a/tests/unit/test_file_checker.py b/tests/unit/test_file_checker.py
new file mode 100644
index 0000000..e26bc83
--- /dev/null
+++ b/tests/unit/test_file_checker.py
@@ -0,0 +1,26 @@
+"""Tests for the FileChecker class."""
+from flake8 import checker
+
+import pytest
+
+
+def test_read_lines_splits_lines():
+    """Verify that read_lines splits the lines of the file."""
+    file_checker = checker.FileChecker(__file__, [])
+    lines = file_checker.read_lines()
+    assert len(lines) > 5
+    assert '"""Tests for the FileChecker class."""\n' in lines
+
+
+@pytest.mark.parametrize('first_line', [
+    '\xEF\xBB\xBF"""Module docstring."""\n',
+    '\uFEFF"""Module docstring."""\n',
+])
+def test_strip_utf_bom(first_line):
+    r"""Verify that we strip '\xEF\xBB\xBF' from the first line."""
+    lines = [first_line]
+    file_checker = checker.FileChecker('stdin', [])
+    file_checker.lines = lines[:]
+    file_checker.strip_utf_bom()
+    assert file_checker.lines != lines
+    assert file_checker.lines[0] == '"""Module docstring."""\n'