fsfe · sschuberth · Aug 28, 2025 · Aug 28, 2025
diff --git a/AUTHORS.rst b/AUTHORS.rst
@@ -1,6 +1,5 @@
 ..
   SPDX-FileCopyrightText: 2017 Free Software Foundation Europe e.V. <https://fsfe.org>
-  SPDX-FileCopyrightText: 2017 Sebastian Schuberth <[email protected]>
 
   SPDX-License-Identifier: CC-BY-SA-4.0
 
@@ -20,7 +19,7 @@ Contributors
 - Max Mehl <[email protected]>
 - Nico Rikken <[email protected]>
 - Florian Snow <[email protected]>
-- Sebastian Schuberth <schuberth@fsfe.org>
+- Sebastian Schuberth <sebastian@doubleopen.org>
 - Samuel Gaist <[email protected]>
 - Diego Elio Pettenò <[email protected]>
 - Matija Šuklje <[email protected]>

diff --git a/changelog.d/fixed/round-down-to-full-lines.md b/changelog.d/fixed/round-down-to-full-lines.md
@@ -0,0 +1,4 @@
+- Text extraction is limited to a certain number of bytes, which could cut a
+  line of text in the middle of a license identifier, triggering bogus "Bad
+  licenses" errors. This is now avoided by "rounding down" the extracted text
+  to full lines.
diff --git a/src/reuse/extract.py b/src/reuse/extract.py
@@ -8,6 +8,7 @@
 # SPDX-FileCopyrightText: 2023 Johannes Zarl-Zierl <[email protected]>
 # SPDX-FileCopyrightText: 2024 Rivos Inc.
 # SPDX-FileCopyrightText: 2024 Skyler Grey <[email protected]>
+# SPDX-FileCopyrightText: 2025 Double Open Oy
 # SPDX-FileCopyrightText: © 2020 Liferay, Inc. <https://liferay.com>
 #
 # SPDX-License-Identifier: GPL-3.0-or-later
@@ -118,7 +119,16 @@ def decoded_text_from_binary(
         size = -1
     rawdata = binary_file.read(size)
     result = rawdata.decode("utf-8", errors="replace")
-    return result.replace("\r\n", "\n")
+    norm_str = result.replace("\r\n", "\n")
+    if len(rawdata) != size:
+        return norm_str
+    # If we have read exactly *size* bytes, we might have cut off in the
+    # middle of a line. To avoid returning a partial line, we only return
+    # complete lines.
+    newline_pos = norm_str.rfind("\n")
+    if newline_pos in (-1, len(norm_str) - 1):
+        return norm_str
+    return norm_str[:newline_pos]
 
 
 def _contains_snippet(binary_file: BinaryIO) -> bool:

diff --git a/tests/test_extract.py b/tests/test_extract.py
@@ -4,6 +4,7 @@
 # SPDX-FileCopyrightText: 2022 Nico Rikken <[email protected]>
 # SPDX-FileCopyrightText: 2022 Pietro Albini <[email protected]>
 # SPDX-FileCopyrightText: 2024 Rivos Inc.
+# SPDX-FileCopyrightText: 2025 Double Open Oy
 # SPDX-FileCopyrightText: © 2020 Liferay, Inc. <https://liferay.com>
 #
 # SPDX-License-Identifier: GPL-3.0-or-later
@@ -391,6 +392,14 @@ def test_decoded_text_from_binary_crlf():
     assert decoded_text_from_binary(BytesIO(encoded)) == "Hello\nworld"
 
 
+def test_decoded_text_from_binary_round_down():
+    """Given a cropped line of text, only take full lines."""
+    text = "Hello\nbeautiful\nworld"
+    encoded = text.encode("utf-8")
+    result = decoded_text_from_binary(BytesIO(encoded), size=len(encoded))
+    assert result == "Hello\nbeautiful"
+
+
 def test_detect_line_endings_windows():
     """Given a CRLF string, detect the line endings."""
     assert detect_line_endings("hello\r\nworld") == "\r\n"