diff --git a/AUTHORS.rst b/AUTHORS.rst index 96fb3f5ad..29d0fc86c 100644 --- a/AUTHORS.rst +++ b/AUTHORS.rst @@ -1,6 +1,5 @@ .. SPDX-FileCopyrightText: 2017 Free Software Foundation Europe e.V. - SPDX-FileCopyrightText: 2017 Sebastian Schuberth SPDX-License-Identifier: CC-BY-SA-4.0 @@ -20,7 +19,7 @@ Contributors - Max Mehl - Nico Rikken - Florian Snow -- Sebastian Schuberth +- Sebastian Schuberth - Samuel Gaist - Diego Elio Pettenò - Matija Šuklje diff --git a/changelog.d/fixed/round-down-to-full-lines.md b/changelog.d/fixed/round-down-to-full-lines.md new file mode 100644 index 000000000..d393ea32c --- /dev/null +++ b/changelog.d/fixed/round-down-to-full-lines.md @@ -0,0 +1,4 @@ +- Text extraction is limited to a certain number of bytes, which could cut a + line of text in the middle of a license identifier, triggering bogus "Bad + licenses" errors. This is now avoided by "rounding down" extracted text to + full lines of text. diff --git a/src/reuse/extract.py b/src/reuse/extract.py index 34bcd0faf..f8835efc1 100644 --- a/src/reuse/extract.py +++ b/src/reuse/extract.py @@ -8,6 +8,7 @@ # SPDX-FileCopyrightText: 2023 Johannes Zarl-Zierl # SPDX-FileCopyrightText: 2024 Rivos Inc. # SPDX-FileCopyrightText: 2024 Skyler Grey +# SPDX-FileCopyrightText: 2025 Double Open Oy # SPDX-FileCopyrightText: © 2020 Liferay, Inc. # # SPDX-License-Identifier: GPL-3.0-or-later @@ -118,7 +119,16 @@ def decoded_text_from_binary( size = -1 rawdata = binary_file.read(size) result = rawdata.decode("utf-8", errors="replace") - return result.replace("\r\n", "\n") + norm_str = result.replace("\r\n", "\n") + if len(rawdata) != size: + return norm_str + # If we have read exactly *size* bytes, we might have cut off in the + # middle of a line. To avoid returning a partial line, we only return + # complete lines.
+ newline_pos = norm_str.rfind("\n") + if newline_pos in (-1, len(norm_str) - 1): + return norm_str + return norm_str[:newline_pos] def _contains_snippet(binary_file: BinaryIO) -> bool: diff --git a/tests/test_extract.py b/tests/test_extract.py index 46bcad68e..5ce1e913c 100644 --- a/tests/test_extract.py +++ b/tests/test_extract.py @@ -4,6 +4,7 @@ # SPDX-FileCopyrightText: 2022 Nico Rikken # SPDX-FileCopyrightText: 2022 Pietro Albini # SPDX-FileCopyrightText: 2024 Rivos Inc. +# SPDX-FileCopyrightText: 2025 Double Open Oy # SPDX-FileCopyrightText: © 2020 Liferay, Inc. # # SPDX-License-Identifier: GPL-3.0-or-later @@ -391,6 +392,14 @@ def test_decoded_text_from_binary_crlf(): assert decoded_text_from_binary(BytesIO(encoded)) == "Hello\nworld" +def test_decoded_text_from_binary_round_down(): + """Given a cropped line of text, only take full lines.""" + text = "Hello\nbeautiful\nworld" + encoded = text.encode("utf-8") + result = decoded_text_from_binary(BytesIO(encoded), size=len(encoded)) + assert result == "Hello\nbeautiful" + + def test_detect_line_endings_windows(): """Given a CRLF string, detect the line endings.""" assert detect_line_endings("hello\r\nworld") == "\r\n"