From 716c0cabcce07e6842db3e8c4cc8499ea8d1a960 Mon Sep 17 00:00:00 2001 From: Sebastian Schuberth Date: Thu, 28 Aug 2025 16:36:05 +0200 Subject: [PATCH 1/2] Fixup my own entry in `AUTHORS.rst` Something went wrong in the Git history as I never was at FSFE, but still my past commits 1b428ef3970c82f1ed34b08536381806e74f091d a0ebb35a69021b68d1245e346042a531beb9a903 were made under that address. Fixup the address to my real current one, and remove myself from the header of this file. --- AUTHORS.rst | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/AUTHORS.rst b/AUTHORS.rst index 96fb3f5a..29d0fc86 100644 --- a/AUTHORS.rst +++ b/AUTHORS.rst @@ -1,6 +1,5 @@ .. SPDX-FileCopyrightText: 2017 Free Software Foundation Europe e.V. - SPDX-FileCopyrightText: 2017 Sebastian Schuberth SPDX-License-Identifier: CC-BY-SA-4.0 @@ -20,7 +19,7 @@ Contributors - Max Mehl - Nico Rikken - Florian Snow -- Sebastian Schuberth +- Sebastian Schuberth - Samuel Gaist - Diego Elio Pettenò - Matija Šuklje From 96e1263e78c093e2a7de6e86799e35bd42fd3249 Mon Sep 17 00:00:00 2001 From: Sebastian Schuberth Date: Thu, 28 Aug 2025 16:25:42 +0200 Subject: [PATCH 2/2] Round down decoded text to full lines This prevents text cropped in the middle of a line from triggering bogus "Bad licenses" errors. Fixes #793. --- changelog.d/fixed/round-down-to-full-lines.md | 4 ++++ src/reuse/extract.py | 12 +++++++++++- tests/test_extract.py | 9 +++++++++ 3 files changed, 24 insertions(+), 1 deletion(-) create mode 100644 changelog.d/fixed/round-down-to-full-lines.md diff --git a/changelog.d/fixed/round-down-to-full-lines.md b/changelog.d/fixed/round-down-to-full-lines.md new file mode 100644 index 00000000..d393ea32 --- /dev/null +++ b/changelog.d/fixed/round-down-to-full-lines.md @@ -0,0 +1,4 @@ +- Text extraction is limited to a certain number of bytes, which could happen to + cut a text line in between a license identifier, triggering bogus "Bad + licenses" errors. 
This is now avoided by "rounding down" extracted text to + full lines of text. diff --git a/src/reuse/extract.py b/src/reuse/extract.py index 34bcd0fa..f8835efc 100644 --- a/src/reuse/extract.py +++ b/src/reuse/extract.py @@ -8,6 +8,7 @@ # SPDX-FileCopyrightText: 2023 Johannes Zarl-Zierl # SPDX-FileCopyrightText: 2024 Rivos Inc. # SPDX-FileCopyrightText: 2024 Skyler Grey +# SPDX-FileCopyrightText: 2025 Double Open Oy # SPDX-FileCopyrightText: © 2020 Liferay, Inc. # # SPDX-License-Identifier: GPL-3.0-or-later @@ -118,7 +119,16 @@ def decoded_text_from_binary( size = -1 rawdata = binary_file.read(size) result = rawdata.decode("utf-8", errors="replace") - return result.replace("\r\n", "\n") + norm_str = result.replace("\r\n", "\n") + if len(rawdata) != size: + return norm_str + # If we have read exactly *size* bytes, we might have cut off in the + # middle of a line. To avoid returning a partial line, we only return + # complete lines. + newline_pos = norm_str.rfind("\n") + if newline_pos in (-1, len(norm_str) - 1): + return norm_str + return norm_str[:newline_pos] def _contains_snippet(binary_file: BinaryIO) -> bool: diff --git a/tests/test_extract.py b/tests/test_extract.py index 46bcad68..5ce1e913 100644 --- a/tests/test_extract.py +++ b/tests/test_extract.py @@ -4,6 +4,7 @@ # SPDX-FileCopyrightText: 2022 Nico Rikken # SPDX-FileCopyrightText: 2022 Pietro Albini # SPDX-FileCopyrightText: 2024 Rivos Inc. +# SPDX-FileCopyrightText: 2025 Double Open Oy # SPDX-FileCopyrightText: © 2020 Liferay, Inc. 
# # SPDX-License-Identifier: GPL-3.0-or-later @@ -391,6 +392,14 @@ def test_decoded_text_from_binary_crlf(): assert decoded_text_from_binary(BytesIO(encoded)) == "Hello\nworld" +def test_decoded_text_from_binary_round_down(): + """Given a cropped line of text, only take full lines.""" + text = "Hello\nbeautiful\nworld" + encoded = text.encode("utf-8") + result = decoded_text_from_binary(BytesIO(encoded), size=len(encoded)) + assert result == "Hello\nbeautiful" + + def test_detect_line_endings_windows(): """Given a CRLF string, detect the line endings.""" assert detect_line_endings("hello\r\nworld") == "\r\n"