Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions AUTHORS.rst
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
..
SPDX-FileCopyrightText: 2017 Free Software Foundation Europe e.V. <https://fsfe.org>
SPDX-FileCopyrightText: 2017 Sebastian Schuberth <[email protected]>

SPDX-License-Identifier: CC-BY-SA-4.0

Expand All @@ -20,7 +19,7 @@ Contributors
- Max Mehl <[email protected]>
- Nico Rikken <[email protected]>
- Florian Snow <[email protected]>
- Sebastian Schuberth <schuberth@fsfe.org>
- Sebastian Schuberth <sebastian@doubleopen.org>
- Samuel Gaist <[email protected]>
- Diego Elio Pettenò <[email protected]>
- Matija Šuklje <[email protected]>
Expand Down
4 changes: 4 additions & 0 deletions changelog.d/fixed/round-down-to-full-lines.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
- Text extraction is limited to a certain number of bytes, so the cut-off could
fall in the middle of a line and split a license identifier, triggering bogus
"Bad licenses" errors. This is now avoided by "rounding down" the extracted
text to full lines.
12 changes: 11 additions & 1 deletion src/reuse/extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
# SPDX-FileCopyrightText: 2023 Johannes Zarl-Zierl <[email protected]>
# SPDX-FileCopyrightText: 2024 Rivos Inc.
# SPDX-FileCopyrightText: 2024 Skyler Grey <[email protected]>
# SPDX-FileCopyrightText: 2025 Double Open Oy
# SPDX-FileCopyrightText: © 2020 Liferay, Inc. <https://liferay.com>
#
# SPDX-License-Identifier: GPL-3.0-or-later
Expand Down Expand Up @@ -118,7 +119,16 @@ def decoded_text_from_binary(
size = -1
rawdata = binary_file.read(size)
result = rawdata.decode("utf-8", errors="replace")
return result.replace("\r\n", "\n")
norm_str = result.replace("\r\n", "\n")
if len(rawdata) != size:
return norm_str
# If we have read exactly *size* bytes, we might have cut off in the
# middle of a line. To avoid returning a partial line, we only return
# complete lines.
newline_pos = norm_str.rfind("\n")
if newline_pos in (-1, len(norm_str) - 1):
return norm_str
return norm_str[:newline_pos]


def _contains_snippet(binary_file: BinaryIO) -> bool:
Expand Down
9 changes: 9 additions & 0 deletions tests/test_extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
# SPDX-FileCopyrightText: 2022 Nico Rikken <[email protected]>
# SPDX-FileCopyrightText: 2022 Pietro Albini <[email protected]>
# SPDX-FileCopyrightText: 2024 Rivos Inc.
# SPDX-FileCopyrightText: 2025 Double Open Oy
# SPDX-FileCopyrightText: © 2020 Liferay, Inc. <https://liferay.com>
#
# SPDX-License-Identifier: GPL-3.0-or-later
Expand Down Expand Up @@ -391,6 +392,14 @@ def test_decoded_text_from_binary_crlf():
assert decoded_text_from_binary(BytesIO(encoded)) == "Hello\nworld"


def test_decoded_text_from_binary_round_down():
    """When exactly *size* bytes are read, the trailing partial line is cut."""
    payload = "Hello\nbeautiful\nworld".encode("utf-8")
    stream = BytesIO(payload)
    # Reading exactly len(payload) bytes takes the "might be cropped" path,
    # so everything after the last newline is expected to be discarded.
    decoded = decoded_text_from_binary(stream, size=len(payload))
    assert decoded == "Hello\nbeautiful"


def test_detect_line_endings_windows():
    """A string containing CRLF is reported as using CRLF line endings."""
    crlf_text = "hello\r\nworld"
    detected = detect_line_endings(crlf_text)
    assert detected == "\r\n"
Expand Down
Loading