diff --git a/CHANGELOG.rst b/CHANGELOG.rst index ba41bd9de63..c8e7a8dff1b 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -197,6 +197,12 @@ License detection: See https://github.com/nexB/scancode-toolkit/issues/3219 +- A crash has been fixed in the ``--unknown-licenses`` option: a scan would + fail when this option was used without the ``--license-text`` option. + This now works correctly and is also better tested. + + See https://github.com/nexB/scancode-toolkit/issues/3343 + v31.2.5 - 2023-04-21 ---------------------------------- diff --git a/src/licensedcode/match_unknown.py b/src/licensedcode/match_unknown.py index 1525c59c7bb..b83a62a01e0 100644 --- a/src/licensedcode/match_unknown.py +++ b/src/licensedcode/match_unknown.py @@ -23,6 +23,7 @@ TRACE = False if TRACE: + use_print = True import logging import sys @@ -31,6 +32,9 @@ def logger_debug(*args): return logger.debug(' '.join(isinstance(a, str) and a or repr(a) for a in args)) + if use_print: + logger_debug = print + logging.basicConfig(stream=sys.stdout) logger.setLevel(logging.DEBUG) @@ -142,6 +146,10 @@ def match_unknowns( unknown_ngram_length=unknown_ngram_length, ) + # build match from merged matched ngrams + qspans = (Span(qstart, qend) for qstart, qend in matched_ngrams) + qspan = Span().union(*qspans) + if TRACE: tokens_by_tid = idx.tokens_by_tid @@ -149,15 +157,14 @@ def get_tokens(_toks): return (' '.join(tokens_by_tid[t] for t in _toks)) print('match_unknowns: matched_ngrams') - for qstart, qend, matched_toks in matched_ngrams: + + for qstart, qend in matched_ngrams: + _span = Span(qstart, qend) + _tokens = [query_tokens[qpos] for qpos in _span] print( ' ', 'qstart', qstart, 'qend', qend, - 'matched_toks', get_tokens(matched_toks)) - - # build match from merged matched ngrams - qspans = (Span(qstart, qend) for qstart, qend in matched_ngrams) - qspan = Span().union(*qspans) + 'matched_toks', get_tokens(_tokens)) if not qspan: return @@ -169,7 +176,8 @@ def get_tokens(_toks): match_len = 
len(qspan) if TRACE: - print('match_unknowns: matched_span:', get_tokens(matched_tokens)) + #print('match_unknowns: matched_span:', get_tokens(matched_tokens)) + print('match_unknowns: qspan, match_len, matched_span:', qspan, match_len, matched_tokens) # we use the query side to build the ispans ispan = Span(0, match_len) @@ -180,9 +188,8 @@ def get_tokens(_toks): try: match_start_line = line_by_pos[qspan.start] match_end_line = line_by_pos[qspan.end] - except: - print('empty span:', qspan) - raise + except Exception as e: + raise Exception('empty span:', qspan) from e text = ''.join(get_full_qspan_matched_text( match_qspan=qspan, diff --git a/src/licensedcode/models.py b/src/licensedcode/models.py index 760bfebe041..36e4241c68d 100644 --- a/src/licensedcode/models.py +++ b/src/licensedcode/models.py @@ -2542,14 +2542,12 @@ def __attrs_post_init__(self, *args, **kwargs): self.identifier = f'license-detection-unknown-{self._unique_id}' self.license_expression = UNKNOWN_LICENSE_KEY - # note that this could be shared across rules as an optimization + # TODO: this could be shared across rules as an optimization self.license_expression_object = self.licensing.parse(UNKNOWN_LICENSE_KEY) self.is_license_notice = True self.notes = 'Unknown license based on a composite of license words.' self.is_synthetic = True self.setup() - # called only for it's side effects - self.tokens() @attr.s(slots=True, repr=False) diff --git a/tests/licensedcode/data/match_unknown/apache-2.0.LICENSE b/tests/licensedcode/data/match_unknown/apache-2.0.LICENSE new file mode 100644 index 00000000000..56478ebf2ce --- /dev/null +++ b/tests/licensedcode/data/match_unknown/apache-2.0.LICENSE @@ -0,0 +1,32 @@ +--- +key: apache-2.0 +short_name: Apache 2.0 +name: Apache License 2.0 +category: Permissive +owner: Apache Software Foundation +homepage_url: http://www.apache.org/licenses/ +spdx_license_key: Apache-2.0 +--- + + 7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. \ No newline at end of file diff --git a/tests/licensedcode/data/match_unknown/index_mini/licenses/gpl-2.0-plus.LICENSE b/tests/licensedcode/data/match_unknown/index_mini/licenses/gpl-2.0-plus.LICENSE new file mode 100644 index 00000000000..4646985d4c9 --- /dev/null +++ b/tests/licensedcode/data/match_unknown/index_mini/licenses/gpl-2.0-plus.LICENSE @@ -0,0 +1,55 @@ +--- +key: gpl-2.0-plus +short_name: GPL 2.0 or later +name: GNU General Public License 2.0 or later +category: Copyleft +owner: Free Software Foundation (FSF) +homepage_url: http://www.gnu.org/licenses/old-licenses/gpl-2.0-standalone.html +spdx_license_key: GPL-2.0-or-later +--- + + +This program is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A +PARTICULAR PURPOSE. 
See the GNU General Public License for more details. + + +BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY +FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN +OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES +PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED +OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS +TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE +PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, +REPAIR OR CORRECTION. + + IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR +REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, +INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING +OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED +TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY +YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER +PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE +POSSIBILITY OF SUCH DAMAGES. + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +convey the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. 
+ + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + diff --git a/tests/licensedcode/data/match_unknown/index_mini/rules/gpl-2.0-plus_1.RULE b/tests/licensedcode/data/match_unknown/index_mini/rules/gpl-2.0-plus_1.RULE new file mode 100644 index 00000000000..258a3a44099 --- /dev/null +++ b/tests/licensedcode/data/match_unknown/index_mini/rules/gpl-2.0-plus_1.RULE @@ -0,0 +1,16 @@ +--- +license_expression: gpl-2.0-plus +is_license_notice: yes +--- + +License: + + This package is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This package is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
diff --git a/tests/licensedcode/data/match_unknown/unknown-license-expected.json b/tests/licensedcode/data/match_unknown/unknown-license-expected.json new file mode 100644 index 00000000000..c1a5d96ae66 --- /dev/null +++ b/tests/licensedcode/data/match_unknown/unknown-license-expected.json @@ -0,0 +1,40 @@ +{ + "license_detections": [ + { + "identifier": "unknown-b0897d47-1c91-9898-2364-2e4d1a34b6fd", + "license_expression": "unknown", + "detection_count": 1 + } + ], + "files": [ + { + "path": "unknown.txt", + "type": "file", + "detected_license_expression": "unknown", + "detected_license_expression_spdx": "LicenseRef-scancode-unknown", + "license_detections": [ + { + "license_expression": "unknown", + "matches": [ + { + "score": 86.89, + "start_line": 1, + "end_line": 10, + "matched_length": 53, + "match_coverage": 100.0, + "matcher": "6-unknown", + "license_expression": "unknown", + "rule_identifier": "license-detection-unknown-296da2cbc15d2bba73baa1359cda5fc8bf39b942", + "rule_relevance": 100, + "rule_url": null + } + ], + "identifier": "unknown-b0897d47-1c91-9898-2364-2e4d1a34b6fd" + } + ], + "license_clues": [], + "percentage_of_license_text": 86.89, + "scan_errors": [] + } + ] +} \ No newline at end of file diff --git a/tests/licensedcode/data/match_unknown/unknown-license-text-expected.json b/tests/licensedcode/data/match_unknown/unknown-license-text-expected.json new file mode 100644 index 00000000000..36b30c7f43b --- /dev/null +++ b/tests/licensedcode/data/match_unknown/unknown-license-text-expected.json @@ -0,0 +1,45 @@ +{ + "license_detections": [ + { + "identifier": "unknown-b0897d47-1c91-9898-2364-2e4d1a34b6fd", + "license_expression": "unknown", + "detection_count": 1, + "detection_log": [] + } + ], + "files": [ + { + "path": "unknown.txt", + "type": "file", + "detected_license_expression": "unknown", + "detected_license_expression_spdx": "LicenseRef-scancode-unknown", + "license_detections": [ + { + "license_expression": "unknown", + "matches": [ + { 
+ "score": 86.89, + "start_line": 1, + "end_line": 10, + "matched_length": 53, + "match_coverage": 100.0, + "matcher": "6-unknown", + "license_expression": "unknown", + "rule_identifier": "license-detection-unknown-296da2cbc15d2bba73baa1359cda5fc8bf39b942", + "rule_relevance": 100, + "rule_url": null, + "matched_text": "form shall mean the preferred form for making\nthe purposes of this definition control\n[software] [is] [modified] [by] [someone] [else]\n\n\n\nrepresent, as a whole, an original work of authorship. For the purposes\n of this License, Derivative Works shall not include works that remain\n separable from, or merely link (or bind by name) [to] [the] interfaces of,\n the Work and Derivative Works thereof." + } + ], + "detection_log": [ + "unknown-match" + ], + "identifier": "unknown-b0897d47-1c91-9898-2364-2e4d1a34b6fd" + } + ], + "license_clues": [], + "percentage_of_license_text": 86.89, + "scan_errors": [] + } + ] +} \ No newline at end of file diff --git a/tests/licensedcode/data/match_unknown/unknown.txt b/tests/licensedcode/data/match_unknown/unknown.txt new file mode 100644 index 00000000000..d48495d4d2e --- /dev/null +++ b/tests/licensedcode/data/match_unknown/unknown.txt @@ -0,0 +1,10 @@ +form shall mean the preferred form for making +the purposes of this definition control +software is modified by someone else + + + +represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. \ No newline at end of file diff --git a/tests/licensedcode/test_match_unknown.py b/tests/licensedcode/test_match_unknown.py new file mode 100644 index 00000000000..03331ff9ec3 --- /dev/null +++ b/tests/licensedcode/test_match_unknown.py @@ -0,0 +1,96 @@ +# +# Copyright (c) nexB Inc. and others. All rights reserved. +# ScanCode is a trademark of nexB Inc. 
+# SPDX-License-Identifier: Apache-2.0
+# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
+# See https://github.com/nexB/scancode-toolkit for support or download.
+# See https://aboutcode.org for more information about nexB OSS projects.
+#
+
+
+import os
+
+from commoncode.testcase import FileBasedTesting
+from licensedcode.index import LicenseIndex
+from scancode_config import REGEN_TEST_FIXTURES
+from scancode.cli_test_utils import check_json_scan
+from scancode.cli_test_utils import run_scan_click
+
+from licensedcode.query import build_query
+
+from licensedcode.models import load_licenses
+from licensedcode.models import get_rules
+from licensedcode.models import get_all_spdx_key_tokens
+from licensedcode.models import get_license_tokens
+
+from licensedcode.match_unknown import match_unknowns
+from licensedcode.match_unknown import MATCH_UNKNOWN
+from licensedcode.detection import LicenseMatchFromResult
+
+TEST_DATA_DIR = os.path.join(os.path.dirname(__file__), 'data')
+
+
+class TestUnknownLicenses(FileBasedTesting):
+    """Exercise unknown license matching directly and through the CLI."""
+    test_data_dir = TEST_DATA_DIR
+
+    def check_unknown_scan(self, expected, extra_options=()):
+        """Scan unknown.txt with --unknown-licenses and check the JSON results."""
+        test_file = self.get_test_loc('match_unknown/unknown.txt', copy=True)
+        result_file = self.get_temp_file('json')
+        args = [
+            '--license',
+            *extra_options,
+            '--unknown-licenses',
+            '--strip-root',
+            '--verbose',
+            '--json', result_file,
+            test_file,
+        ]
+        run_scan_click(args)
+        expected_loc = self.get_test_loc(expected)
+        check_json_scan(expected_loc, result_file, regen=REGEN_TEST_FIXTURES)
+
+    def test_match_unknowns_works(self):
+        """An unknown match is found against a mini index and round-trips."""
+        rule_dir = self.get_test_loc('match_unknown/index_mini/rules/')
+        license_dir = self.get_test_loc('match_unknown/index_mini/licenses/')
+        licenses_db = load_licenses(license_dir)
+        rules = list(get_rules(licenses_db=licenses_db, rules_data_dir=rule_dir))
+        idx = LicenseIndex(
+            rules=rules,
+            _spdx_tokens=set(get_all_spdx_key_tokens(licenses_db)),
+            _license_tokens=set(get_license_tokens()),
+        )
+
+        query_loc = self.get_test_loc('match_unknown/apache-2.0.LICENSE')
+        qry = build_query(location=query_loc, idx=idx)
+
+        match = match_unknowns(
+            idx=idx,
+            query_run=qry.whole_query_run(),
+            automaton=idx.unknown_automaton,
+        )
+        # match_unknowns() returns None without a match: fail clearly here
+        assert match is not None
+        match.set_lines(qry.line_by_pos)
+
+        assert match.matcher == MATCH_UNKNOWN
+        assert match.matched_text()
+
+        assert LicenseMatchFromResult.from_dict(match.to_dict())
+
+    def test_unknown_licenses_works(self):
+        """The scan works without collecting the matched text (issue #3343)."""
+        self.check_unknown_scan('match_unknown/unknown-license-expected.json')
+
+    def test_unknown_licenses_works_with_license_text(self):
+        """The scan works with the matched text and diagnostics collected."""
+        self.check_unknown_scan(
+            expected='match_unknown/unknown-license-text-expected.json',
+            extra_options=(
+                '--license-text',
+                '--license-text-diagnostics',
+                '--license-diagnostics',
+            ),
+        )