generated from aboutcode-org/skeleton
-
-
Notifications
You must be signed in to change notification settings - Fork 36
Add support to mine CRAN Package-URLs #688
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
Changes from all commits
Commits
Show all changes
2 commits
Select commit
Hold shift + click to select a range
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,66 @@ | ||
# | ||
# Copyright (c) nexB Inc. and others. All rights reserved. | ||
# purldb is a trademark of nexB Inc. | ||
# SPDX-License-Identifier: Apache-2.0 | ||
# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. | ||
# See https://github.com/aboutcode-org/purldb for support or download. | ||
# See https://aboutcode.org for more information about nexB OSS projects. | ||
# | ||
import json
from collections.abc import Iterator
from pathlib import Path

import requests
from packageurl import PackageURL
|
||
|
||
def fetch_cran_db(output_file="cran_db.json", timeout=(10, 300)) -> Path:
    """
    Download the CRAN package database (~250MB JSON) in a memory-efficient way.
    Saves it to a file instead of loading everything into memory.

    ``output_file`` is the destination path of the downloaded JSON; a relative
    path is resolved against the current working directory.
    ``timeout`` is passed as-is to ``requests.get`` (seconds, or a
    ``(connect, read)`` tuple). Without it a stalled connection would block
    the caller forever. Pass ``None`` to restore the unbounded wait.

    Return the ``Path`` of the written file.
    Raise ``requests.HTTPError`` (via ``raise_for_status``) when the server
    answers with an error status; in that case nothing is written.
    """
    url = "https://crandb.r-pkg.org/-/all"
    output_path = Path(output_file)

    # stream=True keeps the body out of memory: it is copied chunk by chunk.
    with requests.get(url, stream=True, timeout=timeout) as response:
        response.raise_for_status()
        with output_path.open("wb") as f:
            for chunk in response.iter_content(chunk_size=8192):
                # Skip empty chunks instead of issuing zero-byte writes.
                if chunk:
                    f.write(chunk)

    return output_path
|
||
|
||
def extract_cran_packages(json_file_path: str) -> Iterator[list[str]]:
    """
    Yield one list of Package-URL strings per package found in a CRAN DB JSON
    file, with one ``pkg:cran/<name>@<version>`` purl for each version.

    ``json_file_path`` is the path (string or ``Path``) of the JSON file.
    Packages are yielded in the order of the JSON object. A package with a
    missing, null or empty "versions" mapping yields an empty list.

    Raise ``FileNotFoundError`` if the file does not exist. Because this is a
    generator, the check runs when iteration starts, not when it is called.

    Expected input shape, ex:
        {
            "AATtools": {
                "_id": "AATtools",
                "_rev": "8-9ebb721d05b946f2b437b49e892c9e8c",
                "name": "AATtools",
                "versions": {
                    "0.0.1": {...},
                    "0.0.2": {...},
                    "0.0.3": {...}
                }
            }
        }
    """
    db_path = Path(json_file_path)
    if not db_path.exists():
        raise FileNotFoundError(f"File not found: {db_path}")

    with open(db_path, encoding="utf-8") as f:
        data = json.load(f)

    for pkg_name, pkg_data in data.items():
        # "or {}" also covers an explicit null, not only a missing key.
        versions = pkg_data.get("versions") or {}
        yield [
            PackageURL(type="cran", name=pkg_name, version=version).to_string()
            for version in versions
        ]
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,86 @@ | ||
# SPDX-License-Identifier: Apache-2.0 | ||
# | ||
# http://nexb.com and https://github.com/aboutcode-org/scancode.io | ||
# The ScanCode.io software is licensed under the Apache License version 2.0. | ||
# Data generated with ScanCode.io is provided as-is without warranties. | ||
# ScanCode is a trademark of nexB Inc. | ||
# | ||
# You may not use this software except in compliance with the License. | ||
# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0 | ||
# Unless required by applicable law or agreed to in writing, software distributed | ||
# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | ||
# CONDITIONS OF ANY KIND, either express or implied. See the License for the | ||
# specific language governing permissions and limitations under the License. | ||
# | ||
# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES | ||
# OR CONDITIONS OF ANY KIND, either express or implied. No content created from | ||
# ScanCode.io should be considered or used as legal advice. Consult an Attorney | ||
# for any legal advice. | ||
# | ||
# ScanCode.io is a free software code scanning tool from nexB Inc. and others. | ||
# Visit https://github.com/aboutcode-org/scancode.io for support and download. | ||
|
||
import os | ||
from scanpipe.pipelines import Pipeline | ||
from scanpipe.pipes import federatedcode | ||
|
||
from minecode_pipelines import pipes | ||
from minecode_pipelines.miners.cran import fetch_cran_db | ||
from minecode_pipelines.pipes import cran | ||
|
||
|
||
# URL of the git repository cloned by the pipeline below; the
# MINECODE_DATA_CRAN_REPO environment variable overrides the default test repo.
MINECODE_DATA_CRAN_REPO = os.environ.get(
    "MINECODE_DATA_CRAN_REPO", "https://github.com/aboutcode-data/minecode-data-cran-test"
)
|
||
|
||
class MineCran(Pipeline):
    """
    Pipeline that mines the packageURLs of a CRAN R index and publishes them
    to a FederatedCode repository.
    """

    @classmethod
    def steps(cls):
        """Return the pipeline steps, in execution order."""
        ordered_steps = (
            cls.check_federatedcode_eligibility,
            cls.setup_federatedcode_cran,
            cls.mine_and_publish_cran_packageurls,
            cls.cleanup_db_and_repo,
        )
        return ordered_steps

    def check_federatedcode_eligibility(self):
        """
        Verify that FederatedCode is configured and available before any
        result is pushed to it.
        """
        federatedcode.check_federatedcode_configured_and_available(logger=self.log)

    def setup_federatedcode_cran(self):
        """
        Clone the FederatedCode CRAN data repository, then download the CRAN
        DB JSON file. Both are kept on the pipeline for the following steps.
        """
        self.cloned_data_repo = federatedcode.clone_repository(MINECODE_DATA_CRAN_REPO)
        self.db_path = fetch_cran_db()

        if not self.log:
            return

        self.log(f"{MINECODE_DATA_CRAN_REPO} repo cloned at: {self.cloned_data_repo.working_dir}")

    def mine_and_publish_cran_packageurls(self):
        """
        Delegate to the cran pipe, giving it the cloned repository, the
        downloaded DB path and this pipeline's logger.
        """
        cran.mine_and_publish_cran_packageurls(
            cloned_data_repo=self.cloned_data_repo,
            db_path=self.db_path,
            logger=self.log,
        )

    def cleanup_db_and_repo(self):
        """Remove the downloaded CRAN DB file, then delete the cloned data repository."""
        self.log(f"Cleaning database file at: {self.db_path}")
        os.remove(self.db_path)

        repo_location = self.cloned_data_repo.working_dir
        self.log(f"Deleting cloned repo {MINECODE_DATA_CRAN_REPO} from: {repo_location}")
        pipes.delete_cloned_repos(repos=[self.cloned_data_repo], logger=self.log)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,73 @@ | ||
# SPDX-License-Identifier: Apache-2.0 | ||
# | ||
# http://nexb.com and https://github.com/aboutcode-org/scancode.io | ||
# The ScanCode.io software is licensed under the Apache License version 2.0. | ||
# Data generated with ScanCode.io is provided as-is without warranties. | ||
# ScanCode is a trademark of nexB Inc. | ||
# | ||
# You may not use this software except in compliance with the License. | ||
# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0 | ||
# Unless required by applicable law or agreed to in writing, software distributed | ||
# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | ||
# CONDITIONS OF ANY KIND, either express or implied. See the License for the | ||
# specific language governing permissions and limitations under the License. | ||
# | ||
# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES | ||
# OR CONDITIONS OF ANY KIND, either express or implied. No content created from | ||
# ScanCode.io should be considered or used as legal advice. Consult an Attorney | ||
# for any legal advice. | ||
# | ||
# ScanCode.io is a free software code scanning tool from nexB Inc. and others. | ||
# Visit https://github.com/aboutcode-org/scancode.io for support and download. | ||
|
||
from aboutcode.hashid import get_package_purls_yml_file_path, get_core_purl | ||
from scanpipe.pipes.federatedcode import commit_changes | ||
from scanpipe.pipes.federatedcode import push_changes | ||
from minecode_pipelines import VERSION | ||
from minecode_pipelines.miners.cran import extract_cran_packages | ||
from minecode_pipelines.pipes import write_data_to_yaml_file | ||
from minecode_pipelines.utils import grouper | ||
|
||
# Number of packages grouped into one batch; each batch gets its own commit and push.
PACKAGE_BATCH_SIZE = 1000
|
||
|
||
def mine_and_publish_cran_packageurls(cloned_data_repo, db_path, logger):
    """
    Extract CRAN packages from the database, write their package URLs (purls) to YAML,
    and commit changes in batches to the given cloned repository.

    ``cloned_data_repo`` is the cloned repository the YAML files are written
    into, committed to and pushed from.
    ``db_path`` is the path of the CRAN DB JSON file to read.
    ``logger`` is a callable taking one message string, or None/falsy to
    disable logging; every logging call here is guarded accordingly.

    Packages are handled in batches of PACKAGE_BATCH_SIZE. Each batch that
    wrote at least one file is committed and pushed on its own.
    """
    packages_to_sync = list(extract_cran_packages(db_path))

    for package_batch in grouper(packages_to_sync, PACKAGE_BATCH_SIZE):
        purl_files = []
        base_purls = []

        if logger:
            logger(f"Starting package mining for a batch of {PACKAGE_BATCH_SIZE} packages")

        for updated_purls in package_batch:
            # Skip packages without versions, and the filler entries that
            # grouper presumably pads the last batch with (verify in utils).
            if not updated_purls:
                continue

            # All purls of one package share name and type, so the first one
            # is enough to derive the base purl and the YAML file location.
            first_purl = updated_purls[0]
            base_purl = get_core_purl(first_purl)
            purl_yaml_path = cloned_data_repo.working_dir / get_package_purls_yml_file_path(
                first_purl
            )
            write_data_to_yaml_file(path=purl_yaml_path, data=updated_purls)

            # Guarded like the batch message above, so a None logger cannot
            # abort the run after the file was already written.
            if logger:
                logger(f"writing packageURLs for package: {str(base_purl)} at: {purl_yaml_path}")
            purl_files.append(purl_yaml_path)
            base_purls.append(str(base_purl))

        # After finishing the batch, commit & push if there is something to save
        if purl_files and base_purls:
            commit_changes(
                repo=cloned_data_repo,
                files_to_commit=purl_files,
                purls=base_purls,
                mine_type="packageURL",
                tool_name="pkg:pypi/minecode-pipelines",
                tool_version=VERSION,
            )
            push_changes(repo=cloned_data_repo)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,41 @@ | ||
# | ||
# Copyright (c) nexB Inc. and others. All rights reserved. | ||
# purldb is a trademark of nexB Inc. | ||
# SPDX-License-Identifier: Apache-2.0 | ||
# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. | ||
# See https://github.com/aboutcode-org/purldb for support or download. | ||
# See https://aboutcode.org for more information about nexB OSS projects. | ||
# | ||
|
||
import saneyaml | ||
from pathlib import Path | ||
from unittest import TestCase | ||
from minecode_pipelines.miners.cran import extract_cran_packages | ||
|
||
|
||
# Directory of the CRAN fixtures: <parent of this file's directory>/test_data/cran
DATA_DIR = Path(__file__).parent.parent / "test_data" / "cran"
|
||
|
||
class CranPipelineTests(TestCase):
    def test_extract_cran_packages_from_testdata(self):
        """
        Check that extract_cran_packages, run on the fixture CRAN database,
        yields one result per expected YAML file, each equal to that file's
        loaded content and in the same order.
        """
        cran_db = DATA_DIR / "cran_db.json"
        extracted = list(extract_cran_packages(cran_db))

        expected_names = (
            "expected_abbreviate.yaml",
            "expected_abc.data.yaml",
            "expected_abc.yaml",
        )
        expected_paths = [DATA_DIR / file_name for file_name in expected_names]

        assert len(extracted) == len(expected_paths)

        for purls, expected_path in zip(extracted, expected_paths):
            with open(expected_path, encoding="utf-8") as stream:
                expected_purls = saneyaml.load(stream)

            assert purls == expected_purls
Oops, something went wrong.
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.