Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
66 changes: 66 additions & 0 deletions minecode_pipelines/miners/cran.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
#
# Copyright (c) nexB Inc. and others. All rights reserved.
# purldb is a trademark of nexB Inc.
# SPDX-License-Identifier: Apache-2.0
# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
# See https://github.com/aboutcode-org/purldb for support or download.
# See https://aboutcode.org for more information about nexB OSS projects.
#
import json
from pathlib import Path
from typing import Iterator
from typing import List

import requests
from packageurl import PackageURL


def fetch_cran_db(output_file="cran_db.json", timeout=60) -> Path:
    """
    Download the CRAN package database (~250MB JSON) to ``output_file`` and
    return the path of the written file.

    The response body is streamed to disk in chunks so that the whole payload
    is never held in memory at once.

    ``timeout`` is forwarded to ``requests.get``. It bounds the connection
    attempt and each wait for data from the server, not the total duration of
    the download.

    Raise ``requests.HTTPError`` if the server answers with an error status.
    """
    url = "https://crandb.r-pkg.org/-/all"
    output_path = Path(output_file)

    # Without a timeout, requests waits indefinitely on a stalled connection,
    # which would hang the whole pipeline.
    with requests.get(url, stream=True, timeout=timeout) as response:
        # Fail before creating the output file when the request was rejected.
        response.raise_for_status()
        with output_path.open("wb") as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)

    return output_path


def extract_cran_packages(json_file_path: str) -> Iterator[List[str]]:
    """
    Yield, for each package of a CRAN DB JSON file, the list of its versioned
    ``pkg:cran`` package URL strings.

    ``json_file_path`` is the location of the JSON file; it is wrapped in
    ``pathlib.Path`` so a ``Path`` object is accepted as well as a string.
    One list is yielded per package, in the order the packages appear in the
    file. The list is empty when a package has no ``versions`` entry or when
    that entry is null.

    This is a generator: the file is only read once iteration starts, and
    ``FileNotFoundError`` is raised at that point if the file does not exist.

    Example of the expected JSON content:
        {
            "AATtools": {
                "_id": "AATtools",
                "_rev": "8-9ebb721d05b946f2b437b49e892c9e8c",
                "name": "AATtools",
                "versions": {
                    "0.0.1": {...},
                    "0.0.2": {...},
                    "0.0.3": {...}
                }
            }
        }
    """
    db_path = Path(json_file_path)
    if not db_path.exists():
        raise FileNotFoundError(f"File not found: {db_path}")

    with open(db_path, encoding="utf-8") as f:
        data = json.load(f)

    for pkg_name, pkg_data in data.items():
        # "versions" may be absent or explicitly null; treat both as "no
        # versions" instead of failing on a None value.
        versions = pkg_data.get("versions") or {}
        yield [
            PackageURL(type="cran", name=pkg_name, version=version).to_string()
            for version in versions
        ]
86 changes: 86 additions & 0 deletions minecode_pipelines/pipelines/mine_cran.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
# SPDX-License-Identifier: Apache-2.0
#
# http://nexb.com and https://github.com/aboutcode-org/scancode.io
# The ScanCode.io software is licensed under the Apache License version 2.0.
# Data generated with ScanCode.io is provided as-is without warranties.
# ScanCode is a trademark of nexB Inc.
#
# You may not use this software except in compliance with the License.
# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software distributed
# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
# CONDITIONS OF ANY KIND, either express or implied. See the License for the
# specific language governing permissions and limitations under the License.
#
# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES
# OR CONDITIONS OF ANY KIND, either express or implied. No content created from
# ScanCode.io should be considered or used as legal advice. Consult an Attorney
# for any legal advice.
#
# ScanCode.io is a free software code scanning tool from nexB Inc. and others.
# Visit https://github.com/aboutcode-org/scancode.io for support and download.

import os
from scanpipe.pipelines import Pipeline
from scanpipe.pipes import federatedcode

from minecode_pipelines import pipes
from minecode_pipelines.miners.cran import fetch_cran_db
from minecode_pipelines.pipes import cran


# URL of the git repository cloned by the MineCran pipeline to publish CRAN
# packageURLs. Set the MINECODE_DATA_CRAN_REPO environment variable to
# override the default below.
MINECODE_DATA_CRAN_REPO = os.environ.get(
    "MINECODE_DATA_CRAN_REPO", "https://github.com/aboutcode-data/minecode-data-cran-test"
)


class MineCran(Pipeline):
    """
    Collect the packageURLs of every package of the CRAN R index and publish
    them to a FederatedCode repository.
    """

    @classmethod
    def steps(cls):
        ordered_steps = (
            cls.check_federatedcode_eligibility,
            cls.setup_federatedcode_cran,
            cls.mine_and_publish_cran_packageurls,
            cls.cleanup_db_and_repo,
        )
        return ordered_steps

    def check_federatedcode_eligibility(self):
        """
        Verify that FederatedCode is configured and available before any
        result is pushed to it.
        """
        federatedcode.check_federatedcode_configured_and_available(logger=self.log)

    def setup_federatedcode_cran(self):
        """
        Clone the FederatedCode CRAN data repository, then download the CRAN
        DB JSON file.
        """
        self.cloned_data_repo = federatedcode.clone_repository(MINECODE_DATA_CRAN_REPO)
        self.db_path = fetch_cran_db()

        if not self.log:
            return

        clone_location = self.cloned_data_repo.working_dir
        self.log(f"{MINECODE_DATA_CRAN_REPO} repo cloned at: {clone_location}")

    def mine_and_publish_cran_packageurls(self):
        """Mine the CRAN packageURLs from the downloaded DB and publish them."""
        cran.mine_and_publish_cran_packageurls(
            cloned_data_repo=self.cloned_data_repo,
            db_path=self.db_path,
            logger=self.log,
        )

    def cleanup_db_and_repo(self):
        """Remove the downloaded CRAN DB file, then delete the cloned repository."""
        db_file = self.db_path
        self.log(f"Cleaning database file at: {db_file}")
        os.remove(db_file)

        data_repo = self.cloned_data_repo
        self.log(
            f"Deleting cloned repo {MINECODE_DATA_CRAN_REPO} from: {data_repo.working_dir}"
        )
        pipes.delete_cloned_repos(repos=[data_repo], logger=self.log)
73 changes: 73 additions & 0 deletions minecode_pipelines/pipes/cran.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
# SPDX-License-Identifier: Apache-2.0
#
# http://nexb.com and https://github.com/aboutcode-org/scancode.io
# The ScanCode.io software is licensed under the Apache License version 2.0.
# Data generated with ScanCode.io is provided as-is without warranties.
# ScanCode is a trademark of nexB Inc.
#
# You may not use this software except in compliance with the License.
# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software distributed
# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
# CONDITIONS OF ANY KIND, either express or implied. See the License for the
# specific language governing permissions and limitations under the License.
#
# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES
# OR CONDITIONS OF ANY KIND, either express or implied. No content created from
# ScanCode.io should be considered or used as legal advice. Consult an Attorney
# for any legal advice.
#
# ScanCode.io is a free software code scanning tool from nexB Inc. and others.
# Visit https://github.com/aboutcode-org/scancode.io for support and download.

from aboutcode.hashid import get_package_purls_yml_file_path, get_core_purl
from scanpipe.pipes.federatedcode import commit_changes
from scanpipe.pipes.federatedcode import push_changes
from minecode_pipelines import VERSION
from minecode_pipelines.miners.cran import extract_cran_packages
from minecode_pipelines.pipes import write_data_to_yaml_file
from minecode_pipelines.utils import grouper

# Number of packages grouped into one batch; each batch gets its own commit
# and push.
PACKAGE_BATCH_SIZE = 1000


def mine_and_publish_cran_packageurls(cloned_data_repo, db_path, logger):
    """
    Extract CRAN packages from the database, write their package URLs (purls) to YAML,
    and commit changes in batches to the given cloned repository.

    ``cloned_data_repo`` is the cloned git repository that receives the YAML
    files; its ``working_dir`` is the root the files are written under.
    ``db_path`` is the CRAN DB JSON file handed to ``extract_cran_packages``.
    ``logger`` is an optional callable taking a message string; pass ``None``
    to disable logging.

    Packages are processed in batches of ``PACKAGE_BATCH_SIZE``. Each batch
    that wrote at least one file is committed and pushed on its own.
    """
    packages_to_sync = list(extract_cran_packages(db_path))

    for package_batch in grouper(packages_to_sync, PACKAGE_BATCH_SIZE):
        purl_files = []
        base_purls = []

        if logger:
            logger(f"Starting package mining for a batch of {PACKAGE_BATCH_SIZE} packages")

        for updated_purls in package_batch:
            if not updated_purls:
                continue  # skip padded None values or empty

            # All purls in the list belong to the same package, so the first
            # one is enough to derive the base purl and the YAML location.
            first_purl = updated_purls[0]
            base_purl = str(get_core_purl(first_purl))
            purl_yaml_path = cloned_data_repo.working_dir / get_package_purls_yml_file_path(
                first_purl
            )
            write_data_to_yaml_file(path=purl_yaml_path, data=updated_purls)

            # The logger is optional here just like for the batch message above.
            if logger:
                logger(f"writing packageURLs for package: {base_purl} at: {purl_yaml_path}")
            purl_files.append(purl_yaml_path)
            base_purls.append(base_purl)

        # After finishing the batch, commit & push if there’s something to save
        if purl_files and base_purls:
            commit_changes(
                repo=cloned_data_repo,
                files_to_commit=purl_files,
                purls=base_purls,
                mine_type="packageURL",
                tool_name="pkg:pypi/minecode-pipelines",
                tool_version=VERSION,
            )
            push_changes(repo=cloned_data_repo)
41 changes: 41 additions & 0 deletions minecode_pipelines/tests/pipes/test_cran.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
#
# Copyright (c) nexB Inc. and others. All rights reserved.
# purldb is a trademark of nexB Inc.
# SPDX-License-Identifier: Apache-2.0
# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
# See https://github.com/aboutcode-org/purldb for support or download.
# See https://aboutcode.org for more information about nexB OSS projects.
#

import saneyaml
from pathlib import Path
from unittest import TestCase
from minecode_pipelines.miners.cran import extract_cran_packages


# Directory holding the CRAN test fixtures: the sample cran_db.json and the
# expected_*.yaml files.
DATA_DIR = Path(__file__).parent.parent / "test_data" / "cran"


class CranPipelineTests(TestCase):
    def test_extract_cran_packages_from_testdata(self):
        """
        Check that extract_cran_packages, run on the sample CRAN database,
        yields one purl list per package and that each list equals the
        content of its expected YAML file.
        """
        actual_purl_lists = list(extract_cran_packages(DATA_DIR / "cran_db.json"))

        expected_names = (
            "expected_abbreviate.yaml",
            "expected_abc.data.yaml",
            "expected_abc.yaml",
        )
        expected_paths = [DATA_DIR / name for name in expected_names]

        assert len(actual_purl_lists) == len(expected_paths)

        for actual, expected_path in zip(actual_purl_lists, expected_paths):
            with open(expected_path, encoding="utf-8") as fh:
                expected = saneyaml.load(fh)

            assert actual == expected
Loading
Loading