Skip to content

Commit ea38f08

Browse files
authored
664 purl next debian (#718)
* Create pipeline to collect purls from debian directory listing #664 Signed-off-by: Jono Yang <[email protected]> * Download and Remove index file in constructor/destructor #664 Signed-off-by: Jono Yang <[email protected]> * Make repo deletion a pipeline step #664 Signed-off-by: Jono Yang <[email protected]> * Download index as part of constructor #664 Signed-off-by: Jono Yang <[email protected]> * Add support for skipping older files #664 Signed-off-by: Jono Yang <[email protected]> * Fix logic in destructor #664 Signed-off-by: Jono Yang <[email protected]> * Add tests for ls #664 Signed-off-by: Jono Yang <[email protected]> * Add mine_debian pipeline to pyproject.toml #664 Signed-off-by: Jono Yang <[email protected]> * Only compare dates if it has been given #664 Signed-off-by: Jono Yang <[email protected]> * Import ls module from minecode_pipelines #664 Signed-off-by: Jono Yang <[email protected]> * Track index download info to facilitate cleanup #664 Signed-off-by: Jono Yang <[email protected]> * Update how we handle download cleanup #664 Signed-off-by: Jono Yang <[email protected]> * Update debian pipeline #664 Signed-off-by: Jono Yang <[email protected]> * Commit purls if we have uncommitted purls at the end #664 Signed-off-by: Jono Yang <[email protected]> * Call code from proper module #664 Signed-off-by: Jono Yang <[email protected]> * Remove old linter rule #664 Signed-off-by: Jono Yang <[email protected]> --------- Signed-off-by: Jono Yang <[email protected]>
1 parent ebee41d commit ea38f08

16 files changed

+1179
-4
lines changed
Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
# SPDX-License-Identifier: Apache-2.0
2+
#
3+
# http://nexb.com and https://github.com/aboutcode-org/scancode.io
4+
# The ScanCode.io software is licensed under the Apache License version 2.0.
5+
# Data generated with ScanCode.io is provided as-is without warranties.
6+
# ScanCode is a trademark of nexB Inc.
7+
#
8+
# You may not use this software except in compliance with the License.
9+
# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0
10+
# Unless required by applicable law or agreed to in writing, software distributed
11+
# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12+
# CONDITIONS OF ANY KIND, either express or implied. See the License for the
13+
# specific language governing permissions and limitations under the License.
14+
#
15+
# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES
16+
# OR CONDITIONS OF ANY KIND, either express or implied. No content created from
17+
# ScanCode.io should be considered or used as legal advice. Consult an Attorney
18+
# for any legal advice.
19+
#
20+
# ScanCode.io is a free software code scanning tool from nexB Inc. and others.
21+
# Visit https://github.com/aboutcode-org/scancode.io for support and download.
22+
23+
from scanpipe.pipelines import Pipeline
24+
from scanpipe.pipes import federatedcode
25+
26+
from minecode_pipelines import pipes
27+
from minecode_pipelines.pipes import debian
28+
29+
30+
class MineDebian(Pipeline):
    """
    Collect every packageURL found in the Debian package index and publish
    the results to a FederatedCode repository.
    """

    @classmethod
    def steps(cls):
        # Pipeline steps run in order: eligibility check, mining, then
        # cleanup of the git repositories cloned while mining.
        return (
            cls.check_federatedcode_eligibility,
            cls.collect_packages_from_debian,
            cls.delete_cloned_repos,
        )

    def check_federatedcode_eligibility(self):
        """Ensure FederatedCode is configured and reachable before mining starts."""
        federatedcode.check_federatedcode_configured_and_available(logger=self.log)

    def collect_packages_from_debian(self):
        """Mine Debian packageURLs and keep the cloned repos for later cleanup."""
        self.repos = debian.collect_packages_from_debian(logger=self.log)

    def delete_cloned_repos(self):
        """Remove the local clones created by the mining step."""
        pipes.delete_cloned_repos(repos=self.repos, logger=self.log)

minecode_pipelines/pipelines/mine_maven.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,7 @@ def check_federatedcode_eligibility(self):
5555
federatedcode.check_federatedcode_configured_and_available(logger=self.log)
5656

5757
def collect_packages_from_maven(self):
58-
self.repos = maven.collect_packages_from_maven(self.project, self.log)
58+
self.repos = maven.collect_packages_from_maven(logger=self.log)
5959

6060
def delete_cloned_repos(self):
6161
pipes.delete_cloned_repos(repos=self.repos, logger=self.log)

minecode_pipelines/pipes/debian.py

Lines changed: 259 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,259 @@
1+
# SPDX-License-Identifier: Apache-2.0
2+
#
3+
# http://nexb.com and https://github.com/aboutcode-org/scancode.io
4+
# The ScanCode.io software is licensed under the Apache License version 2.0.
5+
# Data generated with ScanCode.io is provided as-is without warranties.
6+
# ScanCode is a trademark of nexB Inc.
7+
#
8+
# You may not use this software except in compliance with the License.
9+
# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0
10+
# Unless required by applicable law or agreed to in writing, software distributed
11+
# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12+
# CONDITIONS OF ANY KIND, either express or implied. See the License for the
13+
# specific language governing permissions and limitations under the License.
14+
#
15+
# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES
16+
# OR CONDITIONS OF ANY KIND, either express or implied. No content created from
17+
# ScanCode.io should be considered or used as legal advice. Consult an Attorney
18+
# for any legal advice.
19+
#
20+
# ScanCode.io is a free software code scanning tool from nexB Inc. and others.
21+
# Visit https://github.com/aboutcode-org/scancode.io for support and download.
22+
23+
import gzip
24+
from datetime import datetime
25+
from shutil import rmtree
26+
27+
import debian_inspector
28+
from aboutcode import hashid
29+
from commoncode import fileutils
30+
from commoncode.date import get_file_mtime
31+
from packagedcode.models import PackageData
32+
from packageurl import PackageURL
33+
from scanpipe.pipes import federatedcode
34+
from scanpipe.pipes.fetch import fetch_http
35+
36+
from minecode_pipelines import pipes
37+
from minecode_pipelines import VERSION
38+
from minecode_pipelines.pipes import ls
39+
40+
41+
# Path, inside the config repo, of the JSON file holding Debian mining checkpoints.
DEBIAN_CHECKPOINT_PATH = "debian/checkpoints.json"
# Recursive directory listing (ls -lR) of the entire Debian archive.
DEBIAN_LSLR_URL = "http://ftp.debian.org/debian/ls-lR.gz"

# We are testing and storing mined packageURLs in one single repo per ecosystem for now
MINECODE_DATA_DEBIAN_REPO = "https://github.com/aboutcode-data/minecode-data-debian-test"

# Number of processed packages between pushes to the remote data repo.
PACKAGE_BATCH_SIZE = 500
48+
49+
50+
def is_collectible(file_name):
    """
    Return True if `file_name` is a file we want to collect from the Debian
    archive listing, False otherwise.
    """
    # 'Contents-*.gz' are mapping/indexes of installed files to the actual package that provides them.
    # TODO: add tests!

    # Fix: previously a falsy `file_name` (None or "") was returned as-is
    # because of the `file_name and (...)` short-circuit, so the function did
    # not always return a bool as documented.
    if not file_name:
        return False
    return bool(
        file_name
        in (
            "Packages.gz",
            "Release",
            "Sources.gz",
        )
        or file_name.endswith(
            (
                ".deb",
                ".dsc",
            )
        )
        or (file_name.startswith("Contents-") and file_name.endswith(".gz"))
    )
70+
71+
72+
def is_debian_url(uri):
    """Return True when the `uri` string refers to a debian.org location."""
    return uri.find("debian.org") != -1
74+
75+
76+
def is_ubuntu_url(uri):
    """Return True when the `uri` string mentions ubuntu."""
    return uri.find("ubuntu") != -1
78+
79+
80+
class DebianCollector:
    """
    Download and process a Debian ls-lR.gz directory index into packageURLs.
    """

    def __init__(self, index_location=None):
        """
        Use the ls-lR.gz file at `index_location` when given, otherwise
        download a fresh copy of the Debian index and use that.
        """
        # Downloads tracked here are removed in __del__.
        self.downloads = []
        if index_location:
            self.index_location = index_location
        else:
            index_download = self._fetch_index()
            self.index_location = index_download.path

    def __del__(self):
        # Best-effort cleanup of downloaded files. Fix: the previous
        # implementation called rmtree() unguarded; an exception raised from
        # a finalizer (directory already removed, or modules torn down during
        # interpreter shutdown) is at best noisy and at worst masks the real
        # error, so we swallow failures here.
        try:
            for download in self.downloads:
                rmtree(download.directory, ignore_errors=True)
        except Exception:
            pass

    def _fetch_http(self, uri):
        """Fetch `uri` over HTTP and track the download for later cleanup."""
        fetched = fetch_http(uri)
        self.downloads.append(fetched)
        return fetched

    def _fetch_index(self, uri=DEBIAN_LSLR_URL):
        """
        Fetch the Debian index at `uri` and return a Download with information
        about where it was saved.
        """
        return self._fetch_http(uri)

    def get_packages(self, previous_index_last_modified_date=None, logger=None):
        """
        Yield (versionless PackageURL, PackageData) pairs for collectible
        entries of the Debian index.

        If `previous_index_last_modified_date` is given as a
        "%Y-%m-%d %H:%M:%S" string, entries whose listing date is not strictly
        newer are skipped.
        """
        with gzip.open(self.index_location, "rt") as f:
            content = f.read()

        url_template = DEBIAN_LSLR_URL.replace("ls-lR.gz", "{path}")
        if previous_index_last_modified_date:
            previous_index_last_modified_date = datetime.strptime(
                previous_index_last_modified_date, "%Y-%m-%d %H:%M:%S"
            )
        for entry in ls.parse_directory_listing(content):
            entry_date = datetime.strptime(entry.date, "%Y-%m-%d")
            # Keep only plain files newer than the checkpoint date (if any).
            if (entry.type != ls.FILE) or (
                previous_index_last_modified_date
                and (entry_date <= previous_index_last_modified_date)
            ):
                continue

            path = entry.path.lstrip("/")
            file_name = fileutils.file_name(path)

            if not is_collectible(file_name):
                continue

            # NOTE(review): is_collectible() only passes ".deb" and ".dsc"
            # archives through, so the extra archive extensions below appear
            # unreachable — confirm before relying on them.
            if not file_name.endswith(
                (".deb", ".udeb", ".tar.gz", ".tar.xz", ".tar.bz2", ".tar.lzma")
            ):
                continue

            name, version, arch = debian_inspector.package.get_nva(file_name)
            package_url = PackageURL(
                type="deb",
                namespace="debian",
                name=name,
                version=str(version),
                qualifiers=dict(arch=arch) if arch else None,
            )

            # The versionless purl identifies the file that aggregates all
            # versions of this package in the data repo.
            versionless_purl = PackageURL(
                type=package_url.type,
                namespace=package_url.namespace,
                name=package_url.name,
            )
            packaged_data = PackageData(
                type=package_url.type,
                namespace=package_url.namespace,
                name=package_url.name,
                version=package_url.version,
                qualifiers=package_url.qualifiers,
                file_name=file_name,
                date=entry.date,
                size=entry.size,
                download_url=url_template.format(path=path),
            )
            yield versionless_purl, packaged_data
167+
168+
169+
def _commit_purl_batch(data_repo, base_purl, packageurls):
    """
    Write `packageurls` to the file for `base_purl` in `data_repo` and commit
    the change. Return the path of the written file.
    """
    package_base_dir = hashid.get_package_base_dir(purl=base_purl)
    purl_file = pipes.write_packageurls_to_file(
        repo=data_repo,
        base_dir=package_base_dir,
        packageurls=packageurls,
    )
    federatedcode.commit_changes(
        repo=data_repo,
        files_to_commit=[purl_file],
        purls=packageurls,
        mine_type="packageURL",
        tool_name="pkg:pypi/minecode-pipelines",
        tool_version=VERSION,
    )
    return purl_file


def collect_packages_from_debian(commits_per_push=PACKAGE_BATCH_SIZE, logger=None):
    """
    Mine packageURLs from the Debian ls-lR index and publish them to the
    FederatedCode data repo, checkpointing progress in the config repo.

    `commits_per_push` controls how many index entries are processed between
    pushes to the remote data repo. Return the cloned repos so the caller can
    delete them when done.
    """
    # Clone data and config repo
    data_repo = federatedcode.clone_repository(
        repo_url=MINECODE_DATA_DEBIAN_REPO,
        logger=logger,
    )
    config_repo = federatedcode.clone_repository(
        repo_url=pipes.MINECODE_PIPELINES_CONFIG_REPO,
        logger=logger,
    )
    if logger:
        logger(f"{MINECODE_DATA_DEBIAN_REPO} repo cloned at: {data_repo.working_dir}")
        logger(f"{pipes.MINECODE_PIPELINES_CONFIG_REPO} repo cloned at: {config_repo.working_dir}")

    # get last_modified to see if we can skip files
    checkpoint = pipes.get_checkpoint_from_file(
        cloned_repo=config_repo, path=DEBIAN_CHECKPOINT_PATH
    )
    last_modified = checkpoint.get("previous_debian_index_last_modified_date")
    if logger:
        logger(f"previous_debian_index_last_modified_date: {last_modified}")

    # download and iterate through debian index
    debian_collector = DebianCollector()
    # Entries for the same versionless purl arrive consecutively; batch them
    # up and commit one file per package.
    prev_purl = None
    current_purls = []
    for i, (current_purl, package) in enumerate(
        debian_collector.get_packages(previous_index_last_modified_date=last_modified), start=1
    ):
        if not prev_purl:
            prev_purl = current_purl
        elif prev_purl != current_purl:
            _commit_purl_batch(data_repo=data_repo, base_purl=prev_purl, packageurls=current_purls)

            # Push changes to remote repository periodically, not per commit.
            push_commit = not bool(i % commits_per_push)
            if push_commit:
                federatedcode.push_changes(repo=data_repo)

            current_purls = []
            prev_purl = current_purl
        current_purls.append(package.to_string())

    # Commit and push whatever is left over from the final package.
    if current_purls:
        _commit_purl_batch(data_repo=data_repo, base_purl=prev_purl, packageurls=current_purls)
        federatedcode.push_changes(repo=data_repo)

    # Record the index file's mtime so the next run can skip older entries.
    last_modified = get_file_mtime(debian_collector.index_location)
    checkpoint = {"previous_debian_index_last_modified_date": last_modified}
    if logger:
        logger(f"checkpoint: {checkpoint}")
    pipes.update_checkpoints_in_github(
        checkpoint=checkpoint, cloned_repo=config_repo, path=DEBIAN_CHECKPOINT_PATH
    )

    repos_to_clean = [data_repo, config_repo]
    return repos_to_clean

0 commit comments

Comments
 (0)