Skip to content

Commit a0a561c

Browse files
committed
Merge remote-tracking branch 'origin/main' into 667_mine_nuget_pipeline
2 parents 96faa82 + ea38f08 commit a0a561c

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

61 files changed

+121463
-53
lines changed

minecode/collectors/dockerhub.py

Lines changed: 147 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,147 @@
1+
#
2+
# Copyright (c) nexB Inc. and others. All rights reserved.
3+
# purldb is a trademark of nexB Inc.
4+
# SPDX-License-Identifier: Apache-2.0
5+
# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
6+
# See https://github.com/nexB/purldb for support or download.
7+
# See https://aboutcode.org for more information about nexB OSS projects.
8+
#
9+
10+
import logging
11+
import requests
12+
from packageurl import PackageURL
13+
14+
from minecode import priority_router
15+
from minecode.miners.dockerhub import build_package_data
16+
from packagedb.models import PackageContentType
17+
18+
# Module-level logger for the Docker Hub collector. A stream handler is
# attached and the level is set to INFO as soon as the module is imported.
logger = logging.getLogger(__name__)
handler = logging.StreamHandler()
logger.addHandler(handler)
logger.setLevel(logging.INFO)
22+
23+
24+
def fetch_dockerhub_repo_summary(name, namespace="library", timeout=30):
    """
    Fetch summary metadata for a Docker Hub repository.

    `name` is the repository name and `namespace` its owner namespace, which
    defaults to "library". `timeout` is the number of seconds to wait for the
    Docker Hub API before giving up.

    Returns:
        dict or None: Full metadata JSON from the Docker Hub API, including:
            - description (str): Short description
            - full_description (str): Detailed description
            - is_private (bool): Privacy status
        None is returned, and the error is logged, when the request fails.
    """
    url = f"https://hub.docker.com/v2/repositories/{namespace}/{name}/"
    try:
        # requests never times out by default: always bound the wait so an
        # unresponsive server cannot block the caller forever.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        return response.json()
    except requests.exceptions.RequestException as err:
        logger.error(f"Error fetching repository metadata for {name}: {err}")
        return None
43+
44+
45+
def fetch_dockerhub_tags_metadata(name, namespace="library", tag=None, timeout=30):
    """
    Return tag metadata for the Docker Hub repository `namespace`/`name`.

    The paginated tags endpoint is requested page after page:
    - If `tag` is provided, return a one-item list with the JSON metadata of
      the first entry whose name equals `tag`, or whose digest equals `tag`
      when `tag` starts with "sha256".
    - If `tag` is empty or None, return a list of all tag metadata.

    Return None when `tag` is provided but was not found, or when a request
    fails (the error is logged). `timeout` is the number of seconds to wait
    for each request.

    Examples:
        fetch_dockerhub_tags_metadata("nginx", "library", "1.25.2")
        fetch_dockerhub_tags_metadata("nginx", "library", "sha256:3d8957cb61d0223de2ab1aa2ec91d29796eb82a81cdcc1e968c090c29606d648")
        fetch_dockerhub_tags_metadata("nginx", "library")  # returns all tags
    """
    page_size = 100
    all_results = []
    # Digest lookups are only attempted for sha256-looking tags.
    match_digest = bool(tag) and tag.startswith("sha256")
    page = 0

    while True:
        page += 1
        url = f"https://hub.docker.com/v2/repositories/{namespace}/{name}/tags/?page={page}&page_size={page_size}"
        try:
            # Bound the wait: requests has no default timeout.
            response = requests.get(url, timeout=timeout)
            response.raise_for_status()
            data = response.json()
        except requests.exceptions.RequestException as err:
            logger.error(f"Error fetching tags for {name}, page {page}: {err}")
            return None

        # Tolerate a missing or null "results" entry.
        results = data.get("results") or []
        if not tag:
            all_results.extend(results)  # collect everything
        else:
            for result in results:
                if result.get("name") == tag or (match_digest and result.get("digest") == tag):
                    return [result]

        # Stop when the API reports no next page or every counted entry was seen.
        if not data.get("next") or page_size * page > (data.get("count") or 0):
            break

    if not tag:
        return all_results  # return collected list

    return None  # tag not found
91+
92+
93+
def map_dockerhub_package(package_url, pipelines, priority=0):
    """
    Add a Dockerhub distribution `package_url` to the PackageDB.

    Fetch the repository summary and the tag metadata for `package_url`,
    build packages from them, then merge or create each package in the
    PackageDB and add it to the scan queue with `pipelines` and `priority`.

    Return an error (a message string built here, or the error reported by
    `merge_or_create_package`) when something failed, None otherwise.
    """
    from minecode.model_utils import add_package_to_scan_queue
    from minecode.model_utils import merge_or_create_package

    if not package_url.name:
        error = f"Missing package name in DockerHub Package URL: {package_url}"
        logger.error(error)
        return error

    namespace = package_url.namespace or "library"
    summary = fetch_dockerhub_repo_summary(package_url.name, namespace)
    if not summary:
        error = f"Package does not exist on dockerhub: {package_url}"
        logger.error(error)
        return error

    tags_metadata = fetch_dockerhub_tags_metadata(package_url.name, namespace, package_url.version)
    if tags_metadata is None:
        # None means the requested tag was not found or a request failed;
        # report it instead of letting the package builder iterate over None.
        error = f"Unable to fetch tag metadata from dockerhub for: {package_url}"
        logger.error(error)
        return error

    packages = build_package_data(summary, tags_metadata, package_url)

    error = None
    for package in packages:
        package.extra_data["package_content"] = PackageContentType.SOURCE_ARCHIVE
        db_package, _, _, error = merge_or_create_package(package, visit_level=0)
        if error:
            break

        if db_package:
            add_package_to_scan_queue(package=db_package, pipelines=pipelines, priority=priority)
    return error
126+
127+
128+
@priority_router.route("pkg:docker/.*")
def process_request(purl_str, **kwargs):
    """
    Handle a priority request for the Dockerhub Package URL string `purl_str`.

    The optional `addon_pipelines` keyword argument is appended to the
    default pipelines and the optional `priority` keyword argument (0 when
    absent) is forwarded to the scan queue. Return the error reported while
    mapping the package, if any.

    ex:
        pkg:docker/nginx@latest
        pkg:docker/nginx@sha256:3d8957cb61d0223de2ab1aa2ec91d29796eb82a81cdcc1e968c090c29606d648
    """
    from minecode.model_utils import DEFAULT_PIPELINES

    scan_pipelines = DEFAULT_PIPELINES + tuple(kwargs.get("addon_pipelines", []))
    scan_priority = kwargs.get("priority", 0)

    purl = PackageURL.from_string(purl_str)
    outcome = map_dockerhub_package(purl, scan_pipelines, scan_priority)

    # A falsy outcome means nothing went wrong: report nothing.
    return outcome or None

minecode/collectors/hex.py

Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
#
2+
# Copyright (c) nexB Inc. and others. All rights reserved.
3+
# purldb is a trademark of nexB Inc.
4+
# SPDX-License-Identifier: Apache-2.0
5+
# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
6+
# See https://github.com/nexB/purldb for support or download.
7+
# See https://aboutcode.org for more information about nexB OSS projects.
8+
#
9+
10+
import logging
11+
import requests
12+
from packageurl import PackageURL
13+
14+
from minecode.miners.hex import build_packages
15+
from minecode import priority_router
16+
from packagedb.models import PackageContentType
17+
18+
# Module-level logger for the hex collector. A stream handler is attached and
# the level is set to INFO as soon as the module is imported.
logger = logging.getLogger(__name__)
handler = logging.StreamHandler()
logger.addHandler(handler)
logger.setLevel(logging.INFO)
22+
23+
24+
def get_hex_package_json(name, timeout=30):
    """
    Return the metadata JSON for a package from hex.pm API.
    Example: https://hex.pm/api/packages/phoenix

    `timeout` is the number of seconds to wait for hex.pm before giving up.
    Return None, after logging the error, when the request fails.
    """
    url = f"https://hex.pm/api/packages/{name}"

    try:
        # Bound the wait: requests has no default timeout.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        return response.json()
    except requests.exceptions.HTTPError as err:
        logger.error(f"HTTP error occurred: {err}")
    except requests.exceptions.RequestException as err:
        # Connection failures and timeouts are reported like HTTP errors
        # rather than propagating to the caller.
        logger.error(f"Error fetching hex.pm metadata for {name}: {err}")
    return None
38+
39+
40+
def map_hex_package(package_url, pipelines, priority=0):
    """
    Add a hex `package_url` to the PackageDB.

    Fetch the hex.pm metadata for the purl name, build packages from it, then
    merge or create each package in the PackageDB and add it to the scan queue
    with `pipelines` and `priority`. Return an error when the metadata could
    not be fetched or a merge reported one; processing stops at the first
    merge error.
    """
    from minecode.model_utils import add_package_to_scan_queue, merge_or_create_package

    metadata = get_hex_package_json(name=package_url.name)
    if not metadata:
        error = f"Package does not exist on hex.pm: {package_url}"
        logger.error(error)
        return error

    error = None
    for hex_package in build_packages(metadata_dict=metadata, purl=package_url):
        hex_package.extra_data["package_content"] = PackageContentType.SOURCE_ARCHIVE
        stored, _, _, error = merge_or_create_package(hex_package, visit_level=0)
        if error:
            # Stop at the first failure and hand the error back.
            return error
        if stored:
            add_package_to_scan_queue(package=stored, pipelines=pipelines, priority=priority)

    return error
66+
67+
68+
@priority_router.route("pkg:hex/.*")
def process_request(purl_str, **kwargs):
    """
    Handle a priority request for the hex Package URL string `purl_str`.

    The optional `addon_pipelines` keyword argument is appended to the
    default pipelines and the optional `priority` keyword argument (0 when
    absent) is forwarded to the scan queue. Return the error reported while
    mapping the package, if any.
    """
    from minecode.model_utils import DEFAULT_PIPELINES

    scan_pipelines = DEFAULT_PIPELINES + tuple(kwargs.get("addon_pipelines", []))
    scan_priority = kwargs.get("priority", 0)

    purl = PackageURL.from_string(purl_str)
    outcome = map_hex_package(purl, scan_pipelines, scan_priority)

    # A falsy outcome means nothing went wrong: report nothing.
    return outcome or None

minecode/collectors/pub.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,6 @@ def map_pub_package(package_url, pipelines, priority=0):
6060
db_package, _, _, error = merge_or_create_package(package, visit_level=0)
6161
if error:
6262
break
63-
print(db_package)
6463
if db_package:
6564
add_package_to_scan_queue(package=db_package, pipelines=pipelines, priority=priority)
6665

minecode/collectors/swift.py

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
#
2+
# Copyright (c) nexB Inc. and others. All rights reserved.
3+
# purldb is a trademark of nexB Inc.
4+
# SPDX-License-Identifier: Apache-2.0
5+
# See http://www.apache.org/licenses/LICENSE-2.0
6+
# See https://github.com/aboutcode-org/purldb for support or download.
7+
# See https://aboutcode.org for more information about nexB OSS projects.
8+
#
9+
import json
10+
import logging
11+
from packageurl import PackageURL
12+
from minecode import priority_router
13+
from minecode.miners import github
14+
from minecode.miners.github import build_github_packages
15+
from packagedb.models import PackageContentType
16+
17+
# Module-level logger for the Swift collector. A stream handler is attached
# and the level is set to INFO as soon as the module is imported.
logger = logging.getLogger(__name__)
handler = logging.StreamHandler()
logger.addHandler(handler)
logger.setLevel(logging.INFO)
21+
22+
23+
def map_swift_package(package_url, pipelines, priority=0):
    """
    Add a Swift distribution `package_url` to the PackageDB.

    The GitHub repository to visit is derived from the last "/"-separated
    segment of the purl namespace (used as the owner) and the purl name. Only
    the repository tags whose name equals the purl version are kept before
    building packages, so a purl without a version yields no package.

    Return an error (a message string built here, or the error reported by
    `merge_or_create_package`) when something failed, None otherwise.
    """
    from minecode.model_utils import add_package_to_scan_queue, merge_or_create_package

    namespace = package_url.namespace
    if not namespace:
        # The repository owner is taken from the namespace: without one there
        # is no repository to look up.
        error = f"Missing namespace in Swift Package URL: {package_url}"
        logger.error(error)
        return error

    version = package_url.version
    owner_name = namespace.split("/")[-1]

    uri = f"https://api.github.com/repos/{owner_name}/{package_url.name}"
    _, response_text, _ = github.GithubSingleRepoVisitor(uri)
    repo_data = json.loads(response_text)
    repo_data["tags"] = [tag for tag in repo_data["tags"] if tag["name"] == version]
    packages = build_github_packages(json.dumps(repo_data), uri, package_url)

    error = None
    for package in packages:
        # Packages are built by the GitHub miner: relabel them as Swift
        # packages under the original purl namespace.
        package.type = "swift"
        package.namespace = namespace
        package.extra_data["package_content"] = PackageContentType.SOURCE_ARCHIVE
        db_package, _, _, error = merge_or_create_package(package, visit_level=0)
        if error:
            break

        if db_package:
            add_package_to_scan_queue(package=db_package, pipelines=pipelines, priority=priority)
    return error
52+
53+
54+
@priority_router.route("pkg:swift/.*")
def process_request(purl_str, **kwargs):
    """
    Handle a priority request for the Swift Package URL string `purl_str`.

    The optional `addon_pipelines` keyword argument is appended to the
    default pipelines and the optional `priority` keyword argument (0 when
    absent) is forwarded to the scan queue. Return the error reported while
    mapping the package, if any.
    """
    from minecode.model_utils import DEFAULT_PIPELINES

    scan_pipelines = DEFAULT_PIPELINES + tuple(kwargs.get("addon_pipelines", []))
    scan_priority = kwargs.get("priority", 0)

    purl = PackageURL.from_string(purl_str)
    outcome = map_swift_package(purl, scan_pipelines, scan_priority)

    # A falsy outcome means nothing went wrong: report nothing.
    return outcome or None

minecode/miners/dockerhub.py

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -195,3 +195,53 @@ def build_packages_from_jsonfile(metadata, uri=None, purl=None):
195195
package = scan_models.Package(**common_data)
196196
package.set_purl(purl)
197197
yield package
198+
199+
200+
def build_package_data(summary, tags_metadata, purl):
    """
    Yield a PackageData for each entry of `tags_metadata`, built from Docker
    Hub API data.

    `summary` is the repository summary mapping; its "description",
    "full_description" and "is_private" entries are used. `tags_metadata` is
    an iterable of tag metadata mappings; None is treated as empty. `purl` is
    the Package URL being mapped: its name, namespace and version are used and
    it is set on every yielded package.
    """
    namespace = purl.namespace or "library"

    short_desc = summary.get("description")
    long_desc = summary.get("full_description")
    descriptions = [d for d in (short_desc, long_desc) if d and d.strip()]
    description = "\n".join(descriptions)
    is_private = summary.get("is_private")

    # Repositories in the "library" namespace use a dedicated URL layout.
    homepage_url = (
        f"https://hub.docker.com/_/{purl.name}"
        if namespace == "library"
        else f"https://hub.docker.com/r/{namespace}/{purl.name}"
    )

    sha256_prefix = "sha256:"

    for tag_metadata in tags_metadata or []:
        tag_name = tag_metadata.get("name")
        size = tag_metadata.get("full_size")
        digest = tag_metadata.get("digest")
        # Only a "sha256:"-prefixed digest can provide the sha256 checksum;
        # anything else must not be sliced blindly.
        sha256 = None
        if digest and digest.startswith(sha256_prefix):
            sha256 = digest[len(sha256_prefix):]

        last_updater_username = tag_metadata.get("last_updater_username")
        parties = []
        if last_updater_username:
            parties.append(scan_models.Party(name=last_updater_username, role="username"))

        download_data = dict(
            type="docker",
            name=purl.name,
            namespace=purl.namespace,
            version=purl.version or tag_name,
            description=description,
            is_private=is_private,
            sha256=sha256,
            parties=parties,
            size=size,
            homepage_url=homepage_url,
            download_url=f"https://hub.docker.com/layers/{namespace}/{purl.name}/{tag_name}/images/{digest}",
        )

        package = scan_models.PackageData.from_data(download_data)
        package.datasource_id = "dockerhub_repositories"
        package.set_purl(purl)
        yield package

0 commit comments

Comments
 (0)