Skip to content

Commit 7773e81

Browse files
progval authored and edmondchuc committed
jsonld: Do not merge nodes with different invalid URIs (#3011)
When parsing JSON-LD with invalid URIs in the `@id`, the `generalized_rdf: True` option allows parsing these nodes as blank nodes instead of outright rejecting the document.

However, all nodes with invalid URIs were mapped to the same blank node, resulting in incorrect data. For example, without this patch, the new test fails with:

```
AssertionError: Expected:
@prefix schema: <https://schema.org/> .

<https://example.org/root-object> schema:author [ schema:familyName "Doe" ;
            schema:givenName "Jane" ;
            schema:name "Jane Doe" ],
        [ schema:familyName "Doe" ;
            schema:givenName "John" ;
            schema:name "John Doe" ] .

Got:
@prefix schema: <https://schema.org/> .

<https://example.org/root-object> schema:author <> .

<> schema:familyName "Doe" ;
    schema:givenName "Jane", "John" ;
    schema:name "Jane Doe", "John Doe" .
```
1 parent ec56b7f commit 7773e81

File tree

4 files changed

+49
-1
lines changed

4 files changed

+49
-1
lines changed

rdflib/plugins/parsers/jsonld.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@
3434
# we should consider streaming the input to deal with arbitrarily large graphs.
3535
from __future__ import annotations
3636

37+
import secrets
3738
import warnings
3839
from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Union
3940

@@ -221,6 +222,7 @@ def __init__(
221222
if allow_lists_of_lists is not None
222223
else ALLOW_LISTS_OF_LISTS
223224
)
225+
self.invalid_uri_to_bnode: dict[str, BNode] = {}
224226

225227
def parse(self, data: Any, context: Context, dataset: Graph) -> Graph:
226228
topcontext = False
@@ -629,7 +631,12 @@ def _to_rdf_id(self, context: Context, id_val: str) -> Optional[IdentifiedNode]:
629631
uri = context.resolve(id_val)
630632
if not self.generalized_rdf and ":" not in uri:
631633
return None
632-
return URIRef(uri)
634+
node: IdentifiedNode = URIRef(uri)
635+
if not str(node):
636+
if id_val not in self.invalid_uri_to_bnode:
637+
self.invalid_uri_to_bnode[id_val] = BNode(secrets.token_urlsafe(20))
638+
node = self.invalid_uri_to_bnode[id_val]
639+
return node
633640

634641
def _get_bnodeid(self, ref: str) -> Optional[str]:
635642
if not ref.startswith("_:"):

test/jsonld/local-suite/manifest.jsonld

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,17 @@
2727
"purpose": "Multiple @id aliases. Issue #2164",
2828
"input": "toRdf-twoimports-in.jsonld",
2929
"expect": "toRdf-twoimports-out.nq"
30+
},
31+
{
32+
"@id": "#toRdf-two-invalid-ids",
33+
"@type": ["jld:PositiveEvaluationTest", "jld:ToRDFTest"],
34+
"name": "Two invalid identifiers",
35+
"purpose": "Multiple nodes with invalid @ids are not merged together.",
36+
"option": {
37+
"produceGeneralizedRdf": true
38+
},
39+
"input": "toRdf-twoinvalidids-in.jsonld",
40+
"expect": "toRdf-twoinvalidids-out.nq"
3041
}
3142
]
3243
}
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
{
2+
"@id": "https://example.org/root-object",
3+
"https://schema.org/author": [
4+
{
5+
"@id": "https://example.org/ invalid url 1",
6+
"https://schema.org/name": "Jane Doe"
7+
},
8+
{
9+
"@id": "https://example.org/ invalid url 1",
10+
"https://schema.org/givenName": "Jane",
11+
"https://schema.org/familyName": "Doe"
12+
},
13+
{
14+
"@id": "https://example.org/ invalid url 2",
15+
"https://schema.org/name": "John Doe",
16+
"https://schema.org/givenName": "John",
17+
"https://schema.org/familyName": "Doe"
18+
}
19+
]
20+
}
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
2+
<https://example.org/root-object> <https://schema.org/author> _:b1.
3+
<https://example.org/root-object> <https://schema.org/author> _:b2.
4+
5+
_:b1 <https://schema.org/name> "Jane Doe".
6+
_:b1 <https://schema.org/givenName> "Jane".
7+
_:b1 <https://schema.org/familyName> "Doe".
8+
_:b2 <https://schema.org/name> "John Doe".
9+
_:b2 <https://schema.org/givenName> "John".
10+
_:b2 <https://schema.org/familyName> "Doe".

0 commit comments

Comments
 (0)