diff --git a/rdflib/plugins/parsers/nquads.py b/rdflib/plugins/parsers/nquads.py index d44139c57..7c584cb4e 100644 --- a/rdflib/plugins/parsers/nquads.py +++ b/rdflib/plugins/parsers/nquads.py @@ -24,18 +24,27 @@ """ from codecs import getreader +import re from rdflib import ConjunctiveGraph # Build up from the NTriples parser: from rdflib.plugins.parsers.ntriples import W3CNTriplesParser from rdflib.plugins.parsers.ntriples import ParseError -from rdflib.plugins.parsers.ntriples import r_tail -from rdflib.plugins.parsers.ntriples import r_wspace +from rdflib.plugins.parsers.ntriples import tail +from rdflib.plugins.parsers.ntriples import wspace +from rdflib.plugins.parsers.ntriples import wspaces +from rdflib.plugins.parsers.ntriples import r_comment_or_empty __all__ = ["NQuadsParser"] +r_uriref_predicate_object_context = re.compile(wspace + r"([<_][^ ]+)" + + wspaces + r"(<[^ ]+)" + + wspaces + r'(".*[^\\]"[^ \t]*|<[^>]*>|_[^ ]*)' + + wspaces + r"([^ ]+)?" + + tail) + class NQuadsParser(W3CNTriplesParser): def parse(self, inputsource, sink, bnode_context=None, **kwargs): """ @@ -63,37 +72,37 @@ def parse(self, inputsource, sink, bnode_context=None, **kwargs): raise ParseError("Item to parse must be a file-like object.") self.file = source - self.buffer = "" - while True: - self.line = __line = self.readline() - if self.line is None: - break - try: - self.parseline(bnode_context) - except ParseError as msg: - raise ParseError("Invalid line (%s):\n%r" % (msg, __line)) + return self.processing_loop(bnode_context) - return self.sink + def parseline(self, the_line, bnode_context=None): + # This splits the line into four component because this is a quad. + # The logic is similar for triples, except the context as fourth component. + m = r_uriref_predicate_object_context.match(the_line) + if not m: + # Very rare case, so performances are less important. + if r_comment_or_empty.match(the_line): + return # The line is a comment + raise ParseError("Not a quad") - def parseline(self, bnode_context=None): - self.eat(r_wspace) - if (not self.line) or self.line.startswith(("#")): - return # The line is empty or a comment + first_token, second_token, third_token, fourth_token, _ = m.groups() - subject = self.subject(bnode_context) - self.eat(r_wspace) + subject = self.uriref(first_token) or self.nodeid(first_token, bnode_context) + if not subject: + raise ParseError("Subject must be uriref or nodeID") - predicate = self.predicate() - self.eat(r_wspace) + predicate = self.uriref(second_token) + if not predicate: + raise ParseError("Predicate must be uriref") - obj = self.object(bnode_context) - self.eat(r_wspace) + obj = self.uriref(third_token) or self.nodeid(third_token, bnode_context) or self.literal(third_token) + if obj is False: + raise ParseError("Unrecognised object type") - context = self.uriref() or self.nodeid(bnode_context) or self.sink.identifier - self.eat(r_tail) + if fourth_token: + context = self.uriref(fourth_token) or self.nodeid(fourth_token, bnode_context) + else: + context = self.sink.identifier - if self.line: - raise ParseError("Trailing garbage") # Must have a context aware store - add on a normal Graph # discards anything where the ctx != graph.identifier self.sink.get_context(context).add((subject, predicate, obj)) diff --git a/rdflib/plugins/parsers/ntriples.py b/rdflib/plugins/parsers/ntriples.py index e728fc354..fc15c8fdc 100644 --- a/rdflib/plugins/parsers/ntriples.py +++ b/rdflib/plugins/parsers/ntriples.py @@ -21,20 +21,30 @@ __all__ = ["unquote", "uriquote", "W3CNTriplesParser", "NTGraphSink", "NTParser"] uriref = r'<([^:]+:[^\s"<>]*)>' +# Consider a possibly faster regex: '(".*[^\\]"') literal = r'"([^"\\]*(?:\\.[^"\\]*)*)"' litinfo = r"(?:@([a-zA-Z]+(?:-[a-zA-Z0-9]+)*)|\^\^" + uriref + r")?" +wspace = r"[ \t]*" +wspaces = r"[ \t]+" +tail = r"[ \t]*\.[ \t]*(#.*)?" r_line = re.compile(r"([^\r\n]*)(?:\r\n|\r|\n)") -r_wspace = re.compile(r"[ \t]*") -r_wspaces = re.compile(r"[ \t]+") -r_tail = re.compile(r"[ \t]*\.[ \t]*(#.*)?") +r_wspace = re.compile(wspace) +r_wspaces = re.compile(wspaces) +r_tail = re.compile(tail) r_uriref = re.compile(uriref) r_nodeid = re.compile(r"_:([A-Za-z0-9_:]([-A-Za-z0-9_:\.]*[-A-Za-z0-9_:])?)") r_literal = re.compile(literal + litinfo) +# Should use wspace as soon as read in text mode. +r_comment_or_empty = re.compile(r"[ \t\r]*" + "(#.*)?$") -bufsiz = 2048 -validate = False +# https://www.w3.org/TR/n-triples/ +# The last item is a uriref (terminated by >), or a literal (terminated by ") or a node id (terminated by -A-Za-z0-9_:) +r_uriref_predicate_object = re.compile(wspace + r"([<_][^ ]+)" + + wspaces + r"(<[^ >]+>)" + + wspaces + r"([<_\"].*[-A-Za-z0-9_:\">])" + tail) +bufsiz = 2048 class DummySink(object): def __init__(self): @@ -51,54 +61,67 @@ def triple(self, s, p, o): r_uniquot = re.compile(r"\\u([0-9A-F]{4})|\\U([0-9A-F]{8})") -def unquote(s): - """Unquote an N-Triples string.""" - if not validate: - - if isinstance(s, str): # nquads - s = decodeUnicodeEscape(s) +def _unquote_validate(s): + """Unquote an N-Triples string in validation mode.""" + result = [] + while s: + m = r_safe.match(s) + if m: + s = s[m.end() :] + result.append(m.group(1)) + continue + + m = r_quot.match(s) + if m: + s = s[2:] + result.append(quot[m.group(1)]) + continue + + m = r_uniquot.match(s) + if m: + s = s[m.end() :] + u, U = m.groups() + codepoint = int(u or U, 16) + if codepoint > 0x10FFFF: + raise ParseError("Disallowed codepoint: %08X" % codepoint) + result.append(chr(codepoint)) + elif s.startswith("\\"): + raise ParseError("Illegal escape at: %s..." % s[:10]) else: - s = s.decode("unicode-escape") + raise ParseError("Illegal literal character: %r" % s[0]) + return "".join(result) - return s - else: - result = [] - while s: - m = r_safe.match(s) - if m: - s = s[m.end() :] - result.append(m.group(1)) - continue - - m = r_quot.match(s) - if m: - s = s[2:] - result.append(quot[m.group(1)]) - continue - - m = r_uniquot.match(s) - if m: - s = s[m.end() :] - u, U = m.groups() - codepoint = int(u or U, 16) - if codepoint > 0x10FFFF: - raise ParseError("Disallowed codepoint: %08X" % codepoint) - result.append(chr(codepoint)) - elif s.startswith("\\"): - raise ParseError("Illegal escape at: %s..." % s[:10]) - else: - raise ParseError("Illegal literal character: %r" % s[0]) - return "".join(result) + +def _unquote_not_validate(s): + """Unquote an N-Triples string if no validation is needed.""" + # Maybe there are no escape char, so no need to decode. + if "\\" in s: + s = decodeUnicodeEscape(s) + + return s r_hibyte = re.compile(r"([\x80-\xFF])") -def uriquote(uri): - if not validate: - return uri +def _uriquote_validate(uri): + return r_hibyte.sub(lambda m: "%%%02X" % ord(m.group(1)), uri) + + +# This sets the proper functions to be used, +# and these functions do not need to check the validate flag. +def validate(value): + global unquote + global uriquote + if value: + unquote = _unquote_validate + uriquote = _uriquote_validate else: - return r_hibyte.sub(lambda m: "%%%02X" % ord(m.group(1)), uri) + unquote = _unquote_not_validate + # uriquote does not do anything when no validation is needed. + uriquote = lambda x: x + +validate(False) class W3CNTriplesParser(object): @@ -127,9 +150,7 @@ def __init__(self, sink=None, bnode_context=None): else: self.sink = DummySink() - self.buffer = None self.file = None - self.line = "" def parse(self, f, bnode_context=None): """ @@ -143,7 +164,6 @@ def parse(self, f, bnode_context=None): passed in to define a distinct context for a given call to `parse`. """ - if not hasattr(f, "read"): raise ParseError("Item to parse must be a file-like object.") @@ -152,15 +172,17 @@ def parse(self, f, bnode_context=None): f = codecs.getreader("utf-8")(f) self.file = f - self.buffer = "" + return self.processing_loop(bnode_context) + + def processing_loop(self, bnode_context): while True: - self.line = self.readline() - if self.line is None: + the_line = self.file.readline() + if not the_line: break try: - self.parseline(bnode_context=bnode_context) + self.parseline(the_line, bnode_context=bnode_context) except ParseError: - raise ParseError("Invalid line: {}".format(self.line)) + raise ParseError("Invalid line: {}".format(the_line)) return self.sink def parsestring(self, s, **kwargs): @@ -176,90 +198,47 @@ def parsestring(self, s, **kwargs): def readline(self): """Read an N-Triples line from buffered input.""" # N-Triples lines end in either CRLF, CR, or LF - # Therefore, we can't just use f.readline() - if not self.buffer: - buffer = self.file.read(bufsiz) - if not buffer: - return None - self.buffer = buffer + return self.file.readline() - while True: - m = r_line.match(self.buffer) - if m: # the more likely prospect - self.buffer = self.buffer[m.end() :] - return m.group(1) - else: - buffer = self.file.read(bufsiz) - if not buffer and not self.buffer.isspace(): - # Last line does not need to be terminated with a newline - buffer += "\n" - elif not buffer: - return None - self.buffer += buffer - - def parseline(self, bnode_context=None): - self.eat(r_wspace) - if (not self.line) or self.line.startswith("#"): - return # The line is empty or a comment - - subject = self.subject(bnode_context) - self.eat(r_wspaces) - - predicate = self.predicate() - self.eat(r_wspaces) - - object_ = self.object(bnode_context) - self.eat(r_tail) - - if self.line: - raise ParseError("Trailing garbage: {}".format(self.line)) - self.sink.triple(subject, predicate, object_) + def parseline(self, the_line, bnode_context=None): + # This splits the line into three components. + m = r_uriref_predicate_object.match(the_line) + if not m: + # Very rare case, so performances are less important. + if r_comment_or_empty.match(the_line): + return # The line is a comment + raise ParseError("Not a triple") - def peek(self, token): - return self.line.startswith(token) - - def eat(self, pattern): - m = pattern.match(self.line) - if not m: # @@ Why can't we get the original pattern? - # print(dir(pattern)) - # print repr(self.line), type(self.line) - raise ParseError("Failed to eat %s at %s" % (pattern.pattern, self.line)) - self.line = self.line[m.end() :] - return m - - def subject(self, bnode_context=None): - # @@ Consider using dictionary cases - subj = self.uriref() or self.nodeid(bnode_context) - if not subj: + first_token, second_token, third_token, _ = m.groups() + + subject = self.uriref(first_token) or self.nodeid(first_token, bnode_context) + if not subject: raise ParseError("Subject must be uriref or nodeID") - return subj - def predicate(self): - pred = self.uriref() - if not pred: + predicate = self.uriref(second_token) + if not predicate: raise ParseError("Predicate must be uriref") - return pred - def object(self, bnode_context=None): - objt = self.uriref() or self.nodeid(bnode_context) or self.literal() - if objt is False: + object_ = self.uriref(third_token) or self.nodeid(third_token, bnode_context) or self.literal(third_token) + if object_ is False: raise ParseError("Unrecognised object type") - return objt - def uriref(self): - if self.peek("<"): - uri = self.eat(r_uriref).group(1) + self.sink.triple(subject, predicate, object_) + + def uriref(self, the_string): + if the_string[0] == "<": + # This strips the opening and closing brackets. + uri = the_string[1:-1] uri = unquote(uri) uri = uriquote(uri) return URI(uri) return False - def nodeid(self, bnode_context=None): - if self.peek("_"): + def nodeid(self, bnode_id, bnode_context=None): + if bnode_id[0] == "_": # Fix for https://github.com/RDFLib/rdflib/issues/204 if bnode_context is None: bnode_context = self._bnode_ids - bnode_id = self.eat(r_nodeid).group(1) new_id = bnode_context.get(bnode_id, None) if new_id is not None: # Re-map to id specfic to this doc @@ -272,21 +251,19 @@ def nodeid(self, bnode_context=None): return bnode return False - def literal(self): - if self.peek('"'): - lit, lang, dtype = self.eat(r_literal).groups() - if lang: - lang = lang - else: + def literal(self, the_string): + if the_string[0] == '"': + lit, lang, dtype = r_literal.match(the_string).groups() + if not lang: lang = None if dtype: dtype = unquote(dtype) dtype = uriquote(dtype) dtype = URI(dtype) + if lang: + raise ParseError("Can't have both a language and a datatype") else: dtype = None - if lang and dtype: - raise ParseError("Can't have both a language and a datatype") lit = unquote(lit) return Literal(lit, lang, dtype) return False diff --git a/setup.py b/setup.py index 8dd82c2b8..b733ec725 100644 --- a/setup.py +++ b/setup.py @@ -2,7 +2,36 @@ import os import re -from setuptools import setup, find_packages +import sys +import importlib +from distutils.core import setup +from setuptools import find_packages +from setuptools.command.build_ext import build_ext +from distutils.command.clean import clean + +# See https://gist.github.com/ctokheim/6c34dc1d672afca0676a for more details. +try: + from Cython.Build import cythonize + language_level = "3" +except ImportError: + cythonize = None + +# If cython is present, it is used by default. +if cythonize: + if '--without-cython' in sys.argv: + USE_CYTHON = False + sys.argv.remove('--without-cython') + elif '--no-cython' in sys.argv: + USE_CYTHON = False + sys.argv.remove('--no-cython') + else: + USE_CYTHON = True +else: + USE_CYTHON = False + +if 'clean' in sys.argv: + # Maybe Cython cannot work for some reason, and it would prevent cleaning up. + USE_CYTHON = False kwargs = {} kwargs["install_requires"] = ["isodate", "pyparsing"] @@ -37,6 +66,127 @@ def find_version(filename): # install examples, to get docstrings packages.append("examples") +class BuildExtCommand(build_ext): + """ + A custom command to build Cython extensions. + + This is needed otherwise setup does not build or rebuild C files from Python source files. + FIXME: Not sure why the presence of this custom command which does not do much, fix the build issue. + + Usage example: python setup.py build_ext --inplace + It ignores build-lib and put compiled extensions into the source directory alongside the pure Python modules. + These libraries are prioritized before *.py files by Python, and loaded instead. + """ + + description = 'Build Cython extensions' + + def initialize_options(self): + print("build_ext.initialize_options") + build_ext.initialize_options(self) + + def finalize_options(self): + """Post-process options.""" + print("build_ext.finalize_options") + build_ext.finalize_options(self) + + def run(self): + """Run command.""" + print("build_ext.run") + build_ext.run(self) + + +def _cythonizable_source_files(): + """ + This returns the list of absolute paths of files to be cythonized. + + Only a subset of files are compiled. + """ + survol_base_dir = os.path.join(os.path.dirname(__file__), "rdflib") + src_files = [] + + basenames_list = [ + "collection.py", + "compare.py", + "compat.py", + "container.py", + "events.py", + "exceptions.py", + "graph.py", + "namespace.py", + "parser.py", + "paths.py", + "plugin.py", + "query.py", + "resource.py", + "serializer.py", + "store.py", + "term.py", + "util.py", + "void.py", + "plugins/parsers/notation3.py", + "plugins/stores/memory.py", + ] + + for one_filename in basenames_list: + one_path_name = os.path.join(survol_base_dir, one_filename) + src_files.append(one_path_name) + return src_files + + +class CleanCommand(clean): + """ + Clean build including iniplace built extensions. + """ + + description = 'Clean build including in-place built extensions.' + + def _cleanup_libs(self): + def remove_lib_file(lib_path): + try: + os.remove(lib_path) + print("removed cythonized file:", lib_path) + except: + print("Cannot remove:", lib_path) + + src_files = _cythonizable_source_files() + for one_file in src_files: + assert one_file.endswith(".py") + file_without_extension = os.path.splitext(one_file)[0] + if sys.platform.startswith("lin") or sys.platform == "darwin": + lib_path = file_without_extension + ".so" + remove_lib_file(lib_path) + else: + # for example: ['.cp36-win_amd64.pyd', '.pyd'] + for one_suffix in importlib.machinery.EXTENSION_SUFFIXES: + # The file name might be something like: "collection.cp36-win_amd64.pyd" + lib_path = file_without_extension + one_suffix + remove_lib_file(lib_path) + + def run(self): + """Run command.""" + print("removing in-place built libs") + self._cleanup_libs() + clean.run(self) + +kwargs["cmdclass"]={ + 'build_ext': BuildExtCommand, + 'clean': CleanCommand, + } + +if USE_CYTHON: + kwargs["ext_modules"]=cythonize( + _cythonizable_source_files(), + build_dir="build_cythonize", + # nthreads = 3, + annotate=True, + compiler_directives={'language_level': language_level}) + + kwargs["options"]={ + 'build': {'build_lib': 'build_build_ext'}, + } + + + setup( name="rdflib", version=version, @@ -97,3 +247,4 @@ def find_version(filename): }, **kwargs ) + diff --git a/test/test_nt_misc.py b/test/test_nt_misc.py index 164776b8c..c74cc011a 100644 --- a/test/test_nt_misc.py +++ b/test/test_nt_misc.py @@ -68,39 +68,39 @@ def test_sink(self): def test_nonvalidating_unquote(self): safe = """ .""" - ntriples.validate = False + ntriples.validate(False) res = ntriples.unquote(safe) self.assertTrue(isinstance(res, str)) def test_validating_unquote(self): quot = """ .""" - ntriples.validate = True + ntriples.validate(True) res = ntriples.unquote(quot) # revert to default - ntriples.validate = False + ntriples.validate(False) log.debug("restype %s" % type(res)) def test_validating_unquote_raises(self): - ntriples.validate = True + ntriples.validate(True) uniquot = """ "R\\u00E4ksm\\u00F6rg\\u00E5s" .""" self.assertRaises(ntriples.ParseError, ntriples.unquote, uniquot) uniquot = """ "R\\\\u00E4ksm\\u00F6rg\\u00E5s" .""" self.assertRaises(ntriples.ParseError, ntriples.unquote, uniquot) # revert to default - ntriples.validate = False + ntriples.validate(False) def test_nonvalidating_uriquote(self): - ntriples.validate = False + ntriples.validate(False) safe = """ .""" res = ntriples.uriquote(safe) self.assertTrue(res == safe) def test_validating_uriquote(self): - ntriples.validate = True + ntriples.validate(True) uniquot = """ "R\\u00E4ksm\\u00F6rg\\u00E5s" .""" res = ntriples.uriquote(uniquot) # revert to default - ntriples.validate = False + ntriples.validate(False) self.assertEqual(res, uniquot) def test_W3CNTriplesParser_fpath(self): @@ -136,25 +136,6 @@ def test_bad_line(self): p = ntriples.W3CNTriplesParser() self.assertRaises(ntriples.ParseError, p.parsestring, data) - def test_cover_eat(self): - data = ( - """ 3 .\n""" - ) - p = ntriples.W3CNTriplesParser() - p.line = data - self.assertRaises( - ntriples.ParseError, p.eat, re.compile("") - ) - - def test_cover_subjectobjectliteral(self): - # data = ''' 3 .\n''' - p = ntriples.W3CNTriplesParser() - p.line = "baz" - self.assertRaises(ntriples.ParseError, p.subject) - self.assertRaises(ntriples.ParseError, p.object) - # p.line = '"baz"@fr^^' - # self.assertRaises(ntriples.ParseError, p.literal) - class BNodeContextTestCase(unittest.TestCase): def test_bnode_shared_across_instances(self):