# Cache of compiled regular expressions, keyed by glob pattern.
_cache = {}

# Different variants to handle number ranges.
#
# - AS_IS and ZEROS also match numbers with a leading '+'
# - AS_IS and ZEROS also match the zero as '-0', '+0' and plain '0'
# - ZEROS allows any number of leading zeros, so '1' also matches '00001'
# - JUSTIFIED handles leading zeros like bash. So {01..10} matches '01' but
#   not '1'. A leading '+' is not allowed.

# handle numbers as is
AS_IS = 0
# allow any number of leading zeros
ZEROS = 1
# handle leading zeros like bash
JUSTIFIED = 2

NUMBER_MODE = JUSTIFIED

# Characters that must be escaped in square brackets.
# Note: The '-' might be used for a range ('a-z') and '^' for negation,
# so be careful!
CHARACTER_CLASS_SPECIAL = "^-]\\"

# Regex to check if a number has at least 2 digits with a leading zero.
LEADING_ZERO = re.compile(
    r"""
    ^       # match at start of string
    [-+] ?  # optional sign ('+' or '-')
    0       # a leading zero
    \d      # followed by another digit
    """, re.VERBOSE
)

# translating glob as a top-level re
DEFAULT = 0
# translating glob as the part inside a choice ('{}')
IN_BRACES = 1


def fnmatchcase(name, pat):
    """Test whether FILENAME matches PATTERN, including case.

    This is a version of fnmatch() which doesn't case-normalize
    its arguments.

    Returns True or False. Compiled patterns are kept in ``_cache``.
    """
    regex = _cache.get(pat)
    if regex is None:
        regex = re.compile(translate(pat), re.DOTALL)
        _cache[pat] = regex
    # Return a real boolean (as the function did before the rewrite) instead
    # of leaking the Match object to callers.
    return regex.match(name) is not None


def translate(pat):
    """Translate a glob PATTERN to an anchored regular expression string."""
    return r'^%s$' % doTranslate(pat, DEFAULT)


def doTranslate(pat, state):
    """Translate editorconfig shell GLOB PATTERN to a regular expression.

    ``state`` is DEFAULT for a top-level pattern and IN_BRACES while the
    inside of a ``{a,b}`` choice is translated; in the latter case an
    unescaped ',' becomes the alternation operator '|'.

    The result is not anchored; see translate().
    """
    index = 0
    length = len(pat)
    parts = []

    while index < length:
        current_char = pat[index]
        index += 1
        if current_char == '*':
            if index < length and pat[index] == '*':
                # '**' also crosses directory separators
                parts.append('.*')
                index += 1
            else:
                parts.append('[^/]*')
        elif current_char == '?':
            parts.append('[^/]')
        elif current_char == '[':
            pos = getClosingBracketIndex(pat, index, state)
            char_class = ''
            if pos >= 0:
                char_class = handleCharacterClass(pat, index, pos)
            if char_class in ('', '^'):
                # Unclosed, contains a slash (or a choice separator), or is
                # empty ('[]', '[!]', '[^]'). An empty class would be an
                # invalid regex, so the bracket is taken literally.
                parts.append('\\[')
            else:
                parts.append('[' + char_class + ']')
                index = pos + 1
        elif current_char == '{':
            pos, has_comma = getClosingBracesIndex(pat, index)
            if pos < 0:
                # unclosed brace: literal '{'
                parts.append('\\{')
            elif has_comma:
                inner = doTranslate(pat[index:pos], IN_BRACES)
                parts.append('(?:' + inner + ')')
                index = pos + 1
            else:
                # NUMERIC_RANGE is a module-level pattern defined elsewhere
                # in this file; groups 1 and 2 are used as the two bounds.
                num_range = NUMERIC_RANGE.match(pat[index:pos])
                if num_range:
                    parts.append(globRange2Re(num_range.group(1),
                                              num_range.group(2)))
                else:
                    inner = doTranslate(pat[index:pos], DEFAULT)
                    parts.append('\\{%s\\}' % inner)
                index = pos + 1
        elif current_char == ',':
            parts.append('|' if state == IN_BRACES else '\\,')
        elif current_char == '/':
            if pat[index:(index + 3)] == '**/':
                # '/**/' matches a single slash or any number of directories
                parts.append("(?:/|/.*/)")
                index += 3
            else:
                parts.append('/')
        elif current_char == '\\':
            # A backslash escapes the next character. A lone backslash at
            # the very end of the pattern stands for itself.
            if index < length:
                current_char = pat[index]
                index += 1
            parts.append(re.escape(current_char))
        else:
            parts.append(re.escape(current_char))

    return ''.join(parts)


def getClosingBracketIndex(pat, start, state):
    """Find a closing bracket in pat starting from start.

    Return the index of the closing bracket.
    Returns -1 if
    - no closing bracket was found
    - a slash was found
    - a comma was found AND the given state is IN_BRACES

    A character preceded by a backslash is skipped without inspection.
    """
    length = len(pat)
    index = start
    while index < length and pat[index] != ']':
        char = pat[index]
        if char == '/':
            return -1
        if char == ',' and state == IN_BRACES:
            return -1
        if char == '\\':
            index += 1  # skip the escaped character
        index += 1

    return index if index < length else -1


def getClosingBracesIndex(pat, start):
    """Find a closing brace in pat starting from start.

    Returns the index of the closing brace and whether a comma was found.
    If no closing brace was found, returns the index -1. In that
    case it is irrelevant whether a comma was found or not.

    Properly closed nested braces are skipped as a whole, so their commas
    do not count for the outer pair.
    """
    length = len(pat)
    index = start
    has_comma = False
    while index < length and pat[index] != '}':
        char = pat[index]
        if char == ',':
            has_comma = True
        elif char == '{':
            pos, inner_comma = getClosingBracesIndex(pat, index + 1)
            if pos >= 0:
                index = pos
            elif inner_comma:
                has_comma = True
        elif char == '\\':
            index += 1  # skip the escaped character
        index += 1

    if index >= length:
        index = -1

    return index, has_comma


def handleCharacterClass(pat, start, end):
    """Translate the inside of a glob character class to regex syntax.

    ``pat[start:end]`` is the text between '[' and the closing ']', which
    sits at index ``end``. A leading '!' or '^' becomes the regex negation
    '^'. Returns the class body without the surrounding brackets.
    """
    index = start
    result = ''

    if pat[index] in ('!', '^'):
        index += 1
        result += '^'

    while index < end:
        char = pat[index]
        if char == '\\':
            if (index + 1) < end:
                # escaped character: keep it, escaping it for the regex
                # only when it is special inside a class
                index += 1
                if pat[index] in CHARACTER_CLASS_SPECIAL:
                    result += '\\'
                result += pat[index]
            else:
                result += '\\\\'
        elif char == '-':
            # unescaped '-' keeps its range meaning
            result += char
        else:
            if char in CHARACTER_CLASS_SPECIAL:
                result += '\\'
            result += char
        index += 1
    return result


def globRange2Re(lower, upper, mode=None):
    r"""Create a regular expression for a range of numbers.

    ``lower`` and ``upper`` are the bounds as written in the glob (strings,
    optionally signed); they may be given in either order. ``mode`` is one
    of AS_IS, ZEROS or JUSTIFIED and defaults to NUMBER_MODE.

    Examples for JUSTIFIED:
    - {0..15}  -> (?:[0-9]|1[0-5])
    - {00..15} -> (?:0[0-9]|1[0-5])
    - {-03..5} -> (?:\-(?:0[1-3])|00[0-5])
    """
    if mode is None:
        mode = NUMBER_MODE

    width = -1
    if mode == JUSTIFIED and (LEADING_ZERO.match(lower) or
                              LEADING_ZERO.match(upper)):
        width = max(len(lower.replace('+', '')), len(upper.replace('+', '')))

    low_num = int(lower)
    up_num = int(upper)
    start = min(low_num, up_num)
    end = max(low_num, up_num)

    neg_part = ''
    if start < 0:
        if end < 0:
            neg_start = -end
        else:
            # In JUSTIFIED mode zero is only produced by the positive part.
            neg_start = 1 if mode == JUSTIFIED else 0
        # The '-' sign occupies one column of the justified width (as with
        # printf's "%0*d", which bash uses), also when the whole range is
        # negative.
        neg_part = num_re(width - 1, neg_start, -start, '')
        if mode == ZEROS:
            neg_part = "\\-0*(?:%s)" % neg_part
        else:
            neg_part = "\\-(?:%s)" % neg_part
        if end < 0:
            return "(?:%s)" % neg_part
        neg_part += '|'
        start = 0

    pos_part = num_re(width, start, end, '')

    if mode == JUSTIFIED:
        return '(?:%s%s)' % (neg_part, pos_part)
    elif mode == ZEROS:
        return '(?:%s\\+?0*(?:%s))' % (neg_part, pos_part)
    else:
        return '(?:%s\\+?(?:%s))' % (neg_part, pos_part)


def digits(num):
    """Return how many decimal digits the given integer has (sign ignored)."""
    return len(str(abs(num)))


def num_re(a_width, min, max, suffix):
    """Build regex alternatives matching every integer from min to max.

    ``min`` and ``max`` are non-negative with min <= max (the parameter
    names shadow the builtins and are kept for compatibility).
    ``a_width`` > 0 zero-pads every number to that many digits; values <= 0
    mean no padding. ``suffix`` is appended to the generated expression and
    is used by the recursive calls to carry trailing ``[0-9]`` groups.

    Returns the alternatives joined by '|' without a surrounding group.
    """
    width = a_width if a_width > 0 else 0
    width10s = (a_width - 1) if a_width > 0 else 0

    if min == max:
        return "%0*d%s" % (width, min, suffix)
    if min // 10 == max // 10:
        # only the last digit varies
        if min >= 10 or width10s > 0:
            return "%0*d[%d-%d]%s" % (width10s, min // 10,
                                      min % 10, max % 10, suffix)
        return "[%d-%d]%s" % (min % 10, max % 10, suffix)

    # Short cut for justified 0-99*
    if min == 0 and width >= digits(max) and digits(max) < digits(max + 1):
        return '0' * (width - digits(max)) + "[0-9]" * digits(max)

    alternatives = []

    # if min is not divisible by 10, create re to match the gap to the next
    # number divisible by 10
    if min == 0 or min % 10 != 0:
        new_min = (min // 10 + 1) * 10
        alternatives.append(num_re(width, min, new_min - 1, suffix))
    else:
        new_min = min

    # move new_min forward to have the same number of digits like max
    # create the needed re
    new_suffix = suffix + "[0-9]"
    div = 1
    while digits(new_min) < digits(max):
        div *= 10
        next_min = pow(10, digits(new_min))
        alternatives.append(num_re(width - digits(new_min) + 1,
                                   new_min // div, (next_min - 1) // div,
                                   new_suffix))
        new_min = next_min
        new_suffix += "[0-9]"

    # new_min now has the same number of digits like max
    div = pow(10, digits(new_min) - 1)
    while div > 1:
        new_max = (max // div) * div
        if (new_max + div - 1) == max:
            # special handling for numbers ending with '9'
            # We can handle it in this loop.
            new_max = max
        if new_min != new_max:
            # one "[0-9]" for every digit below the current magnitude
            new_suffix = ""
            x = div
            while x > 1:
                new_suffix += "[0-9]"
                x //= 10
            alternatives.append(num_re(width - digits(new_min) + 1,
                                       new_min // div, (new_max - 1) // div,
                                       new_suffix))
        new_min = new_max
        div //= 10

    if new_min < max:
        alternatives.append(num_re(width10s, new_min // 10, max // 10,
                                   "[0-%d]" % (max % 10)))
    elif new_min % 10 != 9:
        alternatives.append("%0*d" % (width, max))
    # else: The number ended with '9'/'99'/'999'... and was handled in the
    # loop above

    return "|".join(alternatives)
+glob_test("ShouldFail-*.c" "^ShouldFail$") +set_tests_properties("Glob2Re_ShouldFail-*.c" PROPERTIES WILL_FAIL TRUE) + +# basics +glob_test("*.c" "^[^/]*\\\\.c$") +glob_test("a b.c" "^a\\\\ b\\\\.c$") +glob_test("test/**/b.c" "^test(?:/|/.*/)b\\\\.c$") +glob_test("test/*/b.c" "^test/[^/]*/b\\\\.c$") +glob_test("**/b.c" "^.*/b\\\\.c$") +glob_test("ab?.c" "^ab[^/]\\\\.c$") + +# escaped special chars +glob_test("a\\\\*.c" "^a\\\\*\\\\.c$") +glob_test("a\\\\?.c" "^a\\\\?\\\\.c$") +glob_test("a\\\\{.c" "^a\\\\{\\\\.c$") +glob_test("a\\\\[.c" "^a\\\\[\\\\.c$") + +# brackets +glob_test("[abc]" "^[abc]$") +glob_test("[a-z]" "^[a-z]$") + +# support ^ and ! as negation +glob_test("[!abc]" "^[^abc]$") +glob_test("[^abc]" "^[^abc]$") + +glob_test("[abc[]" "^[abc[]$") +glob_test("[abc\\\\[]" "^[abc[]$") +glob_test("[abc\\\\]]" "^[abc\\\\]]$") +glob_test("[abc\\\\[\\\\]]" "^[abc[\\\\]]$") +glob_test("[abc[\\\\]]" "^[abc[\\\\]]$") + +glob_test("[test[abc]" "^[test[abc]$") +glob_test("[test\\\\[abc]" "^[test[abc]$") +glob_test("[test\\\\]abc]" "^[test\\\\]abc]$") +glob_test("[test]abc]" "^[test]abc\\\\]$") +glob_test("[test" "^\\\\[test$") +glob_test("]test" "^\\\\]test$") + +# choice +glob_test("{}" "^\\\\{\\\\}$") +glob_test("{test}" "^\\\\{test\\\\}$") +glob_test("{test,case}" "^(?:test|case)$") +glob_test("{test,case,}" "^(?:test|case|)$") +glob_test("{test,case,[!abc]}" "^(?:test|case|[^abc])$") + +# comma inside brackets is choice separator +glob_test("{test[a,b]case,2}" "^(?:test\\\\[a|b\\\\]case|2)$") +glob_test("{test[a\\\\,b]case,2}" "^(?:test[a,b]case|2)$") + +# choice in choice +glob_test("test.{a-{first,second},b-{third,fourth}}" + "^test\\\\.(?:a\\\\-(?:first|second)|b\\\\-(?:third|fourth))$") + +# incomplete outer braces +glob_test("{test{test,case}" "^\\\\{test(?:test|case)$") +glob_test("test{test,case}}" "^test(?:test|case)\\\\}$") + +# back to back braces +glob_test("}test{" "^\\\\}test\\\\{$") + +# simple number ranges +glob_test("{0..9}" "^(?:[0-9])$") 
+glob_test("{1..5}" "^(?:[1-5])$") +glob_test("{+1..5}" "^(?:[1-5])$") +glob_test("{1..+5}" "^(?:[1-5])$") +glob_test("{+1..+5}" "^(?:[1-5])$") +glob_test("{5..1}" "^(?:[1-5])$") +glob_test("{0..5}" "^(?:[0-5])$") +glob_test("{0..15}" "^(?:[0-9]|1[0-5])$") +glob_test("{9..15}" "^(?:9|1[0-5])$") +glob_test("{10..15}" "^(?:1[0-5])$") +glob_test("{11..15}" "^(?:1[1-5])$") +glob_test("{11..20}" "^(?:1[1-9]|20)$") +glob_test("{11..23}" "^(?:1[1-9]|2[0-3])$") +glob_test("{11..32}" "^(?:1[1-9]|2[0-9]|3[0-2])$") +glob_test("{11..39}" "^(?:1[1-9]|[2-3][0-9])$") +glob_test("{1..39}" "^(?:[1-9]|[1-3][0-9])$") +glob_test("{0..39}" "^(?:[0-9]|[1-3][0-9])$") +glob_test("{0..999}" + "^(?:[0-9]|[1-9][0-9]|[1-9][0-9][0-9])$") +glob_test("{0..9999}" + "^(?:[0-9]|[1-9][0-9]|[1-9][0-9][0-9]|[1-9][0-9][0-9][0-9])$") +glob_test("{0..99999}" + "^(?:[0-9]|[1-9][0-9]|[1-9][0-9][0-9]|[1-9][0-9][0-9][0-9]|[1-9][0-9][0-9][0-9][0-9])$") + +glob_test("{-1..-5}" "^(?:\\\\-(?:[1-5]))$") +glob_test("{-5..-1}" "^(?:\\\\-(?:[1-5]))$") +glob_test("{0..-5}" "^(?:\\\\-(?:[1-5])|0)$") +glob_test("{-0..-5}" "^(?:\\\\-(?:[1-5])|0)$") +glob_test("{+0..-5}" "^(?:\\\\-(?:[1-5])|0)$") + +glob_test("{-3..5}" "^(?:\\\\-(?:[1-3])|[0-5])$") + +# justified number ranges +glob_test("{00..15}" "^(?:0[0-9]|1[0-5])$") +glob_test("{15..00}" "^(?:0[0-9]|1[0-5])$") +glob_test("{010..020}" "^(?:01[0-9]|020)$") +glob_test("{01..39}" "^(?:0[1-9]|[1-3][0-9])$") +glob_test("{00..39}" "^(?:0[0-9]|[1-3][0-9])$") +glob_test("{00..999}" + "^(?:[0-9][0-9][0-9])$") +glob_test("{0..0999}" + "^(?:0[0-9][0-9][0-9])$") +glob_test("{00..-999}" + "^(?:\\\\-(?:00[1-9]|0[1-9][0-9]|[1-9][0-9][0-9])|0000)$") +glob_test("{-00..9999}" + "^(?:[0-9][0-9][0-9][0-9])$") +glob_test("{0..-09999}" + "^(?:\\\\-(?:0000[1-9]|000[1-9][0-9]|00[1-9][0-9][0-9]|0[1-9][0-9][0-9][0-9])|000000)$") + +glob_test("{-03..5}" "^(?:\\\\-(?:0[1-3])|00[0-5])$") + + diff --git a/unit-test/create_re_utest.py b/unit-test/create_re_utest.py new file mode 100755 index 
#!/usr/bin/env python3
"""Unit-test helper: translate a glob and compare it with an expected regex.

Usage: create_re_utest <glob> <expected-regex>

Exit status:
    0  the translation equals the expected regex
    1  wrong number of command line arguments
    2  the translation differs from the expected regex
    3  the glob could not be translated (empty result)
"""

import inspect
import os
import sys


def _load_translate():
    """Import ``translate`` from the editorconfig package next to this dir."""
    currentdir = os.path.dirname(
        os.path.abspath(inspect.getfile(inspect.currentframe())))
    parentdir = os.path.dirname(currentdir)
    # Make the in-tree package win over any installed copy.
    sys.path.insert(0, parentdir)
    from editorconfig.fnmatch import translate
    return translate


def main(argv=None, translate=None):
    """Run the check and return the process exit status.

    ``argv`` defaults to ``sys.argv``. ``translate`` defaults to
    ``editorconfig.fnmatch.translate``; it is only imported when needed, so
    argument errors are reported even if the package cannot be loaded.
    """
    if argv is None:
        argv = sys.argv

    if len(argv) != 3:
        print("Usage: create_re_utest <glob> <expected-regex>")
        return 1

    pattern = argv[1]
    expected = argv[2]

    if translate is None:
        translate = _load_translate()
    regex = translate(pattern)

    if regex is None or regex == "":
        # The original passed the glob as a second print() argument, so the
        # "%s" placeholder was never filled in.
        print("ERROR: Can't translate \"%s\"\n" % pattern)
        return 3
    if regex != expected:
        print("ERROR: \"%s\" Expected: \"%s\" Got: \"%s\"\n"
              % (pattern, expected, regex))
        return 2
    return 0


if __name__ == "__main__":
    sys.exit(main())