chore: checkpoint before Python removal

This commit is contained in:
2026-03-26 22:33:59 +00:00
parent 683cec9307
commit e568ddf82a
29972 changed files with 11269302 additions and 2 deletions

127
vendor/regex/testdata/anchored.toml vendored Normal file
View File

@@ -0,0 +1,127 @@
# These tests are specifically geared toward searches with 'anchored = true'.
# While they are interesting in their own right, they are particularly
# important for testing the one-pass DFA since the one-pass DFA can't work in
# unanchored contexts.
#
# Note that "anchored" in this context does not mean "^". Anchored searches are
# searches whose matches must begin at the start of the search, which may not
# be at the start of the haystack. That's why anchored searches---and there are
# some examples below---can still report multiple matches. This occurs when the
# matches are adjacent to one another.
[[test]]
name = "greedy"
regex = '(abc)+'
haystack = "abcabcabc"
matches = [
[[0, 9], [6, 9]],
]
anchored = true
# When an "earliest" search is used, greediness doesn't really exist because
# matches are reported as soon as they are known.
[[test]]
name = "greedy-earliest"
regex = '(abc)+'
haystack = "abcabcabc"
matches = [
[[0, 3], [0, 3]],
[[3, 6], [3, 6]],
[[6, 9], [6, 9]],
]
anchored = true
search-kind = "earliest"
[[test]]
name = "nongreedy"
regex = '(abc)+?'
haystack = "abcabcabc"
matches = [
[[0, 3], [0, 3]],
[[3, 6], [3, 6]],
[[6, 9], [6, 9]],
]
anchored = true
# When "all" semantics are used, non-greediness doesn't exist since the longest
# possible match is always taken.
[[test]]
name = "nongreedy-all"
regex = '(abc)+?'
haystack = "abcabcabc"
matches = [
[[0, 9], [6, 9]],
]
anchored = true
match-kind = "all"
[[test]]
name = "word-boundary-unicode-01"
regex = '\b\w+\b'
haystack = 'βββ☃'
matches = [[0, 6]]
anchored = true
[[test]]
name = "word-boundary-nounicode-01"
regex = '\b\w+\b'
haystack = 'abcβ'
matches = [[0, 3]]
anchored = true
unicode = false
# Tests that '.c' doesn't match 'abc' when performing an anchored search from
# the beginning of the haystack. This test found two different bugs in the
# PikeVM and the meta engine.
[[test]]
name = "no-match-at-start"
regex = '.c'
haystack = 'abc'
matches = []
anchored = true
# Like above, but at a non-zero start offset.
[[test]]
name = "no-match-at-start-bounds"
regex = '.c'
haystack = 'aabc'
bounds = [1, 4]
matches = []
anchored = true
# This is like no-match-at-start, but hits the "reverse inner" optimization
# inside the meta engine. (no-match-at-start hits the "reverse suffix"
# optimization.)
[[test]]
name = "no-match-at-start-reverse-inner"
regex = '.c[a-z]'
haystack = 'abcz'
matches = []
anchored = true
# Like above, but at a non-zero start offset.
[[test]]
name = "no-match-at-start-reverse-inner-bounds"
regex = '.c[a-z]'
haystack = 'aabcz'
bounds = [1, 5]
matches = []
anchored = true
# Same as no-match-at-start, but applies to the meta engine's "reverse
# anchored" optimization.
[[test]]
name = "no-match-at-start-reverse-anchored"
regex = '.c[a-z]$'
haystack = 'abcz'
matches = []
anchored = true
# Like above, but at a non-zero start offset.
[[test]]
name = "no-match-at-start-reverse-anchored-bounds"
regex = '.c[a-z]$'
haystack = 'aabcz'
bounds = [1, 5]
matches = []
anchored = true

235
vendor/regex/testdata/bytes.toml vendored Normal file
View File

@@ -0,0 +1,235 @@
# These are tests specifically crafted for regexes that can match arbitrary
# bytes. In some cases, we also test the Unicode variant as well, just because
# it's good sense to do so. But also, these tests aren't really about Unicode,
# but whether matches are only reported at valid UTF-8 boundaries. For most
# tests in this entire collection, utf8 = true. But for these tests, we use
# utf8 = false.
[[test]]
name = "word-boundary-ascii"
regex = ' \b'
haystack = " δ"
matches = []
unicode = false
utf8 = false
[[test]]
name = "word-boundary-unicode"
regex = ' \b'
haystack = " δ"
matches = [[0, 1]]
unicode = true
utf8 = false
[[test]]
name = "word-boundary-ascii-not"
regex = ' \B'
haystack = " δ"
matches = [[0, 1]]
unicode = false
utf8 = false
[[test]]
name = "word-boundary-unicode-not"
regex = ' \B'
haystack = " δ"
matches = []
unicode = true
utf8 = false
[[test]]
name = "perl-word-ascii"
regex = '\w+'
haystack = "aδ"
matches = [[0, 1]]
unicode = false
utf8 = false
[[test]]
name = "perl-word-unicode"
regex = '\w+'
haystack = "aδ"
matches = [[0, 3]]
unicode = true
utf8 = false
[[test]]
name = "perl-decimal-ascii"
regex = '\d+'
haystack = "1२३9"
matches = [[0, 1], [7, 8]]
unicode = false
utf8 = false
[[test]]
name = "perl-decimal-unicode"
regex = '\d+'
haystack = "1२३9"
matches = [[0, 8]]
unicode = true
utf8 = false
[[test]]
name = "perl-whitespace-ascii"
regex = '\s+'
haystack = " \u1680"
matches = [[0, 1]]
unicode = false
utf8 = false
[[test]]
name = "perl-whitespace-unicode"
regex = '\s+'
haystack = " \u1680"
matches = [[0, 4]]
unicode = true
utf8 = false
# The first `(.+)` matches two Unicode codepoints, but can't match the 5th
# byte, which isn't valid UTF-8. The second (byte based) `(.+)` takes over and
# matches.
[[test]]
name = "mixed-dot"
regex = '(.+)(?-u)(.+)'
haystack = '\xCE\x93\xCE\x94\xFF'
matches = [
[[0, 5], [0, 4], [4, 5]],
]
unescape = true
unicode = true
utf8 = false
[[test]]
name = "case-one-ascii"
regex = 'a'
haystack = "A"
matches = [[0, 1]]
case-insensitive = true
unicode = false
utf8 = false
[[test]]
name = "case-one-unicode"
regex = 'a'
haystack = "A"
matches = [[0, 1]]
case-insensitive = true
unicode = true
utf8 = false
[[test]]
name = "case-class-simple-ascii"
regex = '[a-z]+'
haystack = "AaAaA"
matches = [[0, 5]]
case-insensitive = true
unicode = false
utf8 = false
[[test]]
name = "case-class-ascii"
regex = '[a-z]+'
haystack = "aA\u212AaA"
matches = [[0, 2], [5, 7]]
case-insensitive = true
unicode = false
utf8 = false
[[test]]
name = "case-class-unicode"
regex = '[a-z]+'
haystack = "aA\u212AaA"
matches = [[0, 7]]
case-insensitive = true
unicode = true
utf8 = false
[[test]]
name = "negate-ascii"
regex = '[^a]'
haystack = "δ"
matches = [[0, 1], [1, 2]]
unicode = false
utf8 = false
[[test]]
name = "negate-unicode"
regex = '[^a]'
haystack = "δ"
matches = [[0, 2]]
unicode = true
utf8 = false
# When utf8=true, this won't match, because the implicit '.*?' prefix is
# Unicode aware and will refuse to match through invalid UTF-8 bytes.
[[test]]
name = "dotstar-prefix-ascii"
regex = 'a'
haystack = '\xFFa'
matches = [[1, 2]]
unescape = true
unicode = false
utf8 = false
[[test]]
name = "dotstar-prefix-unicode"
regex = 'a'
haystack = '\xFFa'
matches = [[1, 2]]
unescape = true
unicode = true
utf8 = false
[[test]]
name = "null-bytes"
regex = '(?P<cstr>[^\x00]+)\x00'
haystack = 'foo\x00'
matches = [
[[0, 4], [0, 3]],
]
unescape = true
unicode = false
utf8 = false
[[test]]
name = "invalid-utf8-anchor-100"
regex = '\xCC?^'
haystack = '\x8d#;\x1a\xa4s3\x05foobarX\\\x0f0t\xe4\x9b\xa4'
matches = [[0, 0]]
unescape = true
unicode = false
utf8 = false
[[test]]
name = "invalid-utf8-anchor-200"
regex = '^\xf7|4\xff\d\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a##########[] d\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a##########\[] #####\x80\S7|$'
haystack = '\x8d#;\x1a\xa4s3\x05foobarX\\\x0f0t\xe4\x9b\xa4'
matches = [[22, 22]]
unescape = true
unicode = false
utf8 = false
[[test]]
name = "invalid-utf8-anchor-300"
regex = '^|ddp\xff\xffdddddlQd@\x80'
haystack = '\x8d#;\x1a\xa4s3\x05foobarX\\\x0f0t\xe4\x9b\xa4'
matches = [[0, 0]]
unescape = true
unicode = false
utf8 = false
[[test]]
name = "word-boundary-ascii-100"
regex = '\Bx\B'
haystack = "áxβ"
matches = []
unicode = false
utf8 = false
[[test]]
name = "word-boundary-ascii-200"
regex = '\B'
haystack = "0\U0007EF5E"
matches = [[2, 2], [3, 3], [4, 4], [5, 5]]
unicode = false
utf8 = false

315
vendor/regex/testdata/crazy.toml vendored Normal file
View File

@@ -0,0 +1,315 @@
[[test]]
name = "nothing-empty"
regex = []
haystack = ""
matches = []
[[test]]
name = "nothing-something"
regex = []
haystack = "wat"
matches = []
[[test]]
name = "ranges"
regex = '(?-u)\b(?:[0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5])\b'
haystack = "num: 255"
matches = [[5, 8]]
[[test]]
name = "ranges-not"
regex = '(?-u)\b(?:[0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5])\b'
haystack = "num: 256"
matches = []
[[test]]
name = "float1"
regex = '[-+]?[0-9]*\.?[0-9]+'
haystack = "0.1"
matches = [[0, 3]]
[[test]]
name = "float2"
regex = '[-+]?[0-9]*\.?[0-9]+'
haystack = "0.1.2"
matches = [[0, 3]]
match-limit = 1
[[test]]
name = "float3"
regex = '[-+]?[0-9]*\.?[0-9]+'
haystack = "a1.2"
matches = [[1, 4]]
[[test]]
name = "float4"
regex = '[-+]?[0-9]*\.?[0-9]+'
haystack = "1.a"
matches = [[0, 1]]
[[test]]
name = "float5"
regex = '^[-+]?[0-9]*\.?[0-9]+$'
haystack = "1.a"
matches = []
[[test]]
name = "email"
regex = '(?i-u)\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}\b'
haystack = "mine is jam.slam@gmail.com "
matches = [[8, 26]]
[[test]]
name = "email-not"
regex = '(?i-u)\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}\b'
haystack = "mine is jam.slam@gmail "
matches = []
[[test]]
name = "email-big"
regex = '''[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*@(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?'''
haystack = "mine is jam.slam@gmail.com "
matches = [[8, 26]]
[[test]]
name = "date1"
regex = '^(?:19|20)\d\d[- /.](?:0[1-9]|1[012])[- /.](?:0[1-9]|[12][0-9]|3[01])$'
haystack = "1900-01-01"
matches = [[0, 10]]
unicode = false
[[test]]
name = "date2"
regex = '^(?:19|20)\d\d[- /.](?:0[1-9]|1[012])[- /.](?:0[1-9]|[12][0-9]|3[01])$'
haystack = "1900-00-01"
matches = []
unicode = false
[[test]]
name = "date3"
regex = '^(?:19|20)\d\d[- /.](?:0[1-9]|1[012])[- /.](?:0[1-9]|[12][0-9]|3[01])$'
haystack = "1900-13-01"
matches = []
unicode = false
[[test]]
name = "start-end-empty"
regex = '^$'
haystack = ""
matches = [[0, 0]]
[[test]]
name = "start-end-empty-rev"
regex = '$^'
haystack = ""
matches = [[0, 0]]
[[test]]
name = "start-end-empty-many-1"
regex = '^$^$^$'
haystack = ""
matches = [[0, 0]]
[[test]]
name = "start-end-empty-many-2"
regex = '^^^$$$'
haystack = ""
matches = [[0, 0]]
[[test]]
name = "start-end-empty-rep"
regex = '(?:^$)*'
haystack = "a\nb\nc"
matches = [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4], [5, 5]]
[[test]]
name = "start-end-empty-rep-rev"
regex = '(?:$^)*'
haystack = "a\nb\nc"
matches = [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4], [5, 5]]
[[test]]
name = "neg-class-letter"
regex = '[^ac]'
haystack = "acx"
matches = [[2, 3]]
[[test]]
name = "neg-class-letter-comma"
regex = '[^a,]'
haystack = "a,x"
matches = [[2, 3]]
[[test]]
name = "neg-class-letter-space"
regex = '[^a[:space:]]'
haystack = "a x"
matches = [[2, 3]]
[[test]]
name = "neg-class-comma"
regex = '[^,]'
haystack = ",,x"
matches = [[2, 3]]
[[test]]
name = "neg-class-space"
regex = '[^[:space:]]'
haystack = " a"
matches = [[1, 2]]
[[test]]
name = "neg-class-space-comma"
regex = '[^,[:space:]]'
haystack = ", a"
matches = [[2, 3]]
[[test]]
name = "neg-class-comma-space"
regex = '[^[:space:],]'
haystack = " ,a"
matches = [[2, 3]]
[[test]]
name = "neg-class-ascii"
regex = '[^[:alpha:]Z]'
haystack = "A1"
matches = [[1, 2]]
[[test]]
name = "lazy-many-many"
regex = '(?:(?:.*)*?)='
haystack = "a=b"
matches = [[0, 2]]
[[test]]
name = "lazy-many-optional"
regex = '(?:(?:.?)*?)='
haystack = "a=b"
matches = [[0, 2]]
[[test]]
name = "lazy-one-many-many"
regex = '(?:(?:.*)+?)='
haystack = "a=b"
matches = [[0, 2]]
[[test]]
name = "lazy-one-many-optional"
regex = '(?:(?:.?)+?)='
haystack = "a=b"
matches = [[0, 2]]
[[test]]
name = "lazy-range-min-many"
regex = '(?:(?:.*){1,}?)='
haystack = "a=b"
matches = [[0, 2]]
[[test]]
name = "lazy-range-many"
regex = '(?:(?:.*){1,2}?)='
haystack = "a=b"
matches = [[0, 2]]
[[test]]
name = "greedy-many-many"
regex = '(?:(?:.*)*)='
haystack = "a=b"
matches = [[0, 2]]
[[test]]
name = "greedy-many-optional"
regex = '(?:(?:.?)*)='
haystack = "a=b"
matches = [[0, 2]]
[[test]]
name = "greedy-one-many-many"
regex = '(?:(?:.*)+)='
haystack = "a=b"
matches = [[0, 2]]
[[test]]
name = "greedy-one-many-optional"
regex = '(?:(?:.?)+)='
haystack = "a=b"
matches = [[0, 2]]
[[test]]
name = "greedy-range-min-many"
regex = '(?:(?:.*){1,})='
haystack = "a=b"
matches = [[0, 2]]
[[test]]
name = "greedy-range-many"
regex = '(?:(?:.*){1,2})='
haystack = "a=b"
matches = [[0, 2]]
[[test]]
name = "empty1"
regex = ''
haystack = ""
matches = [[0, 0]]
[[test]]
name = "empty2"
regex = ''
haystack = "abc"
matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
[[test]]
name = "empty3"
regex = '(?:)'
haystack = "abc"
matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
[[test]]
name = "empty4"
regex = '(?:)*'
haystack = "abc"
matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
[[test]]
name = "empty5"
regex = '(?:)+'
haystack = "abc"
matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
[[test]]
name = "empty6"
regex = '(?:)?'
haystack = "abc"
matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
[[test]]
name = "empty7"
regex = '(?:)(?:)'
haystack = "abc"
matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
[[test]]
name = "empty8"
regex = '(?:)+|z'
haystack = "abc"
matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
[[test]]
name = "empty9"
regex = 'z|(?:)+'
haystack = "abc"
matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
[[test]]
name = "empty10"
regex = '(?:)+|b'
haystack = "abc"
matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
[[test]]
name = "empty11"
regex = 'b|(?:)+'
haystack = "abc"
matches = [[0, 0], [1, 2], [3, 3]]

117
vendor/regex/testdata/crlf.toml vendored Normal file
View File

@@ -0,0 +1,117 @@
# This is a basic test that checks ^ and $ treat \r\n as a single line
# terminator. If ^ and $ only treated \n as a line terminator, then this would
# only match 'xyz' at the end of the haystack.
[[test]]
name = "basic"
regex = '(?mR)^[a-z]+$'
haystack = "abc\r\ndef\r\nxyz"
matches = [[0, 3], [5, 8], [10, 13]]
# Tests that a CRLF-aware '^$' assertion does not match between CR and LF.
[[test]]
name = "start-end-non-empty"
regex = '(?mR)^$'
haystack = "abc\r\ndef\r\nxyz"
matches = []
# Tests that a CRLF-aware '^$' assertion matches the empty string, just like
# a non-CRLF-aware '^$' assertion.
[[test]]
name = "start-end-empty"
regex = '(?mR)^$'
haystack = ""
matches = [[0, 0]]
# Tests that a CRLF-aware '^$' assertion matches the empty string preceding
# and following a line terminator.
[[test]]
name = "start-end-before-after"
regex = '(?mR)^$'
haystack = "\r\n"
matches = [[0, 0], [2, 2]]
# Tests that a CRLF-aware '^' assertion does not split a line terminator.
[[test]]
name = "start-no-split"
regex = '(?mR)^'
haystack = "abc\r\ndef\r\nxyz"
matches = [[0, 0], [5, 5], [10, 10]]
# Same as above, but with adjacent runs of line terminators.
[[test]]
name = "start-no-split-adjacent"
regex = '(?mR)^'
haystack = "\r\n\r\n\r\n"
matches = [[0, 0], [2, 2], [4, 4], [6, 6]]
# Same as above, but with adjacent runs of just carriage returns.
[[test]]
name = "start-no-split-adjacent-cr"
regex = '(?mR)^'
haystack = "\r\r\r"
matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
# Same as above, but with adjacent runs of just line feeds.
[[test]]
name = "start-no-split-adjacent-lf"
regex = '(?mR)^'
haystack = "\n\n\n"
matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
# Tests that a CRLF-aware '$' assertion does not split a line terminator.
[[test]]
name = "end-no-split"
regex = '(?mR)$'
haystack = "abc\r\ndef\r\nxyz"
matches = [[3, 3], [8, 8], [13, 13]]
# Same as above, but with adjacent runs of line terminators.
[[test]]
name = "end-no-split-adjacent"
regex = '(?mR)$'
haystack = "\r\n\r\n\r\n"
matches = [[0, 0], [2, 2], [4, 4], [6, 6]]
# Same as above, but with adjacent runs of just carriage returns.
[[test]]
name = "end-no-split-adjacent-cr"
regex = '(?mR)$'
haystack = "\r\r\r"
matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
# Same as above, but with adjacent runs of just line feeds.
[[test]]
name = "end-no-split-adjacent-lf"
regex = '(?mR)$'
haystack = "\n\n\n"
matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
# Tests that '.' does not match either \r or \n when CRLF mode is enabled. Note
# that this doesn't require multi-line mode to be enabled.
[[test]]
name = "dot-no-crlf"
regex = '(?R).'
haystack = "\r\n\r\n\r\n"
matches = []
# This is a test that caught a bug in the one-pass DFA where it (amazingly) was
# using 'is_end_lf' instead of 'is_end_crlf' here. It was probably a copy &
# paste bug. We insert an empty capture group here because it provokes the meta
# regex engine to first find a match and then trip over a panic because the
# one-pass DFA erroneously says there is no match.
[[test]]
name = "onepass-wrong-crlf-with-capture"
regex = '(?Rm:().$)'
haystack = "ZZ\r"
matches = [[[1, 2], [1, 1]]]
# This is like onepass-wrong-crlf-with-capture above, except it sets up the
# test so that it can be run by the one-pass DFA directly. (i.e., Make it
# anchored and start the search at the right place.)
[[test]]
name = "onepass-wrong-crlf-anchored"
regex = '(?Rm:.$)'
haystack = "ZZ\r"
matches = [[1, 2]]
anchored = true
bounds = [1, 3]

52
vendor/regex/testdata/earliest.toml vendored Normal file
View File

@@ -0,0 +1,52 @@
[[test]]
name = "no-greedy-100"
regex = 'a+'
haystack = "aaa"
matches = [[0, 1], [1, 2], [2, 3]]
search-kind = "earliest"
[[test]]
name = "no-greedy-200"
regex = 'abc+'
haystack = "zzzabccc"
matches = [[3, 6]]
search-kind = "earliest"
[[test]]
name = "is-ungreedy"
regex = 'a+?'
haystack = "aaa"
matches = [[0, 1], [1, 2], [2, 3]]
search-kind = "earliest"
[[test]]
name = "look-start-test"
regex = '^(abc|a)'
haystack = "abc"
matches = [
[[0, 1], [0, 1]],
]
search-kind = "earliest"
[[test]]
name = "look-end-test"
regex = '(abc|a)$'
haystack = "abc"
matches = [
[[0, 3], [0, 3]],
]
search-kind = "earliest"
[[test]]
name = "no-leftmost-first-100"
regex = 'abc|a'
haystack = "abc"
matches = [[0, 1]]
search-kind = "earliest"
[[test]]
name = "no-leftmost-first-200"
regex = 'aba|a'
haystack = "aba"
matches = [[0, 1], [2, 3]]
search-kind = "earliest"

113
vendor/regex/testdata/empty.toml vendored Normal file
View File

@@ -0,0 +1,113 @@
[[test]]
name = "100"
regex = "|b"
haystack = "abc"
matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
[[test]]
name = "110"
regex = "b|"
haystack = "abc"
matches = [[0, 0], [1, 2], [3, 3]]
[[test]]
name = "120"
regex = "|z"
haystack = "abc"
matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
[[test]]
name = "130"
regex = "z|"
haystack = "abc"
matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
[[test]]
name = "200"
regex = "|"
haystack = "abc"
matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
[[test]]
name = "210"
regex = "||"
haystack = "abc"
matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
[[test]]
name = "220"
regex = "||b"
haystack = "abc"
matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
[[test]]
name = "230"
regex = "b||"
haystack = "abc"
matches = [[0, 0], [1, 2], [3, 3]]
[[test]]
name = "240"
regex = "||z"
haystack = "abc"
matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
[[test]]
name = "300"
regex = "(?:)|b"
haystack = "abc"
matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
[[test]]
name = "310"
regex = "b|(?:)"
haystack = "abc"
matches = [[0, 0], [1, 2], [3, 3]]
[[test]]
name = "320"
regex = "(?:|)"
haystack = "abc"
matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
[[test]]
name = "330"
regex = "(?:|)|z"
haystack = "abc"
matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
[[test]]
name = "400"
regex = "a(?:)|b"
haystack = "abc"
matches = [[0, 1], [1, 2]]
[[test]]
name = "500"
regex = ""
haystack = ""
matches = [[0, 0]]
[[test]]
name = "510"
regex = ""
haystack = "a"
matches = [[0, 0], [1, 1]]
[[test]]
name = "520"
regex = ""
haystack = "abc"
matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
[[test]]
name = "600"
regex = '(?:|a)*'
haystack = "aaa"
matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
[[test]]
name = "610"
regex = '(?:|a)+'
haystack = "aaa"
matches = [[0, 0], [1, 1], [2, 2], [3, 3]]

23
vendor/regex/testdata/expensive.toml vendored Normal file
View File

@@ -0,0 +1,23 @@
# This file represents tests that may be expensive to run on some regex engines.
# For example, tests that build a full DFA ahead of time and minimize it can
# take a horrendously long time on regexes that are large (or result in an
# explosion in the number of states). We group these tests together so that
# such engines can simply skip these tests.
# See: https://github.com/rust-lang/regex/issues/98
[[test]]
name = "regression-many-repeat-no-stack-overflow"
regex = '^.{1,2500}'
haystack = "a"
matches = [[0, 1]]
# This test is meant to blow the bounded backtracker's visited capacity. In
# order to do that, we need a somewhat sizeable regex. The purpose of this
# is to make sure there's at least one test that exercises this path in the
# backtracker. All other tests (at time of writing) are small enough that the
# backtracker can handle them fine.
[[test]]
name = "backtrack-blow-visited-capacity"
regex = '\pL{50}'
haystack = "abcdefghijklmnopqrstuvwxyabcdefghijklmnopqrstuvwxyabcdefghijklmnopqrstuvwxyabcdefghijklmnopqrstuvwxyabcdefghijklmnopqrstuvwxyabcdefghijklmnopqrstuvwxyZZ"
matches = [[0, 50], [50, 100], [100, 150]]

68
vendor/regex/testdata/flags.toml vendored Normal file
View File

@@ -0,0 +1,68 @@
[[test]]
name = "1"
regex = "(?i)abc"
haystack = "ABC"
matches = [[0, 3]]
[[test]]
name = "2"
regex = "(?i)a(?-i)bc"
haystack = "Abc"
matches = [[0, 3]]
[[test]]
name = "3"
regex = "(?i)a(?-i)bc"
haystack = "ABC"
matches = []
[[test]]
name = "4"
regex = "(?is)a."
haystack = "A\n"
matches = [[0, 2]]
[[test]]
name = "5"
regex = "(?is)a.(?-is)a."
haystack = "A\nab"
matches = [[0, 4]]
[[test]]
name = "6"
regex = "(?is)a.(?-is)a."
haystack = "A\na\n"
matches = []
[[test]]
name = "7"
regex = "(?is)a.(?-is:a.)?"
haystack = "A\na\n"
matches = [[0, 2]]
match-limit = 1
[[test]]
name = "8"
regex = "(?U)a+"
haystack = "aa"
matches = [[0, 1]]
match-limit = 1
[[test]]
name = "9"
regex = "(?U)a+?"
haystack = "aa"
matches = [[0, 2]]
[[test]]
name = "10"
regex = "(?U)(?-U)a+"
haystack = "aa"
matches = [[0, 2]]
[[test]]
name = "11"
regex = '(?m)(?:^\d+$\n?)+'
haystack = "123\n456\n789"
matches = [[0, 11]]
unicode = false

143
vendor/regex/testdata/iter.toml vendored Normal file
View File

@@ -0,0 +1,143 @@
[[test]]
name = "1"
regex = "a"
haystack = "aaa"
matches = [[0, 1], [1, 2], [2, 3]]
[[test]]
name = "2"
regex = "a"
haystack = "aba"
matches = [[0, 1], [2, 3]]
[[test]]
name = "empty1"
regex = ''
haystack = ''
matches = [[0, 0]]
[[test]]
name = "empty2"
regex = ''
haystack = 'abc'
matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
[[test]]
name = "empty3"
regex = '(?:)'
haystack = 'abc'
matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
[[test]]
name = "empty4"
regex = '(?:)*'
haystack = 'abc'
matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
[[test]]
name = "empty5"
regex = '(?:)+'
haystack = 'abc'
matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
[[test]]
name = "empty6"
regex = '(?:)?'
haystack = 'abc'
matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
[[test]]
name = "empty7"
regex = '(?:)(?:)'
haystack = 'abc'
matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
[[test]]
name = "empty8"
regex = '(?:)+|z'
haystack = 'abc'
matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
[[test]]
name = "empty9"
regex = 'z|(?:)+'
haystack = 'abc'
matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
[[test]]
name = "empty10"
regex = '(?:)+|b'
haystack = 'abc'
matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
[[test]]
name = "empty11"
regex = 'b|(?:)+'
haystack = 'abc'
matches = [[0, 0], [1, 2], [3, 3]]
[[test]]
name = "start1"
regex = "^a"
haystack = "a"
matches = [[0, 1]]
[[test]]
name = "start2"
regex = "^a"
haystack = "aa"
matches = [[0, 1]]
[[test]]
name = "anchored1"
regex = "a"
haystack = "a"
matches = [[0, 1]]
anchored = true
# This test is pretty subtle. It demonstrates the crucial difference between
# '^a' and 'a' compiled in 'anchored' mode. The former regex exclusively
# matches at the start of a haystack and nowhere else. The latter regex has
# no such restriction, but its automaton is constructed such that it lacks a
# `.*?` prefix. So it can actually produce matches at multiple locations.
# The anchored3 test drives this point home.
[[test]]
name = "anchored2"
regex = "a"
haystack = "aa"
matches = [[0, 1], [1, 2]]
anchored = true
# Unlike anchored2, this test stops matching anything after it sees `b`
# since it lacks a `.*?` prefix. Since it is looking for 'a' but sees 'b', it
# determines that there are no remaining matches.
[[test]]
name = "anchored3"
regex = "a"
haystack = "aaba"
matches = [[0, 1], [1, 2]]
anchored = true
[[test]]
name = "nonempty-followedby-empty"
regex = 'abc|.*?'
haystack = "abczzz"
matches = [[0, 3], [4, 4], [5, 5], [6, 6]]
[[test]]
name = "nonempty-followedby-oneempty"
regex = 'abc|.*?'
haystack = "abcz"
matches = [[0, 3], [4, 4]]
[[test]]
name = "nonempty-followedby-onemixed"
regex = 'abc|.*?'
haystack = "abczabc"
matches = [[0, 3], [4, 7]]
[[test]]
name = "nonempty-followedby-twomixed"
regex = 'abc|.*?'
haystack = "abczzabc"
matches = [[0, 3], [4, 4], [5, 8]]

25
vendor/regex/testdata/leftmost-all.toml vendored Normal file
View File

@@ -0,0 +1,25 @@
[[test]]
name = "alt"
regex = 'foo|foobar'
haystack = "foobar"
matches = [[0, 6]]
match-kind = "all"
search-kind = "leftmost"
[[test]]
name = "multi"
regex = ['foo', 'foobar']
haystack = "foobar"
matches = [
{ id = 1, span = [0, 6] },
]
match-kind = "all"
search-kind = "leftmost"
[[test]]
name = "dotall"
regex = '(?s:.)'
haystack = "foobar"
matches = [[5, 6]]
match-kind = "all"
search-kind = "leftmost"

View File

@@ -0,0 +1,109 @@
# This tests that we can switch the line terminator to the NUL byte.
[[test]]
name = "nul"
regex = '(?m)^[a-z]+$'
haystack = '\x00abc\x00'
matches = [[1, 4]]
unescape = true
line-terminator = '\x00'
# This tests that '.' will not match the configured line terminator, but will
# match \n.
[[test]]
name = "dot-changes-with-line-terminator"
regex = '.'
haystack = '\x00\n'
matches = [[1, 2]]
unescape = true
line-terminator = '\x00'
# This tests that when we switch the line terminator, \n is no longer
# recognized as the terminator.
[[test]]
name = "not-line-feed"
regex = '(?m)^[a-z]+$'
haystack = '\nabc\n'
matches = []
unescape = true
line-terminator = '\x00'
# This tests that we can set the line terminator to a non-ASCII byte and have
# it behave as expected.
[[test]]
name = "non-ascii"
regex = '(?m)^[a-z]+$'
haystack = '\xFFabc\xFF'
matches = [[1, 4]]
unescape = true
line-terminator = '\xFF'
utf8 = false
# This tests a tricky case where the line terminator is set to \r. This ensures
# that the StartLF look-behind assertion is tracked when computing the start
# state.
[[test]]
name = "carriage"
regex = '(?m)^[a-z]+'
haystack = 'ABC\rabc'
matches = [[4, 7]]
bounds = [4, 7]
unescape = true
line-terminator = '\r'
# This tests that we can set the line terminator to a byte corresponding to a
# word character, and things work as expected.
[[test]]
name = "word-byte"
regex = '(?m)^[a-z]+$'
haystack = 'ZabcZ'
matches = [[1, 4]]
unescape = true
line-terminator = 'Z'
# This tests that we can set the line terminator to a byte corresponding to a
# non-word character, and things work as expected.
[[test]]
name = "non-word-byte"
regex = '(?m)^[a-z]+$'
haystack = '%abc%'
matches = [[1, 4]]
unescape = true
line-terminator = '%'
# This combines "set line terminator to a word byte" with a word boundary
# assertion, which should result in no match even though ^/$ matches.
[[test]]
name = "word-boundary"
regex = '(?m)^\b[a-z]+\b$'
haystack = 'ZabcZ'
matches = []
unescape = true
line-terminator = 'Z'
# Like 'word-boundary', but does an anchored search at the point where ^
# matches, but where \b should not.
[[test]]
name = "word-boundary-at"
regex = '(?m)^\b[a-z]+\b$'
haystack = 'ZabcZ'
matches = []
bounds = [1, 4]
anchored = true
unescape = true
line-terminator = 'Z'
# Like 'word-boundary-at', but flips the word boundary to a negation. This
# in particular tests a tricky case in DFA engines, where they must consider
# explicitly that a starting configuration from a custom line terminator may
# also require setting the "is from word byte" flag on a state. Otherwise,
# it's treated as "not from a word byte," which would result in \B not matching
# here when it should.
[[test]]
name = "not-word-boundary-at"
regex = '(?m)^\B[a-z]+\B$'
haystack = 'ZabcZ'
matches = [[1, 4]]
bounds = [1, 4]
anchored = true
unescape = true
line-terminator = 'Z'

99
vendor/regex/testdata/misc.toml vendored Normal file
View File

@@ -0,0 +1,99 @@
[[test]]
name = "ascii-literal"
regex = "a"
haystack = "a"
matches = [[0, 1]]
[[test]]
name = "ascii-literal-not"
regex = "a"
haystack = "z"
matches = []
[[test]]
name = "ascii-literal-anchored"
regex = "a"
haystack = "a"
matches = [[0, 1]]
anchored = true
[[test]]
name = "ascii-literal-anchored-not"
regex = "a"
haystack = "z"
matches = []
anchored = true
[[test]]
name = "anchor-start-end-line"
regex = '(?m)^bar$'
haystack = "foo\nbar\nbaz"
matches = [[4, 7]]
[[test]]
name = "prefix-literal-match"
regex = '^abc'
haystack = "abc"
matches = [[0, 3]]
[[test]]
name = "prefix-literal-match-ascii"
regex = '^abc'
haystack = "abc"
matches = [[0, 3]]
unicode = false
utf8 = false
[[test]]
name = "prefix-literal-no-match"
regex = '^abc'
haystack = "zabc"
matches = []
[[test]]
name = "one-literal-edge"
regex = 'abc'
haystack = "xxxxxab"
matches = []
[[test]]
name = "terminates"
regex = 'a$'
haystack = "a"
matches = [[0, 1]]
[[test]]
name = "suffix-100"
regex = '.*abcd'
haystack = "abcd"
matches = [[0, 4]]
[[test]]
name = "suffix-200"
regex = '.*(?:abcd)+'
haystack = "abcd"
matches = [[0, 4]]
[[test]]
name = "suffix-300"
regex = '.*(?:abcd)+'
haystack = "abcdabcd"
matches = [[0, 8]]
[[test]]
name = "suffix-400"
regex = '.*(?:abcd)+'
haystack = "abcdxabcd"
matches = [[0, 9]]
[[test]]
name = "suffix-500"
regex = '.*x(?:abcd)+'
haystack = "abcdxabcd"
matches = [[0, 9]]
[[test]]
name = "suffix-600"
regex = '[^abcd]*x(?:abcd)+'
haystack = "abcdxabcd"
matches = [[4, 9]]

845
vendor/regex/testdata/multiline.toml vendored Normal file
View File

@@ -0,0 +1,845 @@
[[test]]
name = "basic1"
regex = '(?m)^[a-z]+$'
haystack = "abc\ndef\nxyz"
matches = [[0, 3], [4, 7], [8, 11]]
[[test]]
name = "basic1-crlf"
regex = '(?Rm)^[a-z]+$'
haystack = "abc\ndef\nxyz"
matches = [[0, 3], [4, 7], [8, 11]]
[[test]]
name = "basic1-crlf-cr"
regex = '(?Rm)^[a-z]+$'
haystack = "abc\rdef\rxyz"
matches = [[0, 3], [4, 7], [8, 11]]
[[test]]
name = "basic2"
regex = '(?m)^$'
haystack = "abc\ndef\nxyz"
matches = []
[[test]]
name = "basic2-crlf"
regex = '(?Rm)^$'
haystack = "abc\ndef\nxyz"
matches = []
[[test]]
name = "basic2-crlf-cr"
regex = '(?Rm)^$'
haystack = "abc\rdef\rxyz"
matches = []
[[test]]
name = "basic3"
regex = '(?m)^'
haystack = "abc\ndef\nxyz"
matches = [[0, 0], [4, 4], [8, 8]]
[[test]]
name = "basic3-crlf"
regex = '(?Rm)^'
haystack = "abc\ndef\nxyz"
matches = [[0, 0], [4, 4], [8, 8]]
[[test]]
name = "basic3-crlf-cr"
regex = '(?Rm)^'
haystack = "abc\rdef\rxyz"
matches = [[0, 0], [4, 4], [8, 8]]
[[test]]
name = "basic4"
regex = '(?m)$'
haystack = "abc\ndef\nxyz"
matches = [[3, 3], [7, 7], [11, 11]]
[[test]]
name = "basic4-crlf"
regex = '(?Rm)$'
haystack = "abc\ndef\nxyz"
matches = [[3, 3], [7, 7], [11, 11]]
[[test]]
name = "basic4-crlf-cr"
regex = '(?Rm)$'
haystack = "abc\rdef\rxyz"
matches = [[3, 3], [7, 7], [11, 11]]
[[test]]
name = "basic5"
regex = '(?m)^[a-z]'
haystack = "abc\ndef\nxyz"
matches = [[0, 1], [4, 5], [8, 9]]
[[test]]
name = "basic5-crlf"
regex = '(?Rm)^[a-z]'
haystack = "abc\ndef\nxyz"
matches = [[0, 1], [4, 5], [8, 9]]
[[test]]
name = "basic5-crlf-cr"
regex = '(?Rm)^[a-z]'
haystack = "abc\rdef\rxyz"
matches = [[0, 1], [4, 5], [8, 9]]
[[test]]
name = "basic6"
regex = '(?m)[a-z]^'
haystack = "abc\ndef\nxyz"
matches = []
[[test]]
name = "basic6-crlf"
regex = '(?Rm)[a-z]^'
haystack = "abc\ndef\nxyz"
matches = []
[[test]]
name = "basic6-crlf-cr"
regex = '(?Rm)[a-z]^'
haystack = "abc\rdef\rxyz"
matches = []
[[test]]
name = "basic7"
regex = '(?m)[a-z]$'
haystack = "abc\ndef\nxyz"
matches = [[2, 3], [6, 7], [10, 11]]
[[test]]
name = "basic7-crlf"
regex = '(?Rm)[a-z]$'
haystack = "abc\ndef\nxyz"
matches = [[2, 3], [6, 7], [10, 11]]
[[test]]
name = "basic7-crlf-cr"
regex = '(?Rm)[a-z]$'
haystack = "abc\rdef\rxyz"
matches = [[2, 3], [6, 7], [10, 11]]
[[test]]
name = "basic8"
regex = '(?m)$[a-z]'
haystack = "abc\ndef\nxyz"
matches = []
[[test]]
name = "basic8-crlf"
regex = '(?Rm)$[a-z]'
haystack = "abc\ndef\nxyz"
matches = []
[[test]]
name = "basic8-crlf-cr"
regex = '(?Rm)$[a-z]'
haystack = "abc\rdef\rxyz"
matches = []
[[test]]
name = "basic9"
regex = '(?m)^$'
haystack = ""
matches = [[0, 0]]
[[test]]
name = "basic9-crlf"
regex = '(?Rm)^$'
haystack = ""
matches = [[0, 0]]
[[test]]
name = "repeat1"
regex = '(?m)(?:^$)*'
haystack = "a\nb\nc"
matches = [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4], [5, 5]]
[[test]]
name = "repeat1-crlf"
regex = '(?Rm)(?:^$)*'
haystack = "a\nb\nc"
matches = [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4], [5, 5]]
[[test]]
name = "repeat1-crlf-cr"
regex = '(?Rm)(?:^$)*'
haystack = "a\rb\rc"
matches = [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4], [5, 5]]
[[test]]
name = "repeat1-no-multi"
regex = '(?:^$)*'
haystack = "a\nb\nc"
matches = [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4], [5, 5]]
[[test]]
name = "repeat1-no-multi-crlf"
regex = '(?R)(?:^$)*'
haystack = "a\nb\nc"
matches = [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4], [5, 5]]
[[test]]
name = "repeat1-no-multi-crlf-cr"
regex = '(?R)(?:^$)*'
haystack = "a\rb\rc"
matches = [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4], [5, 5]]
[[test]]
name = "repeat2"
regex = '(?m)(?:^|a)+'
haystack = "a\naaa\n"
matches = [[0, 0], [2, 2], [3, 5], [6, 6]]
[[test]]
name = "repeat2-crlf"
regex = '(?Rm)(?:^|a)+'
haystack = "a\naaa\n"
matches = [[0, 0], [2, 2], [3, 5], [6, 6]]
[[test]]
name = "repeat2-crlf-cr"
regex = '(?Rm)(?:^|a)+'
haystack = "a\raaa\r"
matches = [[0, 0], [2, 2], [3, 5], [6, 6]]
[[test]]
name = "repeat2-no-multi"
regex = '(?:^|a)+'
haystack = "a\naaa\n"
matches = [[0, 0], [2, 5]]
[[test]]
name = "repeat2-no-multi-crlf"
regex = '(?R)(?:^|a)+'
haystack = "a\naaa\n"
matches = [[0, 0], [2, 5]]
[[test]]
name = "repeat2-no-multi-crlf-cr"
regex = '(?R)(?:^|a)+'
haystack = "a\raaa\r"
matches = [[0, 0], [2, 5]]
[[test]]
name = "repeat3"
regex = '(?m)(?:^|a)*'
haystack = "a\naaa\n"
matches = [[0, 0], [1, 1], [2, 2], [3, 5], [6, 6]]
[[test]]
name = "repeat3-crlf"
regex = '(?Rm)(?:^|a)*'
haystack = "a\naaa\n"
matches = [[0, 0], [1, 1], [2, 2], [3, 5], [6, 6]]
[[test]]
name = "repeat3-crlf-cr"
regex = '(?Rm)(?:^|a)*'
haystack = "a\raaa\r"
matches = [[0, 0], [1, 1], [2, 2], [3, 5], [6, 6]]
[[test]]
name = "repeat3-no-multi"
regex = '(?:^|a)*'
haystack = "a\naaa\n"
matches = [[0, 0], [1, 1], [2, 5], [6, 6]]
[[test]]
name = "repeat3-no-multi-crlf"
regex = '(?R)(?:^|a)*'
haystack = "a\naaa\n"
matches = [[0, 0], [1, 1], [2, 5], [6, 6]]
[[test]]
name = "repeat3-no-multi-crlf-cr"
regex = '(?R)(?:^|a)*'
haystack = "a\raaa\r"
matches = [[0, 0], [1, 1], [2, 5], [6, 6]]
[[test]]
name = "repeat4"
regex = '(?m)(?:^|a+)'
haystack = "a\naaa\n"
matches = [[0, 0], [2, 2], [3, 5], [6, 6]]
[[test]]
name = "repeat4-crlf"
regex = '(?Rm)(?:^|a+)'
haystack = "a\naaa\n"
matches = [[0, 0], [2, 2], [3, 5], [6, 6]]
[[test]]
name = "repeat4-crlf-cr"
regex = '(?Rm)(?:^|a+)'
haystack = "a\raaa\r"
matches = [[0, 0], [2, 2], [3, 5], [6, 6]]
[[test]]
name = "repeat4-no-multi"
regex = '(?:^|a+)'
haystack = "a\naaa\n"
matches = [[0, 0], [2, 5]]
[[test]]
name = "repeat4-no-multi-crlf"
regex = '(?R)(?:^|a+)'
haystack = "a\naaa\n"
matches = [[0, 0], [2, 5]]
[[test]]
name = "repeat4-no-multi-crlf-cr"
regex = '(?R)(?:^|a+)'
haystack = "a\raaa\r"
matches = [[0, 0], [2, 5]]
[[test]]
name = "repeat5"
regex = '(?m)(?:^|a*)'
haystack = "a\naaa\n"
matches = [[0, 0], [1, 1], [2, 2], [3, 5], [6, 6]]
[[test]]
name = "repeat5-crlf"
regex = '(?Rm)(?:^|a*)'
haystack = "a\naaa\n"
matches = [[0, 0], [1, 1], [2, 2], [3, 5], [6, 6]]
[[test]]
name = "repeat5-crlf-cr"
regex = '(?Rm)(?:^|a*)'
haystack = "a\raaa\r"
matches = [[0, 0], [1, 1], [2, 2], [3, 5], [6, 6]]
[[test]]
name = "repeat5-no-multi"
regex = '(?:^|a*)'
haystack = "a\naaa\n"
matches = [[0, 0], [1, 1], [2, 5], [6, 6]]
[[test]]
name = "repeat5-no-multi-crlf"
regex = '(?R)(?:^|a*)'
haystack = "a\naaa\n"
matches = [[0, 0], [1, 1], [2, 5], [6, 6]]
[[test]]
name = "repeat5-no-multi-crlf-cr"
regex = '(?R)(?:^|a*)'
haystack = "a\raaa\r"
matches = [[0, 0], [1, 1], [2, 5], [6, 6]]
[[test]]
name = "repeat6"
regex = '(?m)(?:^[a-z])+'
haystack = "abc\ndef\nxyz"
matches = [[0, 1], [4, 5], [8, 9]]
[[test]]
name = "repeat6-crlf"
regex = '(?Rm)(?:^[a-z])+'
haystack = "abc\ndef\nxyz"
matches = [[0, 1], [4, 5], [8, 9]]
[[test]]
name = "repeat6-crlf-cr"
regex = '(?Rm)(?:^[a-z])+'
haystack = "abc\rdef\rxyz"
matches = [[0, 1], [4, 5], [8, 9]]
[[test]]
name = "repeat6-no-multi"
regex = '(?:^[a-z])+'
haystack = "abc\ndef\nxyz"
matches = [[0, 1]]
[[test]]
name = "repeat6-no-multi-crlf"
regex = '(?R)(?:^[a-z])+'
haystack = "abc\ndef\nxyz"
matches = [[0, 1]]
[[test]]
name = "repeat6-no-multi-crlf-cr"
regex = '(?R)(?:^[a-z])+'
haystack = "abc\rdef\rxyz"
matches = [[0, 1]]
[[test]]
name = "repeat7"
regex = '(?m)(?:^[a-z]{3}\n?)+'
haystack = "abc\ndef\nxyz"
matches = [[0, 11]]
[[test]]
name = "repeat7-crlf"
regex = '(?Rm)(?:^[a-z]{3}\n?)+'
haystack = "abc\ndef\nxyz"
matches = [[0, 11]]
[[test]]
name = "repeat7-crlf-cr"
regex = '(?Rm)(?:^[a-z]{3}\r?)+'
haystack = "abc\rdef\rxyz"
matches = [[0, 11]]
[[test]]
name = "repeat7-no-multi"
regex = '(?:^[a-z]{3}\n?)+'
haystack = "abc\ndef\nxyz"
matches = [[0, 4]]
[[test]]
name = "repeat7-no-multi-crlf"
regex = '(?R)(?:^[a-z]{3}\n?)+'
haystack = "abc\ndef\nxyz"
matches = [[0, 4]]
[[test]]
name = "repeat7-no-multi-crlf-cr"
regex = '(?R)(?:^[a-z]{3}\r?)+'
haystack = "abc\rdef\rxyz"
matches = [[0, 4]]
[[test]]
name = "repeat8"
regex = '(?m)(?:^[a-z]{3}\n?)*'
haystack = "abc\ndef\nxyz"
matches = [[0, 11]]
[[test]]
name = "repeat8-crlf"
regex = '(?Rm)(?:^[a-z]{3}\n?)*'
haystack = "abc\ndef\nxyz"
matches = [[0, 11]]
[[test]]
name = "repeat8-crlf-cr"
regex = '(?Rm)(?:^[a-z]{3}\r?)*'
haystack = "abc\rdef\rxyz"
matches = [[0, 11]]
[[test]]
name = "repeat8-no-multi"
regex = '(?:^[a-z]{3}\n?)*'
haystack = "abc\ndef\nxyz"
matches = [[0, 4], [5, 5], [6, 6], [7, 7], [8, 8], [9, 9], [10, 10], [11, 11]]
[[test]]
name = "repeat8-no-multi-crlf"
regex = '(?R)(?:^[a-z]{3}\n?)*'
haystack = "abc\ndef\nxyz"
matches = [[0, 4], [5, 5], [6, 6], [7, 7], [8, 8], [9, 9], [10, 10], [11, 11]]
[[test]]
name = "repeat8-no-multi-crlf-cr"
regex = '(?R)(?:^[a-z]{3}\r?)*'
haystack = "abc\rdef\rxyz"
matches = [[0, 4], [5, 5], [6, 6], [7, 7], [8, 8], [9, 9], [10, 10], [11, 11]]
[[test]]
name = "repeat9"
regex = '(?m)(?:\n?[a-z]{3}$)+'
haystack = "abc\ndef\nxyz"
matches = [[0, 11]]
[[test]]
name = "repeat9-crlf"
regex = '(?Rm)(?:\n?[a-z]{3}$)+'
haystack = "abc\ndef\nxyz"
matches = [[0, 11]]
[[test]]
name = "repeat9-crlf-cr"
regex = '(?Rm)(?:\r?[a-z]{3}$)+'
haystack = "abc\rdef\rxyz"
matches = [[0, 11]]
[[test]]
name = "repeat9-no-multi"
regex = '(?:\n?[a-z]{3}$)+'
haystack = "abc\ndef\nxyz"
matches = [[7, 11]]
[[test]]
name = "repeat9-no-multi-crlf"
regex = '(?R)(?:\n?[a-z]{3}$)+'
haystack = "abc\ndef\nxyz"
matches = [[7, 11]]
[[test]]
name = "repeat9-no-multi-crlf-cr"
regex = '(?R)(?:\r?[a-z]{3}$)+'
haystack = "abc\rdef\rxyz"
matches = [[7, 11]]
[[test]]
name = "repeat10"
regex = '(?m)(?:\n?[a-z]{3}$)*'
haystack = "abc\ndef\nxyz"
matches = [[0, 11]]
[[test]]
name = "repeat10-crlf"
regex = '(?Rm)(?:\n?[a-z]{3}$)*'
haystack = "abc\ndef\nxyz"
matches = [[0, 11]]
[[test]]
name = "repeat10-crlf-cr"
regex = '(?Rm)(?:\r?[a-z]{3}$)*'
haystack = "abc\rdef\rxyz"
matches = [[0, 11]]
[[test]]
name = "repeat10-no-multi"
regex = '(?:\n?[a-z]{3}$)*'
haystack = "abc\ndef\nxyz"
matches = [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4], [5, 5], [6, 6], [7, 11]]
[[test]]
name = "repeat10-no-multi-crlf"
regex = '(?R)(?:\n?[a-z]{3}$)*'
haystack = "abc\ndef\nxyz"
matches = [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4], [5, 5], [6, 6], [7, 11]]
[[test]]
name = "repeat10-no-multi-crlf-cr"
regex = '(?R)(?:\r?[a-z]{3}$)*'
haystack = "abc\rdef\rxyz"
matches = [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4], [5, 5], [6, 6], [7, 11]]
[[test]]
name = "repeat11"
regex = '(?m)^*'
haystack = "\naa\n"
matches = [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4]]
[[test]]
name = "repeat11-crlf"
regex = '(?Rm)^*'
haystack = "\naa\n"
matches = [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4]]
[[test]]
name = "repeat11-crlf-cr"
regex = '(?Rm)^*'
haystack = "\raa\r"
matches = [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4]]
[[test]]
name = "repeat11-no-multi"
regex = '^*'
haystack = "\naa\n"
matches = [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4]]
[[test]]
name = "repeat11-no-multi-crlf"
regex = '(?R)^*'
haystack = "\naa\n"
matches = [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4]]
[[test]]
name = "repeat11-no-multi-crlf-cr"
regex = '(?R)^*'
haystack = "\raa\r"
matches = [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4]]
[[test]]
name = "repeat12"
regex = '(?m)^+'
haystack = "\naa\n"
matches = [[0, 0], [1, 1], [4, 4]]
[[test]]
name = "repeat12-crlf"
regex = '(?Rm)^+'
haystack = "\naa\n"
matches = [[0, 0], [1, 1], [4, 4]]
[[test]]
name = "repeat12-crlf-cr"
regex = '(?Rm)^+'
haystack = "\raa\r"
matches = [[0, 0], [1, 1], [4, 4]]
[[test]]
name = "repeat12-no-multi"
regex = '^+'
haystack = "\naa\n"
matches = [[0, 0]]
[[test]]
name = "repeat12-no-multi-crlf"
regex = '(?R)^+'
haystack = "\naa\n"
matches = [[0, 0]]
[[test]]
name = "repeat12-no-multi-crlf-cr"
regex = '(?R)^+'
haystack = "\raa\r"
matches = [[0, 0]]
[[test]]
name = "repeat13"
regex = '(?m)$*'
haystack = "\naa\n"
matches = [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4]]
[[test]]
name = "repeat13-crlf"
regex = '(?Rm)$*'
haystack = "\naa\n"
matches = [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4]]
[[test]]
name = "repeat13-crlf-cr"
regex = '(?Rm)$*'
haystack = "\raa\r"
matches = [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4]]
[[test]]
name = "repeat13-no-multi"
regex = '$*'
haystack = "\naa\n"
matches = [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4]]
[[test]]
name = "repeat13-no-multi-crlf"
regex = '(?R)$*'
haystack = "\naa\n"
matches = [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4]]
[[test]]
name = "repeat13-no-multi-crlf-cr"
regex = '(?R)$*'
haystack = "\raa\r"
matches = [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4]]
[[test]]
name = "repeat14"
regex = '(?m)$+'
haystack = "\naa\n"
matches = [[0, 0], [3, 3], [4, 4]]
[[test]]
name = "repeat14-crlf"
regex = '(?Rm)$+'
haystack = "\naa\n"
matches = [[0, 0], [3, 3], [4, 4]]
[[test]]
name = "repeat14-crlf-cr"
regex = '(?Rm)$+'
haystack = "\raa\r"
matches = [[0, 0], [3, 3], [4, 4]]
[[test]]
name = "repeat14-no-multi"
regex = '$+'
haystack = "\naa\n"
matches = [[4, 4]]
[[test]]
name = "repeat14-no-multi-crlf"
regex = '(?R)$+'
haystack = "\naa\n"
matches = [[4, 4]]
[[test]]
name = "repeat14-no-multi-crlf-cr"
regex = '(?R)$+'
haystack = "\raa\r"
matches = [[4, 4]]
[[test]]
name = "repeat15"
regex = '(?m)(?:$\n)+'
haystack = "\n\naaa\n\n"
matches = [[0, 2], [5, 7]]
[[test]]
name = "repeat15-crlf"
regex = '(?Rm)(?:$\n)+'
haystack = "\n\naaa\n\n"
matches = [[0, 2], [5, 7]]
[[test]]
name = "repeat15-crlf-cr"
regex = '(?Rm)(?:$\r)+'
haystack = "\r\raaa\r\r"
matches = [[0, 2], [5, 7]]
[[test]]
name = "repeat15-no-multi"
regex = '(?:$\n)+'
haystack = "\n\naaa\n\n"
matches = []
[[test]]
name = "repeat15-no-multi-crlf"
regex = '(?R)(?:$\n)+'
haystack = "\n\naaa\n\n"
matches = []
[[test]]
name = "repeat15-no-multi-crlf-cr"
regex = '(?R)(?:$\r)+'
haystack = "\r\raaa\r\r"
matches = []
[[test]]
name = "repeat16"
regex = '(?m)(?:$\n)*'
haystack = "\n\naaa\n\n"
matches = [[0, 2], [3, 3], [4, 4], [5, 7]]
[[test]]
name = "repeat16-crlf"
regex = '(?Rm)(?:$\n)*'
haystack = "\n\naaa\n\n"
matches = [[0, 2], [3, 3], [4, 4], [5, 7]]
[[test]]
name = "repeat16-crlf-cr"
regex = '(?Rm)(?:$\r)*'
haystack = "\r\raaa\r\r"
matches = [[0, 2], [3, 3], [4, 4], [5, 7]]
[[test]]
name = "repeat16-no-multi"
regex = '(?:$\n)*'
haystack = "\n\naaa\n\n"
matches = [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4], [5, 5], [6, 6], [7, 7]]
[[test]]
name = "repeat16-no-multi-crlf"
regex = '(?R)(?:$\n)*'
haystack = "\n\naaa\n\n"
matches = [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4], [5, 5], [6, 6], [7, 7]]
[[test]]
name = "repeat16-no-multi-crlf-cr"
regex = '(?R)(?:$\r)*'
haystack = "\r\raaa\r\r"
matches = [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4], [5, 5], [6, 6], [7, 7]]
[[test]]
name = "repeat17"
regex = '(?m)(?:$\n^)+'
haystack = "\n\naaa\n\n"
matches = [[0, 2], [5, 7]]
[[test]]
name = "repeat17-crlf"
regex = '(?Rm)(?:$\n^)+'
haystack = "\n\naaa\n\n"
matches = [[0, 2], [5, 7]]
[[test]]
name = "repeat17-crlf-cr"
regex = '(?Rm)(?:$\r^)+'
haystack = "\r\raaa\r\r"
matches = [[0, 2], [5, 7]]
[[test]]
name = "repeat17-no-multi"
regex = '(?:$\n^)+'
haystack = "\n\naaa\n\n"
matches = []
[[test]]
name = "repeat17-no-multi-crlf"
regex = '(?R)(?:$\n^)+'
haystack = "\n\naaa\n\n"
matches = []
[[test]]
name = "repeat17-no-multi-crlf-cr"
regex = '(?R)(?:$\r^)+'
haystack = "\r\raaa\r\r"
matches = []
[[test]]
name = "repeat18"
regex = '(?m)(?:^|$)+'
haystack = "\n\naaa\n\n"
matches = [[0, 0], [1, 1], [2, 2], [5, 5], [6, 6], [7, 7]]
[[test]]
name = "repeat18-crlf"
regex = '(?Rm)(?:^|$)+'
haystack = "\n\naaa\n\n"
matches = [[0, 0], [1, 1], [2, 2], [5, 5], [6, 6], [7, 7]]
[[test]]
name = "repeat18-crlf-cr"
regex = '(?Rm)(?:^|$)+'
haystack = "\r\raaa\r\r"
matches = [[0, 0], [1, 1], [2, 2], [5, 5], [6, 6], [7, 7]]
[[test]]
name = "repeat18-no-multi"
regex = '(?:^|$)+'
haystack = "\n\naaa\n\n"
matches = [[0, 0], [7, 7]]
[[test]]
name = "repeat18-no-multi-crlf"
regex = '(?R)(?:^|$)+'
haystack = "\n\naaa\n\n"
matches = [[0, 0], [7, 7]]
[[test]]
name = "repeat18-no-multi-crlf-cr"
regex = '(?R)(?:^|$)+'
haystack = "\r\raaa\r\r"
matches = [[0, 0], [7, 7]]
[[test]]
name = "match-line-100"
regex = '(?m)^.+$'
haystack = "aa\naaaaaaaaaaaaaaaaaaa\n"
matches = [[0, 2], [3, 22]]
[[test]]
name = "match-line-100-crlf"
regex = '(?Rm)^.+$'
haystack = "aa\naaaaaaaaaaaaaaaaaaa\n"
matches = [[0, 2], [3, 22]]
[[test]]
name = "match-line-100-crlf-cr"
regex = '(?Rm)^.+$'
haystack = "aa\raaaaaaaaaaaaaaaaaaa\r"
matches = [[0, 2], [3, 22]]
[[test]]
name = "match-line-200"
regex = '(?m)^.+$'
haystack = "aa\naaaaaaaaaaaaaaaaaaa\n"
matches = [[0, 2], [3, 22]]
unicode = false
utf8 = false
[[test]]
name = "match-line-200-crlf"
regex = '(?Rm)^.+$'
haystack = "aa\naaaaaaaaaaaaaaaaaaa\n"
matches = [[0, 2], [3, 22]]
unicode = false
utf8 = false
[[test]]
name = "match-line-200-crlf-cr"
regex = '(?Rm)^.+$'
haystack = "aa\raaaaaaaaaaaaaaaaaaa\r"
matches = [[0, 2], [3, 22]]
unicode = false
utf8 = false

222
vendor/regex/testdata/no-unicode.toml vendored Normal file
View File

@@ -0,0 +1,222 @@
[[test]]
name = "invalid-utf8-literal1"
regex = '\xFF'
haystack = '\xFF'
matches = [[0, 1]]
unicode = false
utf8 = false
unescape = true
[[test]]
name = "mixed"
regex = '(?:.+)(?-u)(?:.+)'
haystack = '\xCE\x93\xCE\x94\xFF'
matches = [[0, 5]]
utf8 = false
unescape = true
[[test]]
name = "case1"
regex = "a"
haystack = "A"
matches = [[0, 1]]
case-insensitive = true
unicode = false
[[test]]
name = "case2"
regex = "[a-z]+"
haystack = "AaAaA"
matches = [[0, 5]]
case-insensitive = true
unicode = false
[[test]]
name = "case3"
regex = "[a-z]+"
haystack = "aA\u212AaA"
matches = [[0, 7]]
case-insensitive = true
[[test]]
name = "case4"
regex = "[a-z]+"
haystack = "aA\u212AaA"
matches = [[0, 2], [5, 7]]
case-insensitive = true
unicode = false
[[test]]
name = "negate1"
regex = "[^a]"
haystack = "δ"
matches = [[0, 2]]
[[test]]
name = "negate2"
regex = "[^a]"
haystack = "δ"
matches = [[0, 1], [1, 2]]
unicode = false
utf8 = false
[[test]]
name = "dotstar-prefix1"
regex = "a"
haystack = '\xFFa'
matches = [[1, 2]]
unicode = false
utf8 = false
unescape = true
[[test]]
name = "dotstar-prefix2"
regex = "a"
haystack = '\xFFa'
matches = [[1, 2]]
utf8 = false
unescape = true
[[test]]
name = "null-bytes1"
regex = '[^\x00]+\x00'
haystack = 'foo\x00'
matches = [[0, 4]]
unicode = false
utf8 = false
unescape = true
[[test]]
name = "word-ascii"
regex = '\w+'
haystack = "aδ"
matches = [[0, 1]]
unicode = false
[[test]]
name = "word-unicode"
regex = '\w+'
haystack = "aδ"
matches = [[0, 3]]
[[test]]
name = "decimal-ascii"
regex = '\d+'
haystack = "1२३9"
matches = [[0, 1], [7, 8]]
unicode = false
[[test]]
name = "decimal-unicode"
regex = '\d+'
haystack = "1२३9"
matches = [[0, 8]]
[[test]]
name = "space-ascii"
regex = '\s+'
haystack = " \u1680"
matches = [[0, 1]]
unicode = false
[[test]]
name = "space-unicode"
regex = '\s+'
haystack = " \u1680"
matches = [[0, 4]]
[[test]]
# See: https://github.com/rust-lang/regex/issues/484
name = "iter1-bytes"
regex = ''
haystack = "☃"
matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
utf8 = false
[[test]]
# See: https://github.com/rust-lang/regex/issues/484
name = "iter1-utf8"
regex = ''
haystack = "☃"
matches = [[0, 0], [3, 3]]
[[test]]
# See: https://github.com/rust-lang/regex/issues/484
# Note that iter2-utf8 doesn't make sense here, since the input isn't UTF-8.
name = "iter2-bytes"
regex = ''
haystack = 'b\xFFr'
matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
unescape = true
utf8 = false
# These test that unanchored prefixes can munch through invalid UTF-8 even when
# utf8 is enabled.
#
# This test actually reflects an interesting simplification in how the Thompson
# NFA is constructed. It used to be that the NFA could be built with an
# unanchored prefix that either matched any byte or _only_ matched valid UTF-8.
# But the latter turns out to be pretty precarious when it comes to prefilters,
# because if you search a haystack that contains invalid UTF-8 but have an
# unanchored prefix that requires UTF-8, then prefilters are no longer a valid
# optimization because you actually have to check that everything is valid
# UTF-8.
#
# Originally, I had thought that we needed a valid UTF-8 unanchored prefix in
# order to guarantee that we only match at valid UTF-8 boundaries. But this
# isn't actually true! There are really only two things to consider here:
#
# 1) Will a regex match split an encoded codepoint? No. Because by construction,
# we ensure that a MATCH state can only be reached by following valid UTF-8 (assuming
# all of the UTF-8 modes are enabled).
#
# 2) Will a regex match arbitrary bytes that aren't valid UTF-8? Again, no,
# assuming all of the UTF-8 modes are enabled.
[[test]]
name = "unanchored-invalid-utf8-match-100"
regex = '[a-z]'
haystack = '\xFFa\xFF'
matches = [[1, 2]]
unescape = true
utf8 = false
# This test shows that we can still prevent a match from occurring by requiring
# that valid UTF-8 match by inserting our own unanchored prefix. Thus, if the
# behavior of not munching through invalid UTF-8 anywhere is needed, then it
# can be achieved thusly.
[[test]]
name = "unanchored-invalid-utf8-nomatch"
regex = '^(?s:.)*?[a-z]'
haystack = '\xFFa\xFF'
matches = []
unescape = true
utf8 = false
# This is a tricky test that makes sure we don't accidentally do a kind of
# unanchored search when we've requested that a regex engine not report
# empty matches that split a codepoint. This test caught a regression during
# development where the code for skipping over bad empty matches would do so
# even if the search should have been anchored. This is ultimately what led to
# making 'anchored' an 'Input' option, so that it was always clear what kind
# of search was being performed. (Before that, whether a search was anchored
# or not was a config knob on the regex engine.) This did wind up making DFAs
# a little more complex to configure (with their 'StartKind' knob), but it
# generally smoothed out everything else.
#
# Great example of a test whose failure motivated a sweeping API refactoring.
[[test]]
name = "anchored-iter-empty-utf8"
regex = ''
haystack = 'a☃z'
matches = [[0, 0], [1, 1]]
unescape = false
utf8 = true
anchored = true

280
vendor/regex/testdata/overlapping.toml vendored Normal file
View File

@@ -0,0 +1,280 @@
# NOTE: We define a number of tests where the *match* kind is 'leftmost-first'
# but the *search* kind is 'overlapping'. This is a somewhat nonsensical
# combination and can produce odd results. Nevertheless, those results should
# be consistent so we test them here. (At the time of writing this note, I
# hadn't yet decided whether to make 'leftmost-first' with 'overlapping' result
# in unspecified behavior.)
# This demonstrates how a full overlapping search is obviously quadratic. This
# regex reports a match for every substring in the haystack.
[[test]]
name = "ungreedy-dotstar-matches-everything-100"
regex = [".*?"]
haystack = "zzz"
matches = [
{ id = 0, span = [0, 0] },
{ id = 0, span = [1, 1] },
{ id = 0, span = [0, 1] },
{ id = 0, span = [2, 2] },
{ id = 0, span = [1, 2] },
{ id = 0, span = [0, 2] },
{ id = 0, span = [3, 3] },
{ id = 0, span = [2, 3] },
{ id = 0, span = [1, 3] },
{ id = 0, span = [0, 3] },
]
match-kind = "all"
search-kind = "overlapping"
[[test]]
name = "greedy-dotstar-matches-everything-100"
regex = [".*"]
haystack = "zzz"
matches = [
{ id = 0, span = [0, 0] },
{ id = 0, span = [1, 1] },
{ id = 0, span = [0, 1] },
{ id = 0, span = [2, 2] },
{ id = 0, span = [1, 2] },
{ id = 0, span = [0, 2] },
{ id = 0, span = [3, 3] },
{ id = 0, span = [2, 3] },
{ id = 0, span = [1, 3] },
{ id = 0, span = [0, 3] },
]
match-kind = "all"
search-kind = "overlapping"
[[test]]
name = "repetition-plus-leftmost-first-100"
regex = 'a+'
haystack = "aaa"
matches = [[0, 1], [1, 2], [0, 2], [2, 3], [1, 3], [0, 3]]
match-kind = "leftmost-first"
search-kind = "overlapping"
[[test]]
name = "repetition-plus-leftmost-first-110"
regex = '☃+'
haystack = "☃☃☃"
matches = [[0, 3], [3, 6], [0, 6], [6, 9], [3, 9], [0, 9]]
match-kind = "leftmost-first"
search-kind = "overlapping"
[[test]]
name = "repetition-plus-all-100"
regex = 'a+'
haystack = "aaa"
matches = [[0, 1], [1, 2], [0, 2], [2, 3], [1, 3], [0, 3]]
match-kind = "all"
search-kind = "overlapping"
[[test]]
name = "repetition-plus-all-110"
regex = '☃+'
haystack = "☃☃☃"
matches = [[0, 3], [3, 6], [0, 6], [6, 9], [3, 9], [0, 9]]
match-kind = "all"
search-kind = "overlapping"
[[test]]
name = "repetition-plus-leftmost-first-200"
regex = '(abc)+'
haystack = "zzabcabczzabc"
matches = [
[[2, 5], [2, 5]],
[[5, 8], [5, 8]],
[[2, 8], [5, 8]],
]
match-kind = "leftmost-first"
search-kind = "overlapping"
[[test]]
name = "repetition-plus-all-200"
regex = '(abc)+'
haystack = "zzabcabczzabc"
matches = [
[[2, 5], [2, 5]],
[[5, 8], [5, 8]],
[[2, 8], [5, 8]],
[[10, 13], [10, 13]],
]
match-kind = "all"
search-kind = "overlapping"
[[test]]
name = "repetition-star-leftmost-first-100"
regex = 'a*'
haystack = "aaa"
matches = [
[0, 0],
[1, 1],
[0, 1],
[2, 2],
[1, 2],
[0, 2],
[3, 3],
[2, 3],
[1, 3],
[0, 3],
]
match-kind = "leftmost-first"
search-kind = "overlapping"
[[test]]
name = "repetition-star-all-100"
regex = 'a*'
haystack = "aaa"
matches = [
[0, 0],
[1, 1],
[0, 1],
[2, 2],
[1, 2],
[0, 2],
[3, 3],
[2, 3],
[1, 3],
[0, 3],
]
match-kind = "all"
search-kind = "overlapping"
[[test]]
name = "repetition-star-leftmost-first-200"
regex = '(abc)*'
haystack = "zzabcabczzabc"
matches = [
[[0, 0], []],
]
match-kind = "leftmost-first"
search-kind = "overlapping"
[[test]]
name = "repetition-star-all-200"
regex = '(abc)*'
haystack = "zzabcabczzabc"
matches = [
[[0, 0], []],
[[1, 1], []],
[[2, 2], []],
[[3, 3], []],
[[4, 4], []],
[[5, 5], []],
[[2, 5], [2, 5]],
[[6, 6], []],
[[7, 7], []],
[[8, 8], []],
[[5, 8], [5, 8]],
[[2, 8], [5, 8]],
[[9, 9], []],
[[10, 10], []],
[[11, 11], []],
[[12, 12], []],
[[13, 13], []],
[[10, 13], [10, 13]],
]
match-kind = "all"
search-kind = "overlapping"
[[test]]
name = "start-end-rep-leftmost-first"
regex = '(^$)*'
haystack = "abc"
matches = [
[[0, 0], []],
]
match-kind = "leftmost-first"
search-kind = "overlapping"
[[test]]
name = "start-end-rep-all"
regex = '(^$)*'
haystack = "abc"
matches = [
[[0, 0], []],
[[1, 1], []],
[[2, 2], []],
[[3, 3], []],
]
match-kind = "all"
search-kind = "overlapping"
[[test]]
name = "alt-leftmost-first-100"
regex = 'abc|a'
haystack = "zzabcazzaabc"
matches = [[2, 3], [2, 5]]
match-kind = "leftmost-first"
search-kind = "overlapping"
[[test]]
name = "alt-all-100"
regex = 'abc|a'
haystack = "zzabcazzaabc"
matches = [[2, 3], [2, 5], [5, 6], [8, 9], [9, 10], [9, 12]]
match-kind = "all"
search-kind = "overlapping"
[[test]]
name = "empty-000"
regex = ""
haystack = "abc"
matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
match-kind = "all"
search-kind = "overlapping"
[[test]]
name = "empty-alt-000"
regex = "|b"
haystack = "abc"
matches = [[0, 0], [1, 1], [2, 2], [1, 2], [3, 3]]
match-kind = "all"
search-kind = "overlapping"
[[test]]
name = "empty-alt-010"
regex = "b|"
haystack = "abc"
matches = [[0, 0], [1, 1], [2, 2], [1, 2], [3, 3]]
match-kind = "all"
search-kind = "overlapping"
[[test]]
# See: https://github.com/rust-lang/regex/issues/484
name = "iter1-bytes"
regex = ''
haystack = "☃"
matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
utf8 = false
match-kind = "all"
search-kind = "overlapping"
[[test]]
# See: https://github.com/rust-lang/regex/issues/484
name = "iter1-utf8"
regex = ''
haystack = "☃"
matches = [[0, 0], [3, 3]]
match-kind = "all"
search-kind = "overlapping"
[[test]]
name = "iter1-incomplete-utf8"
regex = ''
haystack = '\xE2\x98' # incomplete snowman
matches = [[0, 0], [1, 1], [2, 2]]
match-kind = "all"
search-kind = "overlapping"
unescape = true
utf8 = false
[[test]]
name = "scratch"
regex = ['sam', 'samwise']
haystack = "samwise"
matches = [
{ id = 0, span = [0, 3] },
]
match-kind = "leftmost-first"
search-kind = "overlapping"

98
vendor/regex/testdata/regex-lite.toml vendored Normal file
View File

@@ -0,0 +1,98 @@
# These tests are specifically written to test the regex-lite crate. While it
# largely has the same semantics as the regex crate, there are some differences
# around Unicode support and UTF-8.
#
# To be clear, regex-lite supports far fewer patterns because of its lack of
# Unicode support, nested character classes and character class set operations.
# What we're talking about here are the patterns that both crates support but
# where the semantics might differ.
# regex-lite uses ASCII definitions for Perl character classes.
[[test]]
name = "perl-class-decimal"
regex = '\d'
haystack = '᠕'
matches = []
unicode = true
# regex-lite uses ASCII definitions for Perl character classes.
[[test]]
name = "perl-class-space"
regex = '\s'
haystack = "\u2000"
matches = []
unicode = true
# regex-lite uses ASCII definitions for Perl character classes.
[[test]]
name = "perl-class-word"
regex = '\w'
haystack = 'δ'
matches = []
unicode = true
# regex-lite uses the ASCII definition of word for word boundary assertions.
[[test]]
name = "word-boundary"
regex = '\b'
haystack = 'δ'
matches = []
unicode = true
# regex-lite uses the ASCII definition of word for negated word boundary
# assertions. But note that it should still not split codepoints!
[[test]]
name = "word-boundary-negated"
regex = '\B'
haystack = 'δ'
matches = [[0, 0], [2, 2]]
unicode = true
# While we're here, the empty regex---which matches at every
# position---shouldn't split a codepoint either.
[[test]]
name = "empty-no-split-codepoint"
regex = ''
haystack = '💩'
matches = [[0, 0], [4, 4]]
unicode = true
# A dot always matches a full codepoint.
[[test]]
name = "dot-always-matches-codepoint"
regex = '.'
haystack = '💩'
matches = [[0, 4]]
unicode = false
# A negated character class also always matches a full codepoint.
[[test]]
name = "negated-class-always-matches-codepoint"
regex = '[^a]'
haystack = '💩'
matches = [[0, 4]]
unicode = false
# regex-lite only supports ASCII-aware case insensitive matching.
[[test]]
name = "case-insensitive-is-ascii-only"
regex = 's'
haystack = 'ſ'
matches = []
unicode = true
case-insensitive = true
# Negated word boundaries shouldn't split a codepoint, but they will match
# between invalid UTF-8.
#
# This test is only valid for a 'bytes' API, but that doesn't (yet) exist in
# regex-lite. This can't happen in the main API because &str can't contain
# invalid UTF-8.
# [[test]]
# name = "word-boundary-invalid-utf8"
# regex = '\B'
# haystack = '\xFF\xFF\xFF\xFF'
# unescape = true
# matches = [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4]]
# unicode = true
# utf8 = false

830
vendor/regex/testdata/regression.toml vendored Normal file
View File

@@ -0,0 +1,830 @@
# See: https://github.com/rust-lang/regex/issues/48
[[test]]
name = "invalid-regex-no-crash-100"
regex = '(*)'
haystack = ""
matches = []
compiles = false
# See: https://github.com/rust-lang/regex/issues/48
[[test]]
name = "invalid-regex-no-crash-200"
regex = '(?:?)'
haystack = ""
matches = []
compiles = false
# See: https://github.com/rust-lang/regex/issues/48
[[test]]
name = "invalid-regex-no-crash-300"
regex = '(?)'
haystack = ""
matches = []
compiles = false
# See: https://github.com/rust-lang/regex/issues/48
[[test]]
name = "invalid-regex-no-crash-400"
regex = '*'
haystack = ""
matches = []
compiles = false
# See: https://github.com/rust-lang/regex/issues/75
[[test]]
name = "unsorted-binary-search-100"
regex = '(?i-u)[a_]+'
haystack = "A_"
matches = [[0, 2]]
# See: https://github.com/rust-lang/regex/issues/75
[[test]]
name = "unsorted-binary-search-200"
regex = '(?i-u)[A_]+'
haystack = "a_"
matches = [[0, 2]]
# See: https://github.com/rust-lang/regex/issues/76
[[test]]
name = "unicode-case-lower-nocase-flag"
regex = '(?i)\p{Ll}+'
haystack = "ΛΘΓΔα"
matches = [[0, 10]]
# See: https://github.com/rust-lang/regex/issues/99
[[test]]
name = "negated-char-class-100"
regex = '(?i)[^x]'
haystack = "x"
matches = []
# See: https://github.com/rust-lang/regex/issues/99
[[test]]
name = "negated-char-class-200"
regex = '(?i)[^x]'
haystack = "X"
matches = []
# See: https://github.com/rust-lang/regex/issues/101
[[test]]
name = "ascii-word-underscore"
regex = '[[:word:]]'
haystack = "_"
matches = [[0, 1]]
# See: https://github.com/rust-lang/regex/issues/129
[[test]]
name = "captures-repeat"
regex = '([a-f]){2}(?P<foo>[x-z])'
haystack = "abx"
matches = [
[[0, 3], [1, 2], [2, 3]],
]
# See: https://github.com/rust-lang/regex/issues/153
[[test]]
name = "alt-in-alt-100"
regex = 'ab?|$'
haystack = "az"
matches = [[0, 1], [2, 2]]
# See: https://github.com/rust-lang/regex/issues/153
[[test]]
name = "alt-in-alt-200"
regex = '^(?:.*?)(?:\n|\r\n?|$)'
haystack = "ab\rcd"
matches = [[0, 3]]
# See: https://github.com/rust-lang/regex/issues/169
[[test]]
name = "leftmost-first-prefix"
regex = 'z*azb'
haystack = "azb"
matches = [[0, 3]]
# See: https://github.com/rust-lang/regex/issues/191
[[test]]
name = "many-alternates"
regex = '1|2|3|4|5|6|7|8|9|10|int'
haystack = "int"
matches = [[0, 3]]
# See: https://github.com/rust-lang/regex/issues/204
[[test]]
name = "word-boundary-alone-100"
regex = '\b'
haystack = "Should this (work?)"
matches = [[0, 0], [6, 6], [7, 7], [11, 11], [13, 13], [17, 17]]
# See: https://github.com/rust-lang/regex/issues/204
[[test]]
name = "word-boundary-alone-200"
regex = '\b'
haystack = "a b c"
matches = [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4], [5, 5]]
# See: https://github.com/rust-lang/regex/issues/264
[[test]]
name = "word-boundary-ascii-no-capture"
regex = '\B'
haystack = "\U00028F3E"
matches = [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4]]
unicode = false
utf8 = false
# See: https://github.com/rust-lang/regex/issues/264
[[test]]
name = "word-boundary-ascii-capture"
regex = '(?:\B)'
haystack = "\U00028F3E"
matches = [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4]]
unicode = false
utf8 = false
# See: https://github.com/rust-lang/regex/issues/268
[[test]]
name = "partial-anchor"
regex = '^a|b'
haystack = "ba"
matches = [[0, 1]]
# See: https://github.com/rust-lang/regex/issues/271
[[test]]
name = "endl-or-word-boundary"
regex = '(?m:$)|(?-u:\b)'
haystack = "\U0006084E"
matches = [[4, 4]]
# See: https://github.com/rust-lang/regex/issues/271
[[test]]
name = "zero-or-end"
regex = '(?i-u:\x00)|$'
haystack = "\U000E682F"
matches = [[4, 4]]
# See: https://github.com/rust-lang/regex/issues/271
[[test]]
name = "y-or-endl"
regex = '(?i-u:y)|(?m:$)'
haystack = "\U000B4331"
matches = [[4, 4]]
# See: https://github.com/rust-lang/regex/issues/271
[[test]]
name = "word-boundary-start-x"
regex = '(?u:\b)^(?-u:X)'
haystack = "X"
matches = [[0, 1]]
# See: https://github.com/rust-lang/regex/issues/271
[[test]]
name = "word-boundary-ascii-start-x"
regex = '(?-u:\b)^(?-u:X)'
haystack = "X"
matches = [[0, 1]]
# See: https://github.com/rust-lang/regex/issues/271
[[test]]
name = "end-not-word-boundary"
regex = '$\B'
haystack = "\U0005C124\U000B576C"
matches = [[8, 8]]
unicode = false
utf8 = false
# See: https://github.com/rust-lang/regex/issues/280
[[test]]
name = "partial-anchor-alternate-begin"
regex = '^a|z'
haystack = "yyyyya"
matches = []
# See: https://github.com/rust-lang/regex/issues/280
[[test]]
name = "partial-anchor-alternate-end"
regex = 'a$|z'
haystack = "ayyyyy"
matches = []
# See: https://github.com/rust-lang/regex/issues/289
[[test]]
name = "lits-unambiguous-100"
regex = '(?:ABC|CDA|BC)X'
haystack = "CDAX"
matches = [[0, 4]]
# See: https://github.com/rust-lang/regex/issues/291
[[test]]
name = "lits-unambiguous-200"
regex = '((IMG|CAM|MG|MB2)_|(DSCN|CIMG))(?P<n>[0-9]+)$'
haystack = "CIMG2341"
matches = [
[[0, 8], [0, 4], [], [0, 4], [4, 8]],
]
# See: https://github.com/rust-lang/regex/issues/303
#
# 2022-09-19: This has now been "properly" fixed in that empty character
# classes are fully supported as something that can never match. This test
# used to be marked as 'compiles = false', but now it works.
[[test]]
name = "negated-full-byte-range"
regex = '[^\x00-\xFF]'
haystack = ""
matches = []
compiles = true
unicode = false
utf8 = false
# See: https://github.com/rust-lang/regex/issues/321
[[test]]
name = "strange-anchor-non-complete-prefix"
regex = 'a^{2}'
haystack = ""
matches = []
# See: https://github.com/rust-lang/regex/issues/321
[[test]]
name = "strange-anchor-non-complete-suffix"
regex = '${2}a'
haystack = ""
matches = []
# See: https://github.com/rust-lang/regex/issues/334
# See: https://github.com/rust-lang/regex/issues/557
[[test]]
name = "captures-after-dfa-premature-end-100"
regex = 'a(b*(X|$))?'
haystack = "abcbX"
matches = [
[[0, 1], [], []],
]
# See: https://github.com/rust-lang/regex/issues/334
# See: https://github.com/rust-lang/regex/issues/557
[[test]]
name = "captures-after-dfa-premature-end-200"
regex = 'a(bc*(X|$))?'
haystack = "abcbX"
matches = [
[[0, 1], [], []],
]
# See: https://github.com/rust-lang/regex/issues/334
# See: https://github.com/rust-lang/regex/issues/557
[[test]]
name = "captures-after-dfa-premature-end-300"
regex = '(aa$)?'
haystack = "aaz"
matches = [
[[0, 0], []],
[[1, 1], []],
[[2, 2], []],
[[3, 3], []],
]
# Plucked from "Why aren't regular expressions a lingua franca? an empirical
# study on the re-use and portability of regular expressions", The ACM Joint
# European Software Engineering Conference and Symposium on the Foundations of
# Software Engineering (ESEC/FSE), 2019.
#
# Link: https://dl.acm.org/doi/pdf/10.1145/3338906.3338909
[[test]]
name = "captures-after-dfa-premature-end-400"
regex = '(a)\d*\.?\d+\b'
haystack = "a0.0c"
matches = [
[[0, 2], [0, 1]],
]
# See: https://github.com/rust-lang/regex/issues/437
[[test]]
name = "literal-panic"
regex = 'typename type\-parameter\-[0-9]+\-[0-9]+::.+'
haystack = "test"
matches = []
# See: https://github.com/rust-lang/regex/issues/527
[[test]]
name = "empty-flag-expr"
regex = '(?:(?:(?x)))'
haystack = ""
matches = [[0, 0]]
# See: https://github.com/rust-lang/regex/issues/533
#[[tests]]
#name = "blank-matches-nothing-between-space-and-tab"
#regex = '[[:blank:]]'
#input = '\x0A\x0B\x0C\x0D\x0E\x0F\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\x1B\x1C\x1D\x1E\x1F'
#match = false
#unescape = true
# See: https://github.com/rust-lang/regex/issues/533
#[[tests]]
#name = "blank-matches-nothing-between-space-and-tab-inverted"
#regex = '^[[:^blank:]]+$'
#input = '\x0A\x0B\x0C\x0D\x0E\x0F\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\x1B\x1C\x1D\x1E\x1F'
#match = true
#unescape = true
# See: https://github.com/rust-lang/regex/issues/555
[[test]]
name = "invalid-repetition"
regex = '(?m){1,1}'
haystack = ""
matches = []
compiles = false
# See: https://github.com/rust-lang/regex/issues/640
[[test]]
name = "flags-are-unset"
regex = '(?:(?i)foo)|Bar'
haystack = "foo Foo bar Bar"
matches = [[0, 3], [4, 7], [12, 15]]
# Note that 'Ј' is not 'j', but Cyrillic Je
# https://en.wikipedia.org/wiki/Je_(Cyrillic)
#
# See: https://github.com/rust-lang/regex/issues/659
[[test]]
name = "empty-group-with-unicode"
regex = '(?:)Ј01'
haystack = 'zЈ01'
matches = [[1, 5]]
# See: https://github.com/rust-lang/regex/issues/579
[[test]]
name = "word-boundary-weird"
regex = '\b..\b'
haystack = "I have 12, he has 2!"
matches = [[0, 2], [7, 9], [9, 11], [11, 13], [17, 19]]
# See: https://github.com/rust-lang/regex/issues/579
[[test]]
name = "word-boundary-weird-ascii"
regex = '\b..\b'
haystack = "I have 12, he has 2!"
matches = [[0, 2], [7, 9], [9, 11], [11, 13], [17, 19]]
unicode = false
utf8 = false
# See: https://github.com/rust-lang/regex/issues/579
[[test]]
name = "word-boundary-weird-minimal-ascii"
regex = '\b..\b'
haystack = "az,,b"
matches = [[0, 2], [2, 4]]
unicode = false
utf8 = false
# See: https://github.com/BurntSushi/ripgrep/issues/1203
[[test]]
name = "reverse-suffix-100"
regex = '[0-4][0-4][0-4]000'
haystack = "153.230000"
matches = [[4, 10]]
# See: https://github.com/BurntSushi/ripgrep/issues/1203
[[test]]
name = "reverse-suffix-200"
regex = '[0-9][0-9][0-9]000'
haystack = "153.230000\n"
matches = [[4, 10]]
# This is a tricky case for the reverse suffix optimization, because it
# finds the 'foobar' match but the reverse scan must fail to find a match by
# correctly dealing with the word boundary following the 'foobar' literal when
# computing the start state.
#
# This test exists because I tried to break the following assumption that
# is currently in the code: that if a suffix is found and the reverse scan
# succeeds, then it's guaranteed that there is an overall match. Namely, the
# 'is_match' routine does *not* do another forward scan in this case because of
# this assumption.
[[test]]
name = "reverse-suffix-300"
regex = '\w+foobar\b'
haystack = "xyzfoobarZ"
matches = []
unicode = false
utf8 = false
# See: https://github.com/BurntSushi/ripgrep/issues/1247
[[test]]
name = "stops"
regex = '\bs(?:[ab])'
haystack = 's\xE4'
matches = []
unescape = true
utf8 = false
# See: https://github.com/BurntSushi/ripgrep/issues/1247
[[test]]
name = "stops-ascii"
regex = '(?-u:\b)s(?:[ab])'
haystack = 's\xE4'
matches = []
unescape = true
utf8 = false
# See: https://github.com/rust-lang/regex/issues/850
[[test]]
name = "adjacent-line-boundary-100"
regex = '(?m)^(?:[^ ]+?)$'
haystack = "line1\nline2"
matches = [[0, 5], [6, 11]]
# Continued.
[[test]]
name = "adjacent-line-boundary-200"
regex = '(?m)^(?:[^ ]+?)$'
haystack = "A\nB"
matches = [[0, 1], [2, 3]]
# There is no issue for this bug.
[[test]]
name = "anchored-prefix-100"
regex = '^a[[:^space:]]'
haystack = "a "
matches = []
# There is no issue for this bug.
[[test]]
name = "anchored-prefix-200"
regex = '^a[[:^space:]]'
haystack = "foo boo a"
matches = []
# There is no issue for this bug.
[[test]]
name = "anchored-prefix-300"
regex = '^-[a-z]'
haystack = "r-f"
matches = []
# Tests that a possible Aho-Corasick optimization works correctly. It only
# kicks in when we have a lot of literals. By "works correctly," we mean that
# leftmost-first match semantics are properly respected. That is, samwise
# should match, not sam.
#
# There is no issue for this bug.
[[test]]
name = "aho-corasick-100"
regex = 'samwise|sam|a|b|c|d|e|f|g|h|i|j|k|l|m|n|o|p|q|r|s|t|u|v|w|x|y|z|A|B|C|D|E|F|G|H|I|J|K|L|M|N|O|P|Q|R|S|T|U|V|W|X|Y|Z'
haystack = "samwise"
matches = [[0, 7]]
# See: https://github.com/rust-lang/regex/issues/921
[[test]]
name = "interior-anchor-capture"
regex = '(a$)b$'
haystack = 'ab'
matches = []
# I found this bug in the course of adding some of the regexes that Ruff uses
# to rebar. It turns out that the lazy DFA was finding a match that was being
# rejected by the one-pass DFA. Yikes. I then minimized the regex and haystack.
#
# Source: https://github.com/charliermarsh/ruff/blob/a919041ddaa64cdf6f216f90dd0480dab69fd3ba/crates/ruff/src/rules/pycodestyle/rules/whitespace_around_keywords.rs#L52
[[test]]
name = "ruff-whitespace-around-keywords"
regex = '^(a|ab)$'
haystack = "ab"
anchored = true
unicode = false
utf8 = true
matches = [[[0, 2], [0, 2]]]
# From: https://github.com/rust-lang/regex/issues/429
[[test]]
name = "i429-0"
regex = '(?:(?-u:\b)|(?u:h))+'
haystack = "h"
unicode = true
utf8 = false
matches = [[0, 0], [1, 1]]
# From: https://github.com/rust-lang/regex/issues/429
[[test]]
name = "i429-1"
regex = '(?u:\B)'
haystack = "鋸"
unicode = true
utf8 = false
matches = []
# From: https://github.com/rust-lang/regex/issues/429
[[test]]
name = "i429-2"
regex = '(?:(?u:\b)|(?s-u:.))+'
haystack = "oB"
unicode = true
utf8 = false
matches = [[0, 0], [1, 2]]
# From: https://github.com/rust-lang/regex/issues/429
[[test]]
name = "i429-3"
regex = '(?:(?-u:\B)|(?su:.))+'
haystack = "\U000FEF80"
unicode = true
utf8 = false
matches = [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4]]
# From: https://github.com/rust-lang/regex/issues/429
[[test]]
name = "i429-3-utf8"
regex = '(?:(?-u:\B)|(?su:.))+'
haystack = "\U000FEF80"
unicode = true
utf8 = true
matches = [[0, 0], [4, 4]]
# From: https://github.com/rust-lang/regex/issues/429
[[test]]
name = "i429-4"
regex = '(?m:$)(?m:^)(?su:.)'
haystack = "\n‣"
unicode = true
utf8 = false
matches = [[0, 1]]
# From: https://github.com/rust-lang/regex/issues/429
[[test]]
name = "i429-5"
regex = '(?m:$)^(?m:^)'
haystack = "\n"
unicode = true
utf8 = false
matches = [[0, 0]]
# From: https://github.com/rust-lang/regex/issues/429
[[test]]
name = "i429-6"
regex = '(?P<kp>(?iu:do)(?m:$))*'
haystack = "dodo"
unicode = true
utf8 = false
matches = [
[[0, 0], []],
[[1, 1], []],
[[2, 4], [2, 4]],
]
# From: https://github.com/rust-lang/regex/issues/429
[[test]]
name = "i429-7"
regex = '(?u:\B)'
haystack = "䡁"
unicode = true
utf8 = false
matches = []
# From: https://github.com/rust-lang/regex/issues/429
[[test]]
name = "i429-8"
regex = '(?:(?-u:\b)|(?u:[\u{0}-W]))+'
haystack = "0"
unicode = true
utf8 = false
matches = [[0, 0], [1, 1]]
# From: https://github.com/rust-lang/regex/issues/429
[[test]]
name = "i429-9"
regex = '((?m:$)(?-u:\B)(?s-u:.)(?-u:\B)$)'
haystack = "\n\n"
unicode = true
utf8 = false
matches = [
[[1, 2], [1, 2]],
]
# From: https://github.com/rust-lang/regex/issues/429
[[test]]
name = "i429-10"
regex = '(?m:$)(?m:$)^(?su:.)'
haystack = "\n\u0081¨\u200a"
unicode = true
utf8 = false
matches = [[0, 1]]
# From: https://github.com/rust-lang/regex/issues/429
[[test]]
name = "i429-11"
regex = '(?-u:\B)(?m:^)'
haystack = "0\n"
unicode = true
utf8 = false
matches = [[2, 2]]
# From: https://github.com/rust-lang/regex/issues/429
[[test]]
name = "i429-12"
regex = '(?:(?u:\b)|(?-u:.))+'
haystack = "0"
unicode = true
utf8 = false
matches = [[0, 0], [1, 1]]
# From: https://github.com/rust-lang/regex/issues/969
[[test]]
name = "i969"
regex = 'c.*d\z'
haystack = "ababcd"
bounds = [4, 6]
search-kind = "earliest"
matches = [[4, 6]]
# I found this during the regex-automata migration. This is the fowler basic
# 154 test, but without anchored = true and without a match limit.
#
# This test caught a subtle bug in the hybrid reverse DFA search, where it
# would skip over the termination condition if it entered a start state. This
# was a double bug. Firstly, the reverse DFA shouldn't have had start states
# specialized in the first place, and thus it shouldn't have been possible to
# detect that the DFA had entered a start state. The second bug was that the
# start state handling was incorrect by jumping over the termination condition.
[[test]]
name = "fowler-basic154-unanchored"
regex = '''a([bc]*)c*'''
haystack = '''abc'''
matches = [[[0, 3], [1, 3]]]
# From: https://github.com/rust-lang/regex/issues/981
#
# This was never really a problem in the new architecture because the
# regex-automata engines are far more principled about how they deal with
# look-around. (This was one of the many reasons I wanted to re-work the
# original regex crate engines.)
[[test]]
name = "word-boundary-interact-poorly-with-literal-optimizations"
regex = '(?i:(?:\b|_)win(?:32|64|dows)?(?:\b|_))'
haystack = 'ubi-Darwin-x86_64.tar.gz'
matches = []
# This was found during fuzz testing of regex. It provoked a panic in the meta
# engine as a result of the reverse suffix optimization. Namely, it hit a case
# where a suffix match was found, a corresponding reverse match was found, but
# the forward search turned up no match. The forward search should always match
# if the suffix and reverse search match.
#
# This in turn uncovered an inconsistency between the PikeVM and the DFA (lazy
# and fully compiled) engines. It was caused by a mishandling of the collection
# of NFA state IDs in the generic determinization code (which is why both types
# of DFA were impacted). Namely, when a fail state was encountered (that's the
# `[^\s\S]` in the pattern below), then it would just stop collecting states.
# But that's not correct since a later state could lead to a match.
[[test]]
name = "impossible-branch"
regex = '.*[^\s\S]A|B'
haystack = "B"
matches = [[0, 1]]
# This was found during fuzz testing in regex-lite. The regex crate never
# suffered from this bug, but it causes regex-lite to incorrectly compile
# captures.
[[test]]
name = "captures-wrong-order"
regex = '(a){0}(a)'
haystack = 'a'
matches = [[[0, 1], [], [0, 1]]]
# This tests a bug in how quit states are handled in the DFA. At some point
# during development, the DFAs were tweaked slightly such that if they hit
# a quit state (which means, they hit a byte that the caller configured should
# stop the search), then it might not return an error necessarily. Namely, if a
# match had already been found, then it would be returned instead of an error.
#
# But this is actually wrong! Why? Because even though a match had been found,
# it wouldn't be fully correct to return it once a quit state has been seen
# because you can't determine whether the match offset returned is the correct
# greedy/leftmost-first match. Since you can't complete the search as requested
# by the caller, the DFA should just stop and return an error.
#
# Interestingly, this does seem to produce an unavoidable difference between
# 'try_is_match().unwrap()' and 'try_find().unwrap().is_some()' for the DFAs.
# The former will stop immediately once a match is known to occur and return
# 'Ok(true)', whereas the latter could find the match but quit with an
# 'Err(..)' first.
#
# Thankfully, I believe this inconsistency between 'is_match()' and 'find()'
# cannot be observed in the higher level meta regex API because it specifically
# will try another engine that won't fail in the case of a DFA failing.
#
# This regression happened in the regex crate rewrite, but before anything got
# released.
[[test]]
name = "negated-unicode-word-boundary-dfa-fail"
regex = '\B.*'
haystack = "!\u02D7"
matches = [[0, 3]]
# This failure was found in the *old* regex crate (prior to regex 1.9), but
# I didn't investigate why. My best guess is that it's a literal optimization
# bug. It didn't occur in the rewrite.
[[test]]
name = "missed-match"
regex = 'e..+e.ee>'
haystack = 'Zeee.eZZZZZZZZeee>eeeeeee>'
matches = [[1, 26]]
# This test came from the 'ignore' crate and tripped a bug in how accelerated
# DFA states were handled in an overlapping search.
[[test]]
name = "regex-to-glob"
regex = ['(?-u)^path1/[^/]*$']
haystack = "path1/foo"
matches = [[0, 9]]
utf8 = false
match-kind = "all"
search-kind = "overlapping"
# See: https://github.com/rust-lang/regex/issues/1060
[[test]]
name = "reverse-inner-plus-shorter-than-expected"
regex = '(?:(\d+)[:.])?(\d{1,2})[:.](\d{2})'
haystack = '102:12:39'
matches = [[[0, 9], [0, 3], [4, 6], [7, 9]]]
# Like reverse-inner-plus-shorter-than-expected, but using a far simpler regex
# to demonstrate the extent of the rot. Sigh.
#
# See: https://github.com/rust-lang/regex/issues/1060
[[test]]
name = "reverse-inner-short"
regex = '(?:([0-9][0-9][0-9]):)?([0-9][0-9]):([0-9][0-9])'
haystack = '102:12:39'
matches = [[[0, 9], [0, 3], [4, 6], [7, 9]]]
# This regression test was found via the RegexSet APIs. It triggered a
# particular code path where a regex was compiled with 'All' match semantics
# (to support overlapping search), but got funneled down into a standard
# leftmost search when calling 'is_match'. This is fine on its own, but the
# leftmost search will use a prefilter and that's where this went awry.
#
# Namely, since 'All' semantics were used, the aho-corasick prefilter was
# incorrectly compiled with 'Standard' semantics. This was wrong because
# 'Standard' immediately attempts to report a match at every position, even if
# that would mean reporting a match past the leftmost match before reporting
# the leftmost match. This breaks the prefilter contract of never having false
# negatives and leads overall to the engine not finding a match.
#
# See: https://github.com/rust-lang/regex/issues/1070
[[test]]
name = "prefilter-with-aho-corasick-standard-semantics"
regex = '(?m)^ *v [0-9]'
haystack = 'v 0'
matches = [
{ id = 0, spans = [[0, 3]] },
]
match-kind = "all"
search-kind = "overlapping"
unicode = true
utf8 = true
# This tests that the PikeVM and the meta regex agree on a particular regex.
# This test previously failed when the ad hoc engines inside the meta engine
# did not handle quit states correctly. Namely, the Unicode word boundary here
# combined with a non-ASCII codepoint provokes the quit state. The ad hoc
# engines were previously returning a match even after entering the quit state
# if a match had been previously detected, but this is incorrect. The reason
# is that if a quit state is found, then the search must give up *immediately*
# because it prevents the search from finding the "proper" leftmost-first
# match. If it instead returns a match that has been found, it risks reporting
# an improper match, as it did in this case.
#
# See: https://github.com/rust-lang/regex/issues/1046
[[test]]
name = "non-prefix-literal-quit-state"
regex = '.+\b\n'
haystack = "β77\n"
matches = [[0, 5]]
# This is a regression test for some errant HIR interval set operations that
# were made in the regex-syntax 0.8.0 release and then reverted in 0.8.1. The
# issue here is that the HIR produced from the regex had out-of-order ranges.
#
# See: https://github.com/rust-lang/regex/issues/1103
# Ref: https://github.com/rust-lang/regex/pull/1051
# Ref: https://github.com/rust-lang/regex/pull/1102
[[test]]
name = "hir-optimization-out-of-order-class"
regex = '^[[:alnum:]./-]+$'
haystack = "a-b"
matches = [[0, 3]]
# This is a regression test for an improper reverse suffix optimization. This
# occurred when I "broadened" the applicability of the optimization to include
# multiple possible literal suffixes instead of only sticking to a non-empty
# longest common suffix. It turns out that, at least given how the reverse
# suffix optimization works, we need to stick to the longest common suffix for
# now.
#
# See: https://github.com/rust-lang/regex/issues/1110
# See also: https://github.com/astral-sh/ruff/pull/7980
[[test]]
name = 'improper-reverse-suffix-optimization'
regex = '(\\N\{[^}]+})|([{}])'
haystack = 'hiya \N{snowman} bye'
matches = [[[5, 16], [5, 16], []]]

641
vendor/regex/testdata/set.toml vendored Normal file
View File

@@ -0,0 +1,641 @@
# Basic multi-regex tests.
[[test]]
name = "basic10"
regex = ["a", "a"]
haystack = "a"
matches = [
{ id = 0, span = [0, 1] },
{ id = 1, span = [0, 1] },
]
match-kind = "all"
search-kind = "overlapping"
[[test]]
name = "basic10-leftmost-first"
regex = ["a", "a"]
haystack = "a"
matches = [
{ id = 0, span = [0, 1] },
]
match-kind = "leftmost-first"
search-kind = "leftmost"
[[test]]
name = "basic20"
regex = ["a", "a"]
haystack = "ba"
matches = [
{ id = 0, span = [1, 2] },
{ id = 1, span = [1, 2] },
]
match-kind = "all"
search-kind = "overlapping"
[[test]]
name = "basic30"
regex = ["a", "b"]
haystack = "a"
matches = [
{ id = 0, span = [0, 1] },
]
match-kind = "all"
search-kind = "overlapping"
[[test]]
name = "basic40"
regex = ["a", "b"]
haystack = "b"
matches = [
{ id = 1, span = [0, 1] },
]
match-kind = "all"
search-kind = "overlapping"
[[test]]
name = "basic50"
regex = ["a|b", "b|a"]
haystack = "b"
matches = [
{ id = 0, span = [0, 1] },
{ id = 1, span = [0, 1] },
]
match-kind = "all"
search-kind = "overlapping"
[[test]]
name = "basic60"
regex = ["foo", "oo"]
haystack = "foo"
matches = [
{ id = 0, span = [0, 3] },
{ id = 1, span = [1, 3] },
]
match-kind = "all"
search-kind = "overlapping"
[[test]]
name = "basic60-leftmost-first"
regex = ["foo", "oo"]
haystack = "foo"
matches = [
{ id = 0, span = [0, 3] },
]
match-kind = "leftmost-first"
search-kind = "leftmost"
[[test]]
name = "basic61"
regex = ["oo", "foo"]
haystack = "foo"
matches = [
{ id = 1, span = [0, 3] },
{ id = 0, span = [1, 3] },
]
match-kind = "all"
search-kind = "overlapping"
[[test]]
name = "basic61-leftmost-first"
regex = ["oo", "foo"]
haystack = "foo"
matches = [
{ id = 1, span = [0, 3] },
]
match-kind = "leftmost-first"
search-kind = "leftmost"
[[test]]
name = "basic70"
regex = ["abcd", "bcd", "cd", "d"]
haystack = "abcd"
matches = [
{ id = 0, span = [0, 4] },
{ id = 1, span = [1, 4] },
{ id = 2, span = [2, 4] },
{ id = 3, span = [3, 4] },
]
match-kind = "all"
search-kind = "overlapping"
[[test]]
name = "basic71"
regex = ["bcd", "cd", "d", "abcd"]
haystack = "abcd"
matches = [
{ id = 3, span = [0, 4] },
]
match-kind = "leftmost-first"
search-kind = "leftmost"
[[test]]
name = "basic80"
regex = ["^foo", "bar$"]
haystack = "foo"
matches = [
{ id = 0, span = [0, 3] },
]
match-kind = "all"
search-kind = "overlapping"
[[test]]
name = "basic81"
regex = ["^foo", "bar$"]
haystack = "foo bar"
matches = [
{ id = 0, span = [0, 3] },
{ id = 1, span = [4, 7] },
]
match-kind = "all"
search-kind = "overlapping"
[[test]]
name = "basic82"
regex = ["^foo", "bar$"]
haystack = "bar"
matches = [
{ id = 1, span = [0, 3] },
]
match-kind = "all"
search-kind = "overlapping"
[[test]]
name = "basic90"
regex = ["[a-z]+$", "foo"]
haystack = "01234 foo"
matches = [
{ id = 0, span = [8, 9] },
{ id = 0, span = [7, 9] },
{ id = 0, span = [6, 9] },
{ id = 1, span = [6, 9] },
]
match-kind = "all"
search-kind = "overlapping"
[[test]]
name = "basic91"
regex = ["[a-z]+$", "foo"]
haystack = "foo 01234"
matches = [
{ id = 1, span = [0, 3] },
]
match-kind = "all"
search-kind = "overlapping"
[[test]]
name = "basic100"
regex = [".*?", "a"]
haystack = "zzza"
matches = [
{ id = 0, span = [0, 0] },
{ id = 0, span = [1, 1] },
{ id = 0, span = [0, 1] },
{ id = 0, span = [2, 2] },
{ id = 0, span = [1, 2] },
{ id = 0, span = [0, 2] },
{ id = 0, span = [3, 3] },
{ id = 0, span = [2, 3] },
{ id = 0, span = [1, 3] },
{ id = 0, span = [0, 3] },
{ id = 0, span = [4, 4] },
{ id = 0, span = [3, 4] },
{ id = 0, span = [2, 4] },
{ id = 0, span = [1, 4] },
{ id = 0, span = [0, 4] },
{ id = 1, span = [3, 4] },
]
match-kind = "all"
search-kind = "overlapping"
[[test]]
name = "basic101"
regex = [".*", "a"]
haystack = "zzza"
matches = [
{ id = 0, span = [0, 0] },
{ id = 0, span = [1, 1] },
{ id = 0, span = [0, 1] },
{ id = 0, span = [2, 2] },
{ id = 0, span = [1, 2] },
{ id = 0, span = [0, 2] },
{ id = 0, span = [3, 3] },
{ id = 0, span = [2, 3] },
{ id = 0, span = [1, 3] },
{ id = 0, span = [0, 3] },
{ id = 0, span = [4, 4] },
{ id = 0, span = [3, 4] },
{ id = 0, span = [2, 4] },
{ id = 0, span = [1, 4] },
{ id = 0, span = [0, 4] },
{ id = 1, span = [3, 4] },
]
match-kind = "all"
search-kind = "overlapping"
[[test]]
name = "basic102"
regex = [".*", "a"]
haystack = "zzz"
matches = [
{ id = 0, span = [0, 0] },
{ id = 0, span = [1, 1] },
{ id = 0, span = [0, 1] },
{ id = 0, span = [2, 2] },
{ id = 0, span = [1, 2] },
{ id = 0, span = [0, 2] },
{ id = 0, span = [3, 3] },
{ id = 0, span = [2, 3] },
{ id = 0, span = [1, 3] },
{ id = 0, span = [0, 3] },
]
match-kind = "all"
search-kind = "overlapping"
[[test]]
name = "basic110"
regex = ['\ba\b']
haystack = "hello a bye"
matches = [
{ id = 0, span = [6, 7] },
]
match-kind = "all"
search-kind = "overlapping"
[[test]]
name = "basic111"
regex = ['\ba\b', '\be\b']
haystack = "hello a bye e"
matches = [
{ id = 0, span = [6, 7] },
{ id = 1, span = [12, 13] },
]
match-kind = "all"
search-kind = "overlapping"
[[test]]
name = "basic120"
regex = ["a"]
haystack = "a"
matches = [
{ id = 0, span = [0, 1] },
]
match-kind = "all"
search-kind = "overlapping"
[[test]]
name = "basic121"
regex = [".*a"]
haystack = "a"
matches = [
{ id = 0, span = [0, 1] },
]
match-kind = "all"
search-kind = "overlapping"
[[test]]
name = "basic122"
regex = [".*a", "β"]
haystack = "β"
matches = [
{ id = 1, span = [0, 2] },
]
match-kind = "all"
search-kind = "overlapping"
[[test]]
name = "basic130"
regex = ["ab", "b"]
haystack = "ba"
matches = [
{ id = 1, span = [0, 1] },
]
match-kind = "all"
search-kind = "overlapping"
# These test cases where one of the regexes matches the empty string.
[[test]]
name = "empty10"
regex = ["", "a"]
haystack = "abc"
matches = [
{ id = 0, span = [0, 0] },
{ id = 1, span = [0, 1] },
{ id = 0, span = [1, 1] },
{ id = 0, span = [2, 2] },
{ id = 0, span = [3, 3] },
]
match-kind = "all"
search-kind = "overlapping"
[[test]]
name = "empty10-leftmost-first"
regex = ["", "a"]
haystack = "abc"
matches = [
{ id = 0, span = [0, 0] },
{ id = 0, span = [1, 1] },
{ id = 0, span = [2, 2] },
{ id = 0, span = [3, 3] },
]
match-kind = "leftmost-first"
search-kind = "leftmost"
[[test]]
name = "empty11"
regex = ["a", ""]
haystack = "abc"
matches = [
{ id = 1, span = [0, 0] },
{ id = 0, span = [0, 1] },
{ id = 1, span = [1, 1] },
{ id = 1, span = [2, 2] },
{ id = 1, span = [3, 3] },
]
match-kind = "all"
search-kind = "overlapping"
[[test]]
name = "empty11-leftmost-first"
regex = ["a", ""]
haystack = "abc"
matches = [
{ id = 0, span = [0, 1] },
{ id = 1, span = [2, 2] },
{ id = 1, span = [3, 3] },
]
match-kind = "leftmost-first"
search-kind = "leftmost"
[[test]]
name = "empty20"
regex = ["", "b"]
haystack = "abc"
matches = [
{ id = 0, span = [0, 0] },
{ id = 0, span = [1, 1] },
{ id = 1, span = [1, 2] },
{ id = 0, span = [2, 2] },
{ id = 0, span = [3, 3] },
]
match-kind = "all"
search-kind = "overlapping"
[[test]]
name = "empty20-leftmost-first"
regex = ["", "b"]
haystack = "abc"
matches = [
{ id = 0, span = [0, 0] },
{ id = 0, span = [1, 1] },
{ id = 0, span = [2, 2] },
{ id = 0, span = [3, 3] },
]
match-kind = "leftmost-first"
search-kind = "leftmost"
[[test]]
name = "empty21"
regex = ["b", ""]
haystack = "abc"
matches = [
{ id = 1, span = [0, 0] },
{ id = 1, span = [1, 1] },
{ id = 0, span = [1, 2] },
{ id = 1, span = [2, 2] },
{ id = 1, span = [3, 3] },
]
match-kind = "all"
search-kind = "overlapping"
[[test]]
name = "empty21-leftmost-first"
regex = ["b", ""]
haystack = "abc"
matches = [
{ id = 1, span = [0, 0] },
{ id = 0, span = [1, 2] },
{ id = 1, span = [3, 3] },
]
match-kind = "leftmost-first"
search-kind = "leftmost"
[[test]]
name = "empty22"
regex = ["(?:)", "b"]
haystack = "abc"
matches = [
{ id = 0, span = [0, 0] },
{ id = 0, span = [1, 1] },
{ id = 1, span = [1, 2] },
{ id = 0, span = [2, 2] },
{ id = 0, span = [3, 3] },
]
match-kind = "all"
search-kind = "overlapping"
[[test]]
name = "empty23"
regex = ["b", "(?:)"]
haystack = "abc"
matches = [
{ id = 1, span = [0, 0] },
{ id = 1, span = [1, 1] },
{ id = 0, span = [1, 2] },
{ id = 1, span = [2, 2] },
{ id = 1, span = [3, 3] },
]
match-kind = "all"
search-kind = "overlapping"
[[test]]
name = "empty30"
regex = ["", "z"]
haystack = "abc"
matches = [
{ id = 0, span = [0, 0] },
{ id = 0, span = [1, 1] },
{ id = 0, span = [2, 2] },
{ id = 0, span = [3, 3] },
]
match-kind = "all"
search-kind = "overlapping"
[[test]]
name = "empty30-leftmost-first"
regex = ["", "z"]
haystack = "abc"
matches = [
{ id = 0, span = [0, 0] },
{ id = 0, span = [1, 1] },
{ id = 0, span = [2, 2] },
{ id = 0, span = [3, 3] },
]
match-kind = "leftmost-first"
search-kind = "leftmost"
[[test]]
name = "empty31"
regex = ["z", ""]
haystack = "abc"
matches = [
{ id = 1, span = [0, 0] },
{ id = 1, span = [1, 1] },
{ id = 1, span = [2, 2] },
{ id = 1, span = [3, 3] },
]
match-kind = "all"
search-kind = "overlapping"
[[test]]
name = "empty31-leftmost-first"
regex = ["z", ""]
haystack = "abc"
matches = [
{ id = 1, span = [0, 0] },
{ id = 1, span = [1, 1] },
{ id = 1, span = [2, 2] },
{ id = 1, span = [3, 3] },
]
match-kind = "leftmost-first"
search-kind = "leftmost"
[[test]]
name = "empty40"
regex = ["c(?:)", "b"]
haystack = "abc"
matches = [
{ id = 1, span = [1, 2] },
{ id = 0, span = [2, 3] },
]
match-kind = "all"
search-kind = "overlapping"
[[test]]
name = "empty40-leftmost-first"
regex = ["c(?:)", "b"]
haystack = "abc"
matches = [
{ id = 1, span = [1, 2] },
{ id = 0, span = [2, 3] },
]
match-kind = "leftmost-first"
search-kind = "leftmost"
# These test cases where there are no matches.
[[test]]
name = "nomatch10"
regex = ["a", "a"]
haystack = "b"
matches = []
match-kind = "all"
search-kind = "overlapping"
[[test]]
name = "nomatch20"
regex = ["^foo", "bar$"]
haystack = "bar foo"
matches = []
match-kind = "all"
search-kind = "overlapping"
[[test]]
name = "nomatch30"
regex = []
haystack = "a"
matches = []
match-kind = "all"
search-kind = "overlapping"
[[test]]
name = "nomatch40"
regex = ["^rooted$", '\.log$']
haystack = "notrooted"
matches = []
match-kind = "all"
search-kind = "overlapping"
# These test multi-regex searches with capture groups.
#
# NOTE: I wrote these tests in the course of developing a first class API for
# overlapping capturing group matches, but ultimately removed that API because
# the semantics for overlapping matches aren't totally clear. However, I've
# left the tests because I believe the semantics for these patterns are clear
# and because we can still test our "which patterns matched" APIs with them.
[[test]]
name = "caps-010"
regex = ['^(\w+) (\w+)$', '^(\S+) (\S+)$']
haystack = "Bruce Springsteen"
matches = [
{ id = 0, spans = [[0, 17], [0, 5], [6, 17]] },
{ id = 1, spans = [[0, 17], [0, 5], [6, 17]] },
]
match-kind = "all"
search-kind = "overlapping"
unicode = false
utf8 = false
[[test]]
name = "caps-020"
regex = ['^(\w+) (\w+)$', '^[A-Z](\S+) [A-Z](\S+)$']
haystack = "Bruce Springsteen"
matches = [
{ id = 0, spans = [[0, 17], [0, 5], [6, 17]] },
{ id = 1, spans = [[0, 17], [1, 5], [7, 17]] },
]
match-kind = "all"
search-kind = "overlapping"
unicode = false
utf8 = false
[[test]]
name = "caps-030"
regex = ['^(\w+) (\w+)$', '^([A-Z])(\S+) ([A-Z])(\S+)$']
haystack = "Bruce Springsteen"
matches = [
{ id = 0, spans = [[0, 17], [0, 5], [6, 17]] },
{ id = 1, spans = [[0, 17], [0, 1], [1, 5], [6, 7], [7, 17]] },
]
match-kind = "all"
search-kind = "overlapping"
unicode = false
utf8 = false
[[test]]
name = "caps-110"
regex = ['(\w+) (\w+)', '(\S+) (\S+)']
haystack = "Bruce Springsteen"
matches = [
{ id = 0, spans = [[0, 17], [0, 5], [6, 17]] },
]
match-kind = "leftmost-first"
search-kind = "leftmost"
unicode = false
utf8 = false
[[test]]
name = "caps-120"
regex = ['(\w+) (\w+)', '(\S+) (\S+)']
haystack = "&ruce $pringsteen"
matches = [
{ id = 1, spans = [[0, 17], [0, 5], [6, 17]] },
]
match-kind = "leftmost-first"
search-kind = "leftmost"
unicode = false
utf8 = false
[[test]]
name = "caps-121"
regex = ['(\w+) (\w+)', '(\S+) (\S+)']
haystack = "&ruce $pringsteen Foo Bar"
matches = [
{ id = 1, spans = [[0, 17], [0, 5], [6, 17]] },
{ id = 0, spans = [[18, 25], [18, 21], [22, 25]] },
]
match-kind = "leftmost-first"
search-kind = "leftmost"
unicode = false
utf8 = false

36
vendor/regex/testdata/substring.toml vendored Normal file
View File

@@ -0,0 +1,36 @@
# These tests check that regex engines perform as expected when the search is
# instructed to only search a substring of a haystack instead of the entire
# haystack. This tends to exercise interesting edge cases that are otherwise
# difficult to provoke. (But not necessarily impossible. Regex search iterators
# for example, make use of the "search just a substring" APIs by changing the
# starting position of a search to the end position of the previous match.)
[[test]]
name = "unicode-word-start"
regex = '\b[0-9]+\b'
haystack = "β123"
bounds = { start = 2, end = 5 }
matches = []
[[test]]
name = "unicode-word-end"
regex = '\b[0-9]+\b'
haystack = "123β"
bounds = { start = 0, end = 3 }
matches = []
[[test]]
name = "ascii-word-start"
regex = '\b[0-9]+\b'
haystack = "β123"
bounds = { start = 2, end = 5 }
matches = [[2, 5]]
unicode = false
[[test]]
name = "ascii-word-end"
regex = '\b[0-9]+\b'
haystack = "123β"
bounds = { start = 0, end = 3 }
matches = [[0, 3]]
unicode = false

517
vendor/regex/testdata/unicode.toml vendored Normal file
View File

@@ -0,0 +1,517 @@
# Basic Unicode literal support.
[[test]]
name = "literal1"
regex = '☃'
haystack = "☃"
matches = [[0, 3]]
[[test]]
name = "literal2"
regex = '☃+'
haystack = "☃"
matches = [[0, 3]]
[[test]]
name = "literal3"
regex = '☃+'
haystack = "☃"
matches = [[0, 3]]
case-insensitive = true
[[test]]
name = "literal4"
regex = 'Δ'
haystack = "δ"
matches = [[0, 2]]
case-insensitive = true
# Unicode word boundaries.
[[test]]
name = "wb-100"
regex = '\d\b'
haystack = "6δ"
matches = []
[[test]]
name = "wb-200"
regex = '\d\b'
haystack = "6"
matches = [[0, 1]]
[[test]]
name = "wb-300"
regex = '\d\B'
haystack = "6δ"
matches = [[0, 1]]
[[test]]
name = "wb-400"
regex = '\d\B'
haystack = "6"
matches = []
# Unicode character class support.
[[test]]
name = "class1"
regex = '[☃Ⅰ]+'
haystack = "☃"
matches = [[0, 3]]
[[test]]
name = "class2"
regex = '\pN'
haystack = "Ⅰ"
matches = [[0, 3]]
[[test]]
name = "class3"
regex = '\pN+'
haystack = "Ⅰ1Ⅱ2"
matches = [[0, 8]]
[[test]]
name = "class4"
regex = '\PN+'
haystack = "ab"
matches = [[0, 2]]
[[test]]
name = "class5"
regex = '[\PN]+'
haystack = "ab"
matches = [[0, 2]]
[[test]]
name = "class6"
regex = '[^\PN]+'
haystack = "abⅠ"
matches = [[2, 5]]
[[test]]
name = "class7"
regex = '\p{Lu}+'
haystack = "ΛΘΓΔα"
matches = [[0, 8]]
[[test]]
name = "class8"
regex = '\p{Lu}+'
haystack = "ΛΘΓΔα"
matches = [[0, 10]]
case-insensitive = true
[[test]]
name = "class9"
regex = '\pL+'
haystack = "ΛΘΓΔα"
matches = [[0, 10]]
[[test]]
name = "class10"
regex = '\p{Ll}+'
haystack = "ΛΘΓΔα"
matches = [[8, 10]]
# Unicode aware "Perl" character classes.
[[test]]
name = "perl1"
regex = '\w+'
haystack = "dδd"
matches = [[0, 4]]
[[test]]
name = "perl2"
regex = '\w+'
haystack = "⥡"
matches = []
[[test]]
name = "perl3"
regex = '\W+'
haystack = "⥡"
matches = [[0, 3]]
[[test]]
name = "perl4"
regex = '\d+'
haystack = "1२३9"
matches = [[0, 8]]
[[test]]
name = "perl5"
regex = '\d+'
haystack = "Ⅱ"
matches = []
[[test]]
name = "perl6"
regex = '\D+'
haystack = "Ⅱ"
matches = [[0, 3]]
[[test]]
name = "perl7"
regex = '\s+'
haystack = "\u1680"
matches = [[0, 3]]
[[test]]
name = "perl8"
regex = '\s+'
haystack = "☃"
matches = []
[[test]]
name = "perl9"
regex = '\S+'
haystack = "☃"
matches = [[0, 3]]
# Specific tests for Unicode general category classes.
[[test]]
name = "class-gencat1"
regex = '\p{Cased_Letter}'
haystack = "\uFF21"
matches = [[0, 3]]
[[test]]
name = "class-gencat2"
regex = '\p{Close_Punctuation}'
haystack = "\u276F"
matches = [[0, 3]]
[[test]]
name = "class-gencat3"
regex = '\p{Connector_Punctuation}'
haystack = "⁀"
matches = [[0, 3]]
[[test]]
name = "class-gencat4"
regex = '\p{Control}'
haystack = "\u009F"
matches = [[0, 2]]
[[test]]
name = "class-gencat5"
regex = '\p{Currency_Symbol}'
haystack = "\uFFE1"
matches = [[0, 3]]
[[test]]
name = "class-gencat6"
regex = '\p{Dash_Punctuation}'
haystack = "〰"
matches = [[0, 3]]
[[test]]
name = "class-gencat7"
regex = '\p{Decimal_Number}'
haystack = "𑓙"
matches = [[0, 4]]
[[test]]
name = "class-gencat8"
regex = '\p{Enclosing_Mark}'
haystack = "\uA672"
matches = [[0, 3]]
[[test]]
name = "class-gencat9"
regex = '\p{Final_Punctuation}'
haystack = "⸡"
matches = [[0, 3]]
[[test]]
name = "class-gencat10"
regex = '\p{Format}'
haystack = "\U000E007F"
matches = [[0, 4]]
[[test]]
name = "class-gencat11"
regex = '\p{Initial_Punctuation}'
haystack = "⸜"
matches = [[0, 3]]
[[test]]
name = "class-gencat12"
regex = '\p{Letter}'
haystack = "Έ"
matches = [[0, 2]]
[[test]]
name = "class-gencat13"
regex = '\p{Letter_Number}'
haystack = "ↂ"
matches = [[0, 3]]
[[test]]
name = "class-gencat14"
regex = '\p{Line_Separator}'
haystack = "\u2028"
matches = [[0, 3]]
[[test]]
name = "class-gencat15"
regex = '\p{Lowercase_Letter}'
haystack = "ϛ"
matches = [[0, 2]]
[[test]]
name = "class-gencat16"
regex = '\p{Mark}'
haystack = "\U000E01EF"
matches = [[0, 4]]
[[test]]
name = "class-gencat17"
regex = '\p{Math}'
haystack = "\u22FF"
matches = [[0, 3]]
[[test]]
name = "class-gencat18"
regex = '\p{Modifier_Letter}'
haystack = "𖭃"
matches = [[0, 4]]
[[test]]
name = "class-gencat19"
regex = '\p{Modifier_Symbol}'
haystack = "🏿"
matches = [[0, 4]]
[[test]]
name = "class-gencat20"
regex = '\p{Nonspacing_Mark}'
haystack = "\U0001E94A"
matches = [[0, 4]]
[[test]]
name = "class-gencat21"
regex = '\p{Number}'
haystack = "⓿"
matches = [[0, 3]]
[[test]]
name = "class-gencat22"
regex = '\p{Open_Punctuation}'
haystack = "⦅"
matches = [[0, 3]]
[[test]]
name = "class-gencat23"
regex = '\p{Other}'
haystack = "\u0BC9"
matches = [[0, 3]]
[[test]]
name = "class-gencat24"
regex = '\p{Other_Letter}'
haystack = "ꓷ"
matches = [[0, 3]]
[[test]]
name = "class-gencat25"
regex = '\p{Other_Number}'
haystack = "㉏"
matches = [[0, 3]]
[[test]]
name = "class-gencat26"
regex = '\p{Other_Punctuation}'
haystack = "𞥞"
matches = [[0, 4]]
[[test]]
name = "class-gencat27"
regex = '\p{Other_Symbol}'
haystack = "⅌"
matches = [[0, 3]]
[[test]]
name = "class-gencat28"
regex = '\p{Paragraph_Separator}'
haystack = "\u2029"
matches = [[0, 3]]
[[test]]
name = "class-gencat29"
regex = '\p{Private_Use}'
haystack = "\U0010FFFD"
matches = [[0, 4]]
[[test]]
name = "class-gencat30"
regex = '\p{Punctuation}'
haystack = "𑁍"
matches = [[0, 4]]
[[test]]
name = "class-gencat31"
regex = '\p{Separator}'
haystack = "\u3000"
matches = [[0, 3]]
[[test]]
name = "class-gencat32"
regex = '\p{Space_Separator}'
haystack = "\u205F"
matches = [[0, 3]]
[[test]]
name = "class-gencat33"
regex = '\p{Spacing_Mark}'
haystack = "\U00016F7E"
matches = [[0, 4]]
[[test]]
name = "class-gencat34"
regex = '\p{Symbol}'
haystack = "⯈"
matches = [[0, 3]]
[[test]]
name = "class-gencat35"
regex = '\p{Titlecase_Letter}'
haystack = "ῼ"
matches = [[0, 3]]
[[test]]
name = "class-gencat36"
regex = '\p{Unassigned}'
haystack = "\U0010FFFF"
matches = [[0, 4]]
[[test]]
name = "class-gencat37"
regex = '\p{Uppercase_Letter}'
haystack = "Ꝋ"
matches = [[0, 3]]
# Tests for Unicode emoji properties.
[[test]]
name = "class-emoji1"
regex = '\p{Emoji}'
haystack = "\u23E9"
matches = [[0, 3]]
[[test]]
name = "class-emoji2"
regex = '\p{emoji}'
haystack = "\U0001F21A"
matches = [[0, 4]]
[[test]]
name = "class-emoji3"
regex = '\p{extendedpictographic}'
haystack = "\U0001FA6E"
matches = [[0, 4]]
[[test]]
name = "class-emoji4"
regex = '\p{extendedpictographic}'
haystack = "\U0001FFFD"
matches = [[0, 4]]
# Tests for Unicode grapheme cluster properties.
[[test]]
name = "class-gcb1"
regex = '\p{grapheme_cluster_break=prepend}'
haystack = "\U00011D46"
matches = [[0, 4]]
[[test]]
name = "class-gcb2"
regex = '\p{gcb=regional_indicator}'
haystack = "\U0001F1E6"
matches = [[0, 4]]
[[test]]
name = "class-gcb3"
regex = '\p{gcb=ri}'
haystack = "\U0001F1E7"
matches = [[0, 4]]
[[test]]
name = "class-gcb4"
regex = '\p{regionalindicator}'
haystack = "\U0001F1FF"
matches = [[0, 4]]
[[test]]
name = "class-gcb5"
regex = '\p{gcb=lvt}'
haystack = "\uC989"
matches = [[0, 3]]
[[test]]
name = "class-gcb6"
regex = '\p{gcb=zwj}'
haystack = "\u200D"
matches = [[0, 3]]
# Tests for Unicode word boundary properties.
[[test]]
name = "class-word-break1"
regex = '\p{word_break=Hebrew_Letter}'
haystack = "\uFB46"
matches = [[0, 3]]
[[test]]
name = "class-word-break2"
regex = '\p{wb=hebrewletter}'
haystack = "\uFB46"
matches = [[0, 3]]
[[test]]
name = "class-word-break3"
regex = '\p{wb=ExtendNumLet}'
haystack = "\uFF3F"
matches = [[0, 3]]
[[test]]
name = "class-word-break4"
regex = '\p{wb=WSegSpace}'
haystack = "\u3000"
matches = [[0, 3]]
[[test]]
name = "class-word-break5"
regex = '\p{wb=numeric}'
haystack = "\U0001E950"
matches = [[0, 4]]
# Tests for Unicode sentence boundary properties.
[[test]]
name = "class-sentence-break1"
regex = '\p{sentence_break=Lower}'
haystack = "\u0469"
matches = [[0, 2]]
[[test]]
name = "class-sentence-break2"
regex = '\p{sb=lower}'
haystack = "\u0469"
matches = [[0, 2]]
[[test]]
name = "class-sentence-break3"
regex = '\p{sb=Close}'
haystack = "\uFF60"
matches = [[0, 3]]
[[test]]
name = "class-sentence-break4"
regex = '\p{sb=Close}'
haystack = "\U0001F677"
matches = [[0, 4]]
[[test]]
name = "class-sentence-break5"
regex = '\p{sb=SContinue}'
haystack = "\uFF64"
matches = [[0, 3]]

399
vendor/regex/testdata/utf8.toml vendored Normal file
View File

@@ -0,0 +1,399 @@
# These test the UTF-8 modes exposed by regex-automata. Namely, when utf8 is
# true, then we promise that the haystack is valid UTF-8. (Otherwise behavior
# is unspecified.) This also corresponds to building the regex engine with the
# following two guarantees:
#
# 1) For any non-empty match reported, its span is guaranteed to correspond to
# valid UTF-8.
# 2) All empty or zero-width matches reported must never split a UTF-8
# encoded codepoint. If the haystack has invalid UTF-8, then this results in
# unspecified behavior.
#
# The (2) is in particular what we focus our testing on since (1) is generally
# guaranteed by regex-syntax's AST-to-HIR translator and is well tested there.
# The thing with (2) is that it can't be described in the HIR, so the regex
# engines have to handle that case. Thus, we test it here.
#
# Note that it is possible to build a regex that has property (1) but not
# (2), and vice versa. This is done by building the HIR with 'utf8=true' but
# building the Thompson NFA with 'utf8=false'. We don't test that here because
# the harness doesn't expose a way to enable or disable UTF-8 mode with that
# granularity. Instead, those combinations are lightly tested via doc examples.
# That's not to say that (1) without (2) is uncommon. Indeed, ripgrep uses it
# because it cannot guarantee that its haystack is valid UTF-8.
# This tests that an empty regex doesn't split a codepoint.
[[test]]
name = "empty-utf8yes"
regex = ''
haystack = '☃'
matches = [[0, 0], [3, 3]]
unicode = true
utf8 = true
# Tests the overlapping case of the above.
[[test]]
name = "empty-utf8yes-overlapping"
regex = ''
haystack = '☃'
matches = [[0, 0], [3, 3]]
unicode = true
utf8 = true
match-kind = "all"
search-kind = "overlapping"
# This tests that an empty regex DOES split a codepoint when utf8=false.
[[test]]
name = "empty-utf8no"
regex = ''
haystack = '☃'
matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
unicode = true
utf8 = false
# Tests the overlapping case of the above.
[[test]]
name = "empty-utf8no-overlapping"
regex = ''
haystack = '☃'
matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
unicode = true
utf8 = false
match-kind = "all"
search-kind = "overlapping"
# This tests that an empty regex doesn't split a codepoint, even if we give
# it bounds entirely within the codepoint.
#
# This is one of the trickier cases and is what motivated the current UTF-8
# mode design. In particular, at one point, this test failed the 'is_match'
# variant of the test but not 'find'. This is because the 'is_match' code path
# is specifically optimized for "was a match found" rather than "where is the
# match." In the former case, you don't really care about the empty-vs-non-empty
# matches, and thus, the codepoint splitting filtering logic wasn't getting
# applied. (In multiple ways across multiple regex engines.) In this way, you
# can wind up with a situation where 'is_match' says "yes," but 'find' says,
# "I didn't find anything." Which is... not great.
#
# I could have decided to say that providing boundaries that themselves split
# a codepoint would have unspecified behavior. But I couldn't quite convince
# myself that such boundaries were the only way to get an inconsistency between
# 'is_match' and 'find'.
#
# Note that I also tried to come up with a test like this that fails without
# using `bounds`. Specifically, a test where 'is_match' and 'find' disagree.
# But I couldn't do it, and I'm tempted to conclude it is impossible. The
# fundamental problem is that you need to simultaneously produce an empty match
# that splits a codepoint while *not* matching before or after the codepoint.
[[test]]
name = "empty-utf8yes-bounds"
regex = ''
haystack = '𝛃'
bounds = [1, 3]
matches = []
unicode = true
utf8 = true
# Tests the overlapping case of the above.
[[test]]
name = "empty-utf8yes-bounds-overlapping"
regex = ''
haystack = '𝛃'
bounds = [1, 3]
matches = []
unicode = true
utf8 = true
match-kind = "all"
search-kind = "overlapping"
# This tests that an empty regex splits a codepoint when the bounds are
# entirely within the codepoint.
[[test]]
name = "empty-utf8no-bounds"
regex = ''
haystack = '𝛃'
bounds = [1, 3]
matches = [[1, 1], [2, 2], [3, 3]]
unicode = true
utf8 = false
# Tests the overlapping case of the above.
[[test]]
name = "empty-utf8no-bounds-overlapping"
regex = ''
haystack = '𝛃'
bounds = [1, 3]
matches = [[1, 1], [2, 2], [3, 3]]
unicode = true
utf8 = false
match-kind = "all"
search-kind = "overlapping"
# In this test, we anchor the search. Since the start position is also a UTF-8
# boundary, we get a match.
[[test]]
name = "empty-utf8yes-anchored"
regex = ''
haystack = '𝛃'
matches = [[0, 0]]
anchored = true
unicode = true
utf8 = true
# Tests the overlapping case of the above.
[[test]]
name = "empty-utf8yes-anchored-overlapping"
regex = ''
haystack = '𝛃'
matches = [[0, 0]]
anchored = true
unicode = true
utf8 = true
match-kind = "all"
search-kind = "overlapping"
# Same as above, except with UTF-8 mode disabled. It almost doesn't change the
# result, except for the fact that since this is an anchored search and we
# always find all matches, the test harness will keep reporting matches until
# none are found. Because it's anchored, matches will be reported so long as
# they are directly adjacent. With UTF-8 mode enabled, the next anchored search
# after the match at [0, 0] fails, so iteration stops (and doesn't find the
# last match at [4, 4]).
[[test]]
name = "empty-utf8no-anchored"
regex = ''
haystack = '𝛃'
matches = [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4]]
anchored = true
unicode = true
utf8 = false
# Tests the overlapping case of the above.
#
# Note that overlapping anchored searches are a little weird, and it's not
# totally clear what their semantics ought to be. For now, we just test the
# current behavior of our test shim that implements overlapping search. (This
# is one of the reasons why we don't really expose regex-level overlapping
# searches.)
[[test]]
name = "empty-utf8no-anchored-overlapping"
regex = ''
haystack = '𝛃'
matches = [[0, 0]]
anchored = true
unicode = true
utf8 = false
match-kind = "all"
search-kind = "overlapping"
# In this test, we anchor the search, but also set bounds. The bounds start the
# search in the middle of a codepoint, so there should never be a match.
[[test]]
name = "empty-utf8yes-anchored-bounds"
regex = ''
haystack = '𝛃'
matches = []
bounds = [1, 3]
anchored = true
unicode = true
utf8 = true
# Tests the overlapping case of the above.
[[test]]
name = "empty-utf8yes-anchored-bounds-overlapping"
regex = ''
haystack = '𝛃'
matches = []
bounds = [1, 3]
anchored = true
unicode = true
utf8 = true
match-kind = "all"
search-kind = "overlapping"
# Same as above, except with UTF-8 mode disabled. Without UTF-8 mode enabled,
# matching within a codepoint is allowed. And remember, as in the anchored test
# above with UTF-8 mode disabled, iteration will report all adjacent matches.
# The matches at [0, 0] and [4, 4] are not included because of the bounds of
# the search.
[[test]]
name = "empty-utf8no-anchored-bounds"
regex = ''
haystack = '𝛃'
bounds = [1, 3]
matches = [[1, 1], [2, 2], [3, 3]]
anchored = true
unicode = true
utf8 = false
# Tests the overlapping case of the above.
#
# Note that overlapping anchored searches are a little weird, and it's not
# totally clear what their semantics ought to be. For now, we just test the
# current behavior of our test shim that implements overlapping search. (This
# is one of the reasons why we don't really expose regex-level overlapping
# searches.)
[[test]]
name = "empty-utf8no-anchored-bounds-overlapping"
regex = ''
haystack = '𝛃'
bounds = [1, 3]
matches = [[1, 1]]
anchored = true
unicode = true
utf8 = false
match-kind = "all"
search-kind = "overlapping"
# This tests that we find the match at the end of the string when the bounds
# exclude the first match.
[[test]]
name = "empty-utf8yes-startbound"
regex = ''
haystack = '𝛃'
bounds = [1, 4]
matches = [[4, 4]]
unicode = true
utf8 = true
# Tests the overlapping case of the above.
[[test]]
name = "empty-utf8yes-startbound-overlapping"
regex = ''
haystack = '𝛃'
bounds = [1, 4]
matches = [[4, 4]]
unicode = true
utf8 = true
match-kind = "all"
search-kind = "overlapping"
# Same as above, except since UTF-8 mode is disabled, we also find the matches
# in between that split the codepoint.
[[test]]
name = "empty-utf8no-startbound"
regex = ''
haystack = '𝛃'
bounds = [1, 4]
matches = [[1, 1], [2, 2], [3, 3], [4, 4]]
unicode = true
utf8 = false
# Tests the overlapping case of the above.
[[test]]
name = "empty-utf8no-startbound-overlapping"
regex = ''
haystack = '𝛃'
bounds = [1, 4]
matches = [[1, 1], [2, 2], [3, 3], [4, 4]]
unicode = true
utf8 = false
match-kind = "all"
search-kind = "overlapping"
# This tests that we don't find any matches in an anchored search, even when
# the bounds include a match (at the end).
[[test]]
name = "empty-utf8yes-anchored-startbound"
regex = ''
haystack = '𝛃'
bounds = [1, 4]
matches = []
anchored = true
unicode = true
utf8 = true
# Tests the overlapping case of the above.
[[test]]
name = "empty-utf8yes-anchored-startbound-overlapping"
regex = ''
haystack = '𝛃'
bounds = [1, 4]
matches = []
anchored = true
unicode = true
utf8 = true
match-kind = "all"
search-kind = "overlapping"
# Same as above, except since UTF-8 mode is disabled, we also find the matches
# in between that split the codepoint. Even though this is an anchored search,
# since the matches are adjacent, we find all of them.
[[test]]
name = "empty-utf8no-anchored-startbound"
regex = ''
haystack = '𝛃'
bounds = [1, 4]
matches = [[1, 1], [2, 2], [3, 3], [4, 4]]
anchored = true
unicode = true
utf8 = false
# Tests the overlapping case of the above.
#
# Note that overlapping anchored searches are a little weird, and it's not
# totally clear what their semantics ought to be. For now, we just test the
# current behavior of our test shim that implements overlapping search. (This
# is one of the reasons why we don't really expose regex-level overlapping
# searches.)
[[test]]
name = "empty-utf8no-anchored-startbound-overlapping"
regex = ''
haystack = '𝛃'
bounds = [1, 4]
matches = [[1, 1]]
anchored = true
unicode = true
utf8 = false
match-kind = "all"
search-kind = "overlapping"
# This tests that we find the match at the end of the haystack in UTF-8 mode
# when our bounds only include the empty string at the end of the haystack.
[[test]]
name = "empty-utf8yes-anchored-endbound"
regex = ''
haystack = '𝛃'
bounds = [4, 4]
matches = [[4, 4]]
anchored = true
unicode = true
utf8 = true
# Tests the overlapping case of the above.
[[test]]
name = "empty-utf8yes-anchored-endbound-overlapping"
regex = ''
haystack = '𝛃'
bounds = [4, 4]
matches = [[4, 4]]
anchored = true
unicode = true
utf8 = true
match-kind = "all"
search-kind = "overlapping"
# Same as above, but with UTF-8 mode disabled. Results remain the same since
# the only possible match does not split a codepoint.
[[test]]
name = "empty-utf8no-anchored-endbound"
regex = ''
haystack = '𝛃'
bounds = [4, 4]
matches = [[4, 4]]
anchored = true
unicode = true
utf8 = false
# Tests the overlapping case of the above.
[[test]]
name = "empty-utf8no-anchored-endbound-overlapping"
regex = ''
haystack = '𝛃'
bounds = [4, 4]
matches = [[4, 4]]
anchored = true
unicode = true
utf8 = false
match-kind = "all"
search-kind = "overlapping"

View File

@@ -0,0 +1,687 @@
# These tests are for the "special" word boundary assertions. That is,
# \b{start}, \b{end}, \b{start-half}, \b{end-half}. These are specialty
# assertions for more niche use cases, but hitting those cases without these
# assertions is difficult. For example, \b{start-half} and \b{end-half} are
# used to implement the -w/--word-regexp flag in a grep program.
# Tests for (?-u:\b{start})
[[test]]
name = "word-start-ascii-010"
regex = '\b{start}'
haystack = "a"
matches = [[0, 0]]
unicode = false
[[test]]
name = "word-start-ascii-020"
regex = '\b{start}'
haystack = "a "
matches = [[0, 0]]
unicode = false
[[test]]
name = "word-start-ascii-030"
regex = '\b{start}'
haystack = " a "
matches = [[1, 1]]
unicode = false
[[test]]
name = "word-start-ascii-040"
regex = '\b{start}'
haystack = ""
matches = []
unicode = false
[[test]]
name = "word-start-ascii-050"
regex = '\b{start}'
haystack = "ab"
matches = [[0, 0]]
unicode = false
[[test]]
name = "word-start-ascii-060"
regex = '\b{start}'
haystack = "𝛃"
matches = []
unicode = false
[[test]]
name = "word-start-ascii-060-bounds"
regex = '\b{start}'
haystack = "𝛃"
bounds = [2, 3]
matches = []
unicode = false
[[test]]
name = "word-start-ascii-070"
regex = '\b{start}'
haystack = " 𝛃 "
matches = []
unicode = false
[[test]]
name = "word-start-ascii-080"
regex = '\b{start}'
haystack = "𝛃𐆀"
matches = []
unicode = false
[[test]]
name = "word-start-ascii-090"
regex = '\b{start}'
haystack = "𝛃b"
matches = [[4, 4]]
unicode = false
[[test]]
name = "word-start-ascii-110"
regex = '\b{start}'
haystack = "b𝛃"
matches = [[0, 0]]
unicode = false
# Tests for (?-u:\b{end})
[[test]]
name = "word-end-ascii-010"
regex = '\b{end}'
haystack = "a"
matches = [[1, 1]]
unicode = false
[[test]]
name = "word-end-ascii-020"
regex = '\b{end}'
haystack = "a "
matches = [[1, 1]]
unicode = false
[[test]]
name = "word-end-ascii-030"
regex = '\b{end}'
haystack = " a "
matches = [[2, 2]]
unicode = false
[[test]]
name = "word-end-ascii-040"
regex = '\b{end}'
haystack = ""
matches = []
unicode = false
[[test]]
name = "word-end-ascii-050"
regex = '\b{end}'
haystack = "ab"
matches = [[2, 2]]
unicode = false
[[test]]
name = "word-end-ascii-060"
regex = '\b{end}'
haystack = "𝛃"
matches = []
unicode = false
[[test]]
name = "word-end-ascii-060-bounds"
regex = '\b{end}'
haystack = "𝛃"
bounds = [2, 3]
matches = []
unicode = false
[[test]]
name = "word-end-ascii-070"
regex = '\b{end}'
haystack = " 𝛃 "
matches = []
unicode = false
[[test]]
name = "word-end-ascii-080"
regex = '\b{end}'
haystack = "𝛃𐆀"
matches = []
unicode = false
[[test]]
name = "word-end-ascii-090"
regex = '\b{end}'
haystack = "𝛃b"
matches = [[5, 5]]
unicode = false
[[test]]
name = "word-end-ascii-110"
regex = '\b{end}'
haystack = "b𝛃"
matches = [[1, 1]]
unicode = false
# Tests for \b{start}
[[test]]
name = "word-start-unicode-010"
regex = '\b{start}'
haystack = "a"
matches = [[0, 0]]
unicode = true
[[test]]
name = "word-start-unicode-020"
regex = '\b{start}'
haystack = "a "
matches = [[0, 0]]
unicode = true
[[test]]
name = "word-start-unicode-030"
regex = '\b{start}'
haystack = " a "
matches = [[1, 1]]
unicode = true
[[test]]
name = "word-start-unicode-040"
regex = '\b{start}'
haystack = ""
matches = []
unicode = true
[[test]]
name = "word-start-unicode-050"
regex = '\b{start}'
haystack = "ab"
matches = [[0, 0]]
unicode = true
[[test]]
name = "word-start-unicode-060"
regex = '\b{start}'
haystack = "𝛃"
matches = [[0, 0]]
unicode = true
[[test]]
name = "word-start-unicode-060-bounds"
regex = '\b{start}'
haystack = "𝛃"
bounds = [2, 3]
matches = []
unicode = true
[[test]]
name = "word-start-unicode-070"
regex = '\b{start}'
haystack = " 𝛃 "
matches = [[1, 1]]
unicode = true
[[test]]
name = "word-start-unicode-080"
regex = '\b{start}'
haystack = "𝛃𐆀"
matches = [[0, 0]]
unicode = true
[[test]]
name = "word-start-unicode-090"
regex = '\b{start}'
haystack = "𝛃b"
matches = [[0, 0]]
unicode = true
[[test]]
name = "word-start-unicode-110"
regex = '\b{start}'
haystack = "b𝛃"
matches = [[0, 0]]
unicode = true
# Tests for \b{end}
[[test]]
name = "word-end-unicode-010"
regex = '\b{end}'
haystack = "a"
matches = [[1, 1]]
unicode = true
[[test]]
name = "word-end-unicode-020"
regex = '\b{end}'
haystack = "a "
matches = [[1, 1]]
unicode = true
[[test]]
name = "word-end-unicode-030"
regex = '\b{end}'
haystack = " a "
matches = [[2, 2]]
unicode = true
[[test]]
name = "word-end-unicode-040"
regex = '\b{end}'
haystack = ""
matches = []
unicode = true
[[test]]
name = "word-end-unicode-050"
regex = '\b{end}'
haystack = "ab"
matches = [[2, 2]]
unicode = true
[[test]]
name = "word-end-unicode-060"
regex = '\b{end}'
haystack = "𝛃"
matches = [[4, 4]]
unicode = true
[[test]]
name = "word-end-unicode-060-bounds"
regex = '\b{end}'
haystack = "𝛃"
bounds = [2, 3]
matches = []
unicode = true
[[test]]
name = "word-end-unicode-070"
regex = '\b{end}'
haystack = " 𝛃 "
matches = [[5, 5]]
unicode = true
[[test]]
name = "word-end-unicode-080"
regex = '\b{end}'
haystack = "𝛃𐆀"
matches = [[4, 4]]
unicode = true
[[test]]
name = "word-end-unicode-090"
regex = '\b{end}'
haystack = "𝛃b"
matches = [[5, 5]]
unicode = true
[[test]]
name = "word-end-unicode-110"
regex = '\b{end}'
haystack = "b𝛃"
matches = [[5, 5]]
unicode = true
# Tests for (?-u:\b{start-half})
[[test]]
name = "word-start-half-ascii-010"
regex = '\b{start-half}'
haystack = "a"
matches = [[0, 0]]
unicode = false
[[test]]
name = "word-start-half-ascii-020"
regex = '\b{start-half}'
haystack = "a "
matches = [[0, 0], [2, 2]]
unicode = false
[[test]]
name = "word-start-half-ascii-030"
regex = '\b{start-half}'
haystack = " a "
matches = [[0, 0], [1, 1], [3, 3]]
unicode = false
[[test]]
name = "word-start-half-ascii-040"
regex = '\b{start-half}'
haystack = ""
matches = [[0, 0]]
unicode = false
[[test]]
name = "word-start-half-ascii-050"
regex = '\b{start-half}'
haystack = "ab"
matches = [[0, 0]]
unicode = false
[[test]]
name = "word-start-half-ascii-060"
regex = '\b{start-half}'
haystack = "𝛃"
matches = [[0, 0], [4, 4]]
unicode = false
[[test]]
name = "word-start-half-ascii-060-noutf8"
regex = '\b{start-half}'
haystack = "𝛃"
matches = [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4]]
unicode = false
utf8 = false
[[test]]
name = "word-start-half-ascii-060-bounds"
regex = '\b{start-half}'
haystack = "𝛃"
bounds = [2, 3]
matches = []
unicode = false
[[test]]
name = "word-start-half-ascii-070"
regex = '\b{start-half}'
haystack = " 𝛃 "
matches = [[0, 0], [1, 1], [5, 5], [6, 6]]
unicode = false
[[test]]
name = "word-start-half-ascii-080"
regex = '\b{start-half}'
haystack = "𝛃𐆀"
matches = [[0, 0], [4, 4], [8, 8]]
unicode = false
[[test]]
name = "word-start-half-ascii-090"
regex = '\b{start-half}'
haystack = "𝛃b"
matches = [[0, 0], [4, 4]]
unicode = false
[[test]]
name = "word-start-half-ascii-110"
regex = '\b{start-half}'
haystack = "b𝛃"
matches = [[0, 0], [5, 5]]
unicode = false
# Tests for (?-u:\b{end-half})
[[test]]
name = "word-end-half-ascii-010"
regex = '\b{end-half}'
haystack = "a"
matches = [[1, 1]]
unicode = false
[[test]]
name = "word-end-half-ascii-020"
regex = '\b{end-half}'
haystack = "a "
matches = [[1, 1], [2, 2]]
unicode = false
[[test]]
name = "word-end-half-ascii-030"
regex = '\b{end-half}'
haystack = " a "
matches = [[0, 0], [2, 2], [3, 3]]
unicode = false
[[test]]
name = "word-end-half-ascii-040"
regex = '\b{end-half}'
haystack = ""
matches = [[0, 0]]
unicode = false
[[test]]
name = "word-end-half-ascii-050"
regex = '\b{end-half}'
haystack = "ab"
matches = [[2, 2]]
unicode = false
[[test]]
name = "word-end-half-ascii-060"
regex = '\b{end-half}'
haystack = "𝛃"
matches = [[0, 0], [4, 4]]
unicode = false
[[test]]
name = "word-end-half-ascii-060-bounds"
regex = '\b{end-half}'
haystack = "𝛃"
bounds = [2, 3]
matches = []
unicode = false
[[test]]
name = "word-end-half-ascii-070"
regex = '\b{end-half}'
haystack = " 𝛃 "
matches = [[0, 0], [1, 1], [5, 5], [6, 6]]
unicode = false
[[test]]
name = "word-end-half-ascii-080"
regex = '\b{end-half}'
haystack = "𝛃𐆀"
matches = [[0, 0], [4, 4], [8, 8]]
unicode = false
[[test]]
name = "word-end-half-ascii-090"
regex = '\b{end-half}'
haystack = "𝛃b"
matches = [[0, 0], [5, 5]]
unicode = false
[[test]]
name = "word-end-half-ascii-110"
regex = '\b{end-half}'
haystack = "b𝛃"
matches = [[1, 1], [5, 5]]
unicode = false
# Tests for \b{start-half}
[[test]]
name = "word-start-half-unicode-010"
regex = '\b{start-half}'
haystack = "a"
matches = [[0, 0]]
unicode = true
[[test]]
name = "word-start-half-unicode-020"
regex = '\b{start-half}'
haystack = "a "
matches = [[0, 0], [2, 2]]
unicode = true
[[test]]
name = "word-start-half-unicode-030"
regex = '\b{start-half}'
haystack = " a "
matches = [[0, 0], [1, 1], [3, 3]]
unicode = true
[[test]]
name = "word-start-half-unicode-040"
regex = '\b{start-half}'
haystack = ""
matches = [[0, 0]]
unicode = true
[[test]]
name = "word-start-half-unicode-050"
regex = '\b{start-half}'
haystack = "ab"
matches = [[0, 0]]
unicode = true
[[test]]
name = "word-start-half-unicode-060"
regex = '\b{start-half}'
haystack = "𝛃"
matches = [[0, 0]]
unicode = true
[[test]]
name = "word-start-half-unicode-060-bounds"
regex = '\b{start-half}'
haystack = "𝛃"
bounds = [2, 3]
matches = []
unicode = true
[[test]]
name = "word-start-half-unicode-070"
regex = '\b{start-half}'
haystack = " 𝛃 "
matches = [[0, 0], [1, 1], [6, 6]]
unicode = true
[[test]]
name = "word-start-half-unicode-080"
regex = '\b{start-half}'
haystack = "𝛃𐆀"
matches = [[0, 0], [8, 8]]
unicode = true
[[test]]
name = "word-start-half-unicode-090"
regex = '\b{start-half}'
haystack = "𝛃b"
matches = [[0, 0]]
unicode = true
[[test]]
name = "word-start-half-unicode-110"
regex = '\b{start-half}'
haystack = "b𝛃"
matches = [[0, 0]]
unicode = true
# Tests for \b{end-half}
[[test]]
name = "word-end-half-unicode-010"
regex = '\b{end-half}'
haystack = "a"
matches = [[1, 1]]
unicode = true
[[test]]
name = "word-end-half-unicode-020"
regex = '\b{end-half}'
haystack = "a "
matches = [[1, 1], [2, 2]]
unicode = true
[[test]]
name = "word-end-half-unicode-030"
regex = '\b{end-half}'
haystack = " a "
matches = [[0, 0], [2, 2], [3, 3]]
unicode = true
[[test]]
name = "word-end-half-unicode-040"
regex = '\b{end-half}'
haystack = ""
matches = [[0, 0]]
unicode = true
[[test]]
name = "word-end-half-unicode-050"
regex = '\b{end-half}'
haystack = "ab"
matches = [[2, 2]]
unicode = true
[[test]]
name = "word-end-half-unicode-060"
regex = '\b{end-half}'
haystack = "𝛃"
matches = [[4, 4]]
unicode = true
[[test]]
name = "word-end-half-unicode-060-bounds"
regex = '\b{end-half}'
haystack = "𝛃"
bounds = [2, 3]
matches = []
unicode = true
[[test]]
name = "word-end-half-unicode-070"
regex = '\b{end-half}'
haystack = " 𝛃 "
matches = [[0, 0], [5, 5], [6, 6]]
unicode = true
[[test]]
name = "word-end-half-unicode-080"
regex = '\b{end-half}'
haystack = "𝛃𐆀"
matches = [[4, 4], [8, 8]]
unicode = true
[[test]]
name = "word-end-half-unicode-090"
regex = '\b{end-half}'
haystack = "𝛃b"
matches = [[5, 5]]
unicode = true
[[test]]
name = "word-end-half-unicode-110"
regex = '\b{end-half}'
haystack = "b𝛃"
matches = [[5, 5]]
unicode = true
# Specialty tests.
# Since \r is special cased in the start state computation (to deal with CRLF
# mode), this test ensures that the correct start state is computed when the
# pattern starts with a half word boundary assertion.
[[test]]
name = "word-start-half-ascii-carriage"
regex = '\b{start-half}[a-z]+'
haystack = 'ABC\rabc'
matches = [[4, 7]]
bounds = [4, 7]
unescape = true
# Since \n is also special cased in the start state computation, this test
# ensures that the correct start state is computed when the pattern starts with
# a half word boundary assertion.
[[test]]
name = "word-start-half-ascii-linefeed"
regex = '\b{start-half}[a-z]+'
haystack = 'ABC\nabc'
matches = [[4, 7]]
bounds = [4, 7]
unescape = true
# Like the carriage return test above, but with a custom line terminator.
[[test]]
name = "word-start-half-ascii-customlineterm"
regex = '\b{start-half}[a-z]+'
haystack = 'ABC!abc'
matches = [[4, 7]]
bounds = [4, 7]
unescape = true
line-terminator = '!'

781
vendor/regex/testdata/word-boundary.toml vendored Normal file
View File

@@ -0,0 +1,781 @@
# Some of these are cribbed from RE2's test suite.
# These test \b. Below are tests for \B.
# '\b' on its own and combined with '^' and '$'. These tests set
# 'unicode = false', i.e. they exercise the ASCII word boundary. An empty
# haystack contains no word character, so it has no word boundary at all (wb1).
[[test]]
name = "wb1"
regex = '\b'
haystack = ""
matches = []
unicode = false
[[test]]
name = "wb2"
regex = '\b'
haystack = "a"
matches = [[0, 0], [1, 1]]
unicode = false
# Offset 1 lies between two word bytes, so only the two ends are boundaries.
[[test]]
name = "wb3"
regex = '\b'
haystack = "ab"
matches = [[0, 0], [2, 2]]
unicode = false
[[test]]
name = "wb4"
regex = '^\b'
haystack = "ab"
matches = [[0, 0]]
unicode = false
[[test]]
name = "wb5"
regex = '\b$'
haystack = "ab"
matches = [[2, 2]]
unicode = false
# No offset in "ab" is simultaneously the start and the end of the haystack.
[[test]]
name = "wb6"
regex = '^\b$'
haystack = "ab"
matches = []
unicode = false
# '\b' next to a literal. A match is only expected where the asserted side of
# the literal touches a non-word byte or an edge of the haystack: the "bar"
# inside "nobar" is skipped (wb7), as is the 'a' that is followed by 'o' (wb8).
[[test]]
name = "wb7"
regex = '\bbar\b'
haystack = "nobar bar foo bar"
matches = [[6, 9], [14, 17]]
unicode = false
[[test]]
name = "wb8"
regex = 'a\b'
haystack = "faoa x"
matches = [[3, 4]]
unicode = false
[[test]]
name = "wb9"
regex = '\bbar'
haystack = "bar x"
matches = [[0, 3]]
unicode = false
# The haystack is a basic string, so "\n" is a real line feed at offset 3; it
# is a non-word byte and therefore provides the boundary in front of "bar".
[[test]]
name = "wb10"
regex = '\bbar'
haystack = "foo\nbar x"
matches = [[4, 7]]
unicode = false
[[test]]
name = "wb11"
regex = 'bar\b'
haystack = "foobar"
matches = [[3, 6]]
unicode = false
[[test]]
name = "wb12"
regex = 'bar\b'
haystack = "foobar\nxxx"
matches = [[3, 6]]
unicode = false
# '\b' on one or both sides of an alternation.
[[test]]
name = "wb13"
regex = '(?:foo|bar|[A-Z])\b'
haystack = "foo"
matches = [[0, 3]]
unicode = false
[[test]]
name = "wb14"
regex = '(?:foo|bar|[A-Z])\b'
haystack = "foo\n"
matches = [[0, 3]]
unicode = false
[[test]]
name = "wb15"
regex = '\b(?:foo|bar|[A-Z])'
haystack = "foo"
matches = [[0, 3]]
unicode = false
[[test]]
name = "wb16"
regex = '\b(?:foo|bar|[A-Z])\b'
haystack = "X"
matches = [[0, 1]]
unicode = false
# Neither 'X' nor 'Y' has a boundary on both sides, so nothing matches.
[[test]]
name = "wb17"
regex = '\b(?:foo|bar|[A-Z])\b'
haystack = "XY"
matches = []
unicode = false
[[test]]
name = "wb18"
regex = '\b(?:foo|bar|[A-Z])\b'
haystack = "bar"
matches = [[0, 3]]
unicode = false
[[test]]
name = "wb19"
regex = '\b(?:foo|bar|[A-Z])\b'
haystack = "foo"
matches = [[0, 3]]
unicode = false
[[test]]
name = "wb20"
regex = '\b(?:foo|bar|[A-Z])\b'
haystack = "foo\n"
matches = [[0, 3]]
unicode = false
# "foo" and "bar" are embedded in "ffoo" and "bbar" and lack a leading
# boundary; only the standalone 'N' at offset 10 qualifies.
[[test]]
name = "wb21"
regex = '\b(?:foo|bar|[A-Z])\b'
haystack = "ffoo bbar N x"
matches = [[10, 11]]
unicode = false
[[test]]
name = "wb22"
regex = '\b(?:fo|foo)\b'
haystack = "fo"
matches = [[0, 2]]
unicode = false
# 'fo' is listed first, but the trailing '\b' cannot hold between 'o' and 'o',
# so the longer alternative "foo" is the expected match.
[[test]]
name = "wb23"
regex = '\b(?:fo|foo)\b'
haystack = "foo"
matches = [[0, 3]]
unicode = false
# Asserting '\b' twice in a row is expected to behave like asserting it once:
# wb24 and wb25 have the same expectations as wb1 and wb2.
[[test]]
name = "wb24"
regex = '\b\b'
haystack = ""
matches = []
unicode = false
[[test]]
name = "wb25"
regex = '\b\b'
haystack = "a"
matches = [[0, 0], [1, 1]]
unicode = false
# '\b$' only matches at the end of a haystack that ends in a word byte; the
# empty haystack has no boundary at its end.
[[test]]
name = "wb26"
regex = '\b$'
haystack = ""
matches = []
unicode = false
[[test]]
name = "wb27"
regex = '\b$'
haystack = "x"
matches = [[1, 1]]
unicode = false
[[test]]
name = "wb28"
regex = '\b$'
haystack = "y x"
matches = [[3, 3]]
unicode = false
# Anchored variants. The entries that spell the assertion as '(?-u:\b)' carry
# no 'unicode = false' key: the inline flag already makes the boundary itself
# ASCII-only, while the rest of the pattern (the '.') is left under the
# harness default. NOTE(review): presumably this keeps '.' matching whole
# codepoints -- confirm against the test harness.
[[test]]
name = "wb29"
regex = '(?-u:\b).$'
haystack = "x"
matches = [[0, 1]]
[[test]]
name = "wb30"
regex = '^\b(?:fo|foo)\b'
haystack = "fo"
matches = [[0, 2]]
unicode = false
[[test]]
name = "wb31"
regex = '^\b(?:fo|foo)\b'
haystack = "foo"
matches = [[0, 3]]
unicode = false
[[test]]
name = "wb32"
regex = '^\b$'
haystack = ""
matches = []
unicode = false
# Offset 0 is not the end of "x" and offset 1 is not its start, so '^\b$'
# cannot match anywhere.
[[test]]
name = "wb33"
regex = '^\b$'
haystack = "x"
matches = []
unicode = false
[[test]]
name = "wb34"
regex = '^(?-u:\b).$'
haystack = "x"
matches = [[0, 1]]
[[test]]
name = "wb35"
regex = '^(?-u:\b).(?-u:\b)$'
haystack = "x"
matches = [[0, 1]]
# wb36 through wb39 stack several '^', '$' and '\b' in a row; the expectations
# are the same as for the single-assertion forms in wb32 through wb35.
[[test]]
name = "wb36"
regex = '^^^^^\b$$$$$'
haystack = ""
matches = []
unicode = false
[[test]]
name = "wb37"
regex = '^^^^^(?-u:\b).$$$$$'
haystack = "x"
matches = [[0, 1]]
[[test]]
name = "wb38"
regex = '^^^^^\b$$$$$'
haystack = "x"
matches = []
unicode = false
[[test]]
name = "wb39"
regex = '^^^^^(?-u:\b\b\b).(?-u:\b\b\b)$$$$$'
haystack = "x"
matches = [[0, 1]]
# The '$' bytes on both sides are non-word bytes, so the outermost boundaries
# are at offsets 2 and 5 and the match is exactly "abc".
[[test]]
name = "wb40"
regex = '(?-u:\b).+(?-u:\b)'
haystack = "$$abc$$"
matches = [[2, 5]]
# Word and non-word bytes alternate throughout "a b c", so every offset from
# 0 through 5 is a boundary.
[[test]]
name = "wb41"
regex = '\b'
haystack = "a b c"
matches = [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4], [5, 5]]
unicode = false
[[test]]
name = "wb42"
regex = '\bfoo\b'
haystack = "zzz foo zzz"
matches = [[4, 7]]
unicode = false
# wb43 and wb44 write the anchor on the other side of '\b'; the expectations
# are the same as for '^\b' (wb4) and '\b$' (wb5).
[[test]]
name = "wb43"
regex = '\b^'
haystack = "ab"
matches = [[0, 0]]
unicode = false
[[test]]
name = "wb44"
regex = '$\b'
haystack = "ab"
matches = [[2, 2]]
unicode = false
# Tests for \B. Note that the ASCII \B is not allowed if UTF-8 mode is enabled,
# so these tests set 'utf8 = false' alongside 'unicode = false'. This is
# because the ASCII \B can match at non-UTF-8 boundaries, i.e. between the
# bytes of a single encoded codepoint.
#
# Only the "foo" embedded in "xfoox" has a word byte on both sides.
[[test]]
name = "nb1"
regex = '\Bfoo\B'
haystack = "n foo xfoox that"
matches = [[7, 10]]
unicode = false
utf8 = false
# Same haystack as wb8, but here the 'a' that is followed by 'o' is the one
# that matches.
[[test]]
name = "nb2"
regex = 'a\B'
haystack = "faoa x"
matches = [[1, 2]]
unicode = false
utf8 = false
# nb3 through nb6 reuse the haystacks of wb9 through wb12 with '\B' in place
# of '\b'; every "bar" there sits on a boundary, so no match is expected.
[[test]]
name = "nb3"
regex = '\Bbar'
haystack = "bar x"
matches = []
unicode = false
utf8 = false
[[test]]
name = "nb4"
regex = '\Bbar'
haystack = "foo\nbar x"
matches = []
unicode = false
utf8 = false
[[test]]
name = "nb5"
regex = 'bar\B'
haystack = "foobar"
matches = []
unicode = false
utf8 = false
[[test]]
name = "nb6"
regex = 'bar\B'
haystack = "foobar\nxxx"
matches = []
unicode = false
utf8 = false
# The trailing 'x' keeps offset 3 from being a boundary (nb7); a trailing line
# feed does not (nb8).
[[test]]
name = "nb7"
regex = '(?:foo|bar|[A-Z])\B'
haystack = "foox"
matches = [[0, 3]]
unicode = false
utf8 = false
[[test]]
name = "nb8"
regex = '(?:foo|bar|[A-Z])\B'
haystack = "foo\n"
matches = []
unicode = false
utf8 = false
# '\B' holds at offset 0 of an empty haystack (nb9), but at neither end of a
# haystack consisting of a single word byte (nb10).
[[test]]
name = "nb9"
regex = '\B'
haystack = ""
matches = [[0, 0]]
unicode = false
utf8 = false
[[test]]
name = "nb10"
regex = '\B'
haystack = "x"
matches = []
unicode = false
utf8 = false
# '\B' around an alternation: a match needs a word byte directly before and
# directly after the matched text.
[[test]]
name = "nb11"
regex = '\B(?:foo|bar|[A-Z])'
haystack = "foo"
matches = []
unicode = false
utf8 = false
[[test]]
name = "nb12"
regex = '\B(?:foo|bar|[A-Z])\B'
haystack = "xXy"
matches = [[1, 2]]
unicode = false
utf8 = false
# 'X' starts at the beginning of the haystack and 'Y' ends at its end, so each
# has a boundary on one side.
[[test]]
name = "nb13"
regex = '\B(?:foo|bar|[A-Z])\B'
haystack = "XY"
matches = []
unicode = false
utf8 = false
[[test]]
name = "nb14"
regex = '\B(?:foo|bar|[A-Z])\B'
haystack = "XYZ"
matches = [[1, 2]]
unicode = false
utf8 = false
[[test]]
name = "nb15"
regex = '\B(?:foo|bar|[A-Z])\B'
haystack = "abara"
matches = [[1, 4]]
unicode = false
utf8 = false
# '_' counts as a word byte, so offset 4 is not a boundary (nb16); a line feed
# in the same position is a non-word byte and prevents the match (nb17).
[[test]]
name = "nb16"
regex = '\B(?:foo|bar|[A-Z])\B'
haystack = "xfoo_"
matches = [[1, 4]]
unicode = false
utf8 = false
[[test]]
name = "nb17"
regex = '\B(?:foo|bar|[A-Z])\B'
haystack = "xfoo\n"
matches = []
unicode = false
utf8 = false
# Only 'N' in "vNX" has a word byte on both sides; "foo", "bar" and the final
# 'X' each touch a boundary.
[[test]]
name = "nb18"
regex = '\B(?:foo|bar|[A-Z])\B'
haystack = "foo bar vNX"
matches = [[9, 10]]
unicode = false
utf8 = false
# In nb19 the shorter alternative 'fo' is accepted because '\B' holds between
# the two 'o' bytes. In nb20 'foo' is listed first and is accepted for the same
# reason, one byte further along.
[[test]]
name = "nb19"
regex = '\B(?:fo|foo)\B'
haystack = "xfoo"
matches = [[1, 3]]
unicode = false
utf8 = false
[[test]]
name = "nb20"
regex = '\B(?:foo|fo)\B'
haystack = "xfooo"
matches = [[1, 4]]
unicode = false
utf8 = false
# Asserting '\B' twice gives the same expectations as asserting it once (nb9
# and nb10).
[[test]]
name = "nb21"
regex = '\B\B'
haystack = ""
matches = [[0, 0]]
unicode = false
utf8 = false
[[test]]
name = "nb22"
regex = '\B\B'
haystack = "x"
matches = []
unicode = false
utf8 = false
# '\B$' matches at the end of the empty haystack; a haystack that ends in a
# word byte has a boundary at its end instead.
[[test]]
name = "nb23"
regex = '\B$'
haystack = ""
matches = [[0, 0]]
unicode = false
utf8 = false
[[test]]
name = "nb24"
regex = '\B$'
haystack = "x"
matches = []
unicode = false
utf8 = false
[[test]]
name = "nb25"
regex = '\B$'
haystack = "y x"
matches = []
unicode = false
utf8 = false
# The only byte that '.' could consume starts at offset 0, which is a boundary.
[[test]]
name = "nb26"
regex = '\B.$'
haystack = "x"
matches = []
unicode = false
utf8 = false
# Offset 0 is directly followed by the word byte 'f', so it is a boundary and
# '^\B' cannot hold.
[[test]]
name = "nb27"
regex = '^\B(?:fo|foo)\B'
haystack = "fo"
matches = []
unicode = false
utf8 = false
# Companion to nb27 with the longer haystack, mirroring the "fo"/"foo" pair
# used by wb30 and wb31. Offset 0 is still directly followed by the word byte
# 'f', so '^\B' cannot hold and no match is expected.
[[test]]
name = "nb28"
regex = '^\B(?:fo|foo)\B'
haystack = "foo"
matches = []
unicode = false
utf8 = false
# '\B' combined with anchors. In the empty haystack offset 0 is both the start
# and the end and is not a boundary, so every "" case below expects [0, 0]. In
# "x" both offsets are boundaries, so every "x" case expects no match.
[[test]]
name = "nb29"
regex = '^\B'
haystack = ""
matches = [[0, 0]]
unicode = false
utf8 = false
[[test]]
name = "nb30"
regex = '^\B'
haystack = "x"
matches = []
unicode = false
utf8 = false
[[test]]
name = "nb31"
regex = '^\B\B'
haystack = ""
matches = [[0, 0]]
unicode = false
utf8 = false
[[test]]
name = "nb32"
regex = '^\B\B'
haystack = "x"
matches = []
unicode = false
utf8 = false
[[test]]
name = "nb33"
regex = '^\B$'
haystack = ""
matches = [[0, 0]]
unicode = false
utf8 = false
[[test]]
name = "nb34"
regex = '^\B$'
haystack = "x"
matches = []
unicode = false
utf8 = false
[[test]]
name = "nb35"
regex = '^\B.$'
haystack = "x"
matches = []
unicode = false
utf8 = false
[[test]]
name = "nb36"
regex = '^\B.\B$'
haystack = "x"
matches = []
unicode = false
utf8 = false
# nb37 through nb39 stack several '^' and '$' in a row; the expectations are
# the same as for the single-anchor forms above.
[[test]]
name = "nb37"
regex = '^^^^^\B$$$$$'
haystack = ""
matches = [[0, 0]]
unicode = false
utf8 = false
[[test]]
name = "nb38"
regex = '^^^^^\B.$$$$$'
haystack = "x"
matches = []
unicode = false
utf8 = false
[[test]]
name = "nb39"
regex = '^^^^^\B$$$$$'
haystack = "x"
matches = []
unicode = false
utf8 = false
# unicode1* and unicode2* expect the same result for both Unicode and ASCII
# word boundaries because all matches are reported as byte offsets, and « and
# » are not word characters at either the character or byte level. (« takes
# two bytes in UTF-8, which is why 'x' in "«x" is found at [2, 3].)
[[test]]
name = "unicode1"
regex = '\bx\b'
haystack = "«x"
matches = [[2, 3]]
[[test]]
name = "unicode1-only-ascii"
regex = '\bx\b'
haystack = "«x"
matches = [[2, 3]]
unicode = false
[[test]]
name = "unicode2"
regex = '\bx\b'
haystack = "x»"
matches = [[0, 1]]
[[test]]
name = "unicode2-only-ascii"
regex = '\bx\b'
haystack = "x»"
matches = [[0, 1]]
unicode = false
# ASCII word boundaries are completely oblivious to Unicode characters, so
# even though β is a word character in Unicode, an ASCII \b reports a word
# boundary between it and an adjacent ASCII word character. (The ASCII \b only
# looks at the leading byte of β.) For Unicode \b, the expected results are
# precisely inverted. In 'áxβ', á takes two bytes, so 'x' is at [2, 3].
[[test]]
name = "unicode3"
regex = '\bx\b'
haystack = 'áxβ'
matches = []
[[test]]
name = "unicode3-only-ascii"
regex = '\bx\b'
haystack = 'áxβ'
matches = [[2, 3]]
unicode = false
# The Unicode \B needs no 'utf8 = false'; only the ASCII variant below sets it.
[[test]]
name = "unicode4"
regex = '\Bx\B'
haystack = 'áxβ'
matches = [[2, 3]]
[[test]]
name = "unicode4-only-ascii"
regex = '\Bx\B'
haystack = 'áxβ'
matches = []
unicode = false
utf8 = false
# The same haystacks as the unicode5-not* tests that follow, but with \b
# instead of \B as a sanity check. In every mode the only expected boundaries
# are on either side of the leading '0'.
[[test]]
name = "unicode5"
regex = '\b'
haystack = "0\U0007EF5E"
matches = [[0, 0], [1, 1]]
[[test]]
name = "unicode5-only-ascii"
regex = '\b'
haystack = "0\U0007EF5E"
matches = [[0, 0], [1, 1]]
unicode = false
utf8 = false
# The noutf8 variants follow the '0' with four 0xFF bytes instead of an encoded
# codepoint. They are written as '\xFF' in a literal string and presumably
# decoded by the harness because of 'unescape = true' (TODO confirm).
[[test]]
name = "unicode5-noutf8"
regex = '\b'
haystack = '0\xFF\xFF\xFF\xFF'
matches = [[0, 0], [1, 1]]
unescape = true
utf8 = false
[[test]]
name = "unicode5-noutf8-only-ascii"
regex = '\b'
haystack = '0\xFF\xFF\xFF\xFF'
matches = [[0, 0], [1, 1]]
unescape = true
unicode = false
utf8 = false
# Weird special case to ensure that ASCII \B treats each individual code unit
# as a non-word byte. (The specific codepoint is irrelevant. It's an arbitrary
# codepoint that uses 4 bytes in its UTF-8 encoding and is not a member of the
# \w character class.)
#
# With the Unicode \B only offset 5, the end of the haystack, is expected to
# match. The ASCII variant additionally matches at offsets 2, 3 and 4, which
# fall inside the encoding of the codepoint.
[[test]]
name = "unicode5-not"
regex = '\B'
haystack = "0\U0007EF5E"
matches = [[5, 5]]
[[test]]
name = "unicode5-not-only-ascii"
regex = '\B'
haystack = "0\U0007EF5E"
matches = [[2, 2], [3, 3], [4, 4], [5, 5]]
unicode = false
utf8 = false
# This gets no matches since \B only matches in the presence of valid UTF-8
# when Unicode is enabled, even when UTF-8 mode is disabled.
[[test]]
name = "unicode5-not-noutf8"
regex = '\B'
haystack = '0\xFF\xFF\xFF\xFF'
matches = []
unescape = true
utf8 = false
# But this DOES get matches since \B in ASCII mode only looks at individual
# bytes.
[[test]]
name = "unicode5-not-noutf8-only-ascii"
regex = '\B'
haystack = '0\xFF\xFF\xFF\xFF'
matches = [[2, 2], [3, 3], [4, 4], [5, 5]]
unescape = true
unicode = false
utf8 = false
# Some tests of no particular significance. In unicode7 and unicode8 the
# digits in "a456" and "456a" are glued to a letter, so that run lacks a
# boundary on one side and is skipped.
[[test]]
name = "unicode6"
regex = '\b[0-9]+\b'
haystack = "foo 123 bar 456 quux 789"
matches = [[4, 7], [12, 15], [21, 24]]
[[test]]
name = "unicode7"
regex = '\b[0-9]+\b'
haystack = "foo 123 bar a456 quux 789"
matches = [[4, 7], [22, 25]]
[[test]]
name = "unicode8"
regex = '\b[0-9]+\b'
haystack = "foo 123 bar 456a quux 789"
matches = [[4, 7], [22, 25]]
# A variant of the problem described here:
# https://github.com/google/re2/blob/89567f5de5b23bb5ad0c26cbafc10bdc7389d1fa/re2/dfa.cc#L658-L667
#
# The anchored search is confined to bytes 1..2, so it starts between 'z' and
# '%'. The expected result is the empty match at offset 1 (where '\b', the
# first alternative, holds) rather than a match that consumes the '%'.
[[test]]
name = "alt-with-assertion-repetition"
regex = '(?:\b|%)+'
haystack = "z%"
bounds = [1, 2]
anchored = true
matches = [[1, 1]]