22 * Provides precicates for reasoning about bad tag filter vulnerabilities.
33 */
44
5- import performance.ReDoSUtil
5+ import performance.RegexpMatching
66
77/**
8- * A module for determining if a regexp matches a given string,
9- * and reasoning about which capture groups are filled by a given string .
8+ * Holds if the regexp `root` should be tested against `str`.
9+ * Implements the `isRegexpMatchingCandidateSig` signature from `RegexpMatching` .
1010 */
11- private module RegexpMatching {
12- /**
13- * A class to test whether a regular expression matches a string.
14- * Override this class and extend `test`/`testWithGroups` to configure which strings should be tested for acceptance by this regular expression.
15- * The result can afterwards be read from the `matches` predicate.
16- *
17- * Strings in the `testWithGroups` predicate are also tested for which capture groups are filled by the given string.
18- * The result is available in the `fillCaptureGroup` predicate.
19- */
20- abstract class MatchedRegExp extends RegExpTerm {
21- MatchedRegExp ( ) { this .isRootTerm ( ) }
22-
23- /**
24- * Holds if it should be tested whether this regular expression matches `str`.
25- *
26- * If `ignorePrefix` is true, then a regexp without a start anchor will be treated as if it had a start anchor.
27- * E.g. a regular expression `/foo$/` will match any string that ends with "foo",
28- * but if `ignorePrefix` is true, it will only match "foo".
29- */
30- predicate test ( string str , boolean ignorePrefix ) {
31- none ( ) // maybe overridden in subclasses
32- }
33-
34- /**
35- * Same as `test(..)`, but where the `fillsCaptureGroup` afterwards tells which capture groups were filled by the given string.
36- */
37- predicate testWithGroups ( string str , boolean ignorePrefix ) {
38- none ( ) // maybe overridden in subclasses
39- }
40-
41- /**
42- * Holds if this RegExp matches `str`, where `str` is either in the `test` or `testWithGroups` predicate.
43- */
44- final predicate matches ( string str ) {
45- exists ( State state | state = getAState ( this , str .length ( ) - 1 , str , _) |
46- epsilonSucc * ( state ) = Accept ( _)
47- )
48- }
49-
50- /**
51- * Holds if matching `str` may fill capture group number `g`.
52- * Only holds if `str` is in the `testWithGroups` predicate.
53- */
54- final predicate fillsCaptureGroup ( string str , int g ) {
55- exists ( State s |
56- s = getAStateThatReachesAccept ( this , _, str , _) and
57- g = group ( s .getRepr ( ) )
58- )
59- }
60- }
61-
62- /**
63- * Gets a state the regular expression `reg` can be in after matching the `i`th char in `str`.
64- * The regular expression is modeled as a non-determistic finite automaton,
65- * the regular expression can therefore be in multiple states after matching a character.
66- *
67- * It's a forward search to all possible states, and there is thus no guarantee that the state is on a path to an accepting state.
68- */
69- private State getAState ( MatchedRegExp reg , int i , string str , boolean ignorePrefix ) {
70- // start state, the -1 position before any chars have been matched
71- i = - 1 and
72- (
73- reg .test ( str , ignorePrefix )
74- or
75- reg .testWithGroups ( str , ignorePrefix )
76- ) and
77- result .getRepr ( ) .getRootTerm ( ) = reg and
78- isStartState ( result )
79- or
80- // recursive case
81- result = getAStateAfterMatching ( reg , _, str , i , _, ignorePrefix )
82- }
83-
84- /**
85- * Gets the next state after the `prev` state from `reg`.
86- * `prev` is the state after matching `fromIndex` chars in `str`,
87- * and the result is the state after matching `toIndex` chars in `str`.
88- *
89- * This predicate is used as a step relation in the forwards search (`getAState`),
90- * and also as a step relation in the later backwards search (`getAStateThatReachesAccept`).
91- */
92- private State getAStateAfterMatching (
93- MatchedRegExp reg , State prev , string str , int toIndex , int fromIndex , boolean ignorePrefix
94- ) {
95- // the basic recursive case - outlined into a noopt helper to make performance work out.
96- result = getAStateAfterMatchingAux ( reg , prev , str , toIndex , fromIndex , ignorePrefix )
97- or
98- // we can skip past word boundaries if the next char is a non-word char.
99- fromIndex = toIndex and
100- prev .getRepr ( ) instanceof RegExpWordBoundary and
101- prev = getAState ( reg , toIndex , str , ignorePrefix ) and
102- after ( prev .getRepr ( ) ) = result and
103- str .charAt ( toIndex + 1 ) .regexpMatch ( "\\W" ) // \W matches any non-word char.
104- }
105-
106- pragma [ noopt]
107- private State getAStateAfterMatchingAux (
108- MatchedRegExp reg , State prev , string str , int toIndex , int fromIndex , boolean ignorePrefix
109- ) {
110- prev = getAState ( reg , fromIndex , str , ignorePrefix ) and
111- fromIndex = toIndex - 1 and
112- exists ( string char | char = str .charAt ( toIndex ) | specializedDeltaClosed ( prev , char , result ) ) and
113- not discardedPrefixStep ( prev , result , ignorePrefix )
114- }
115-
116- /** Holds if a step from `prev` to `next` should be discarded when the `ignorePrefix` flag is set. */
117- private predicate discardedPrefixStep ( State prev , State next , boolean ignorePrefix ) {
118- prev = mkMatch ( any ( RegExpRoot r ) ) and
119- ignorePrefix = true and
120- next = prev
121- }
122-
123- // The `deltaClosed` relation specialized to the chars that exists in strings tested by a `MatchedRegExp`.
124- private predicate specializedDeltaClosed ( State prev , string char , State next ) {
125- deltaClosed ( prev , specializedGetAnInputSymbolMatching ( char ) , next )
126- }
127-
128- // The `getAnInputSymbolMatching` relation specialized to the chars that exists in strings tested by a `MatchedRegExp`.
129- pragma [ noinline]
130- private InputSymbol specializedGetAnInputSymbolMatching ( string char ) {
131- exists ( string s , MatchedRegExp r |
132- r .test ( s , _)
133- or
134- r .testWithGroups ( s , _)
135- |
136- char = s .charAt ( _)
137- ) and
138- result = getAnInputSymbolMatching ( char )
139- }
140-
141- /**
142- * Gets the `i`th state on a path to the accepting state when `reg` matches `str`.
143- * Starts with an accepting state as found by `getAState` and searches backwards
144- * to the start state through the reachable states (as found by `getAState`).
145- *
146- * This predicate holds the invariant that the result state can be reached with `i` steps from a start state,
147- * and an accepting state can be found after (`str.length() - 1 - i`) steps from the result.
148- * The result state is therefore always on a valid path where `reg` accepts `str`.
149- *
150- * This predicate is only used to find which capture groups a regular expression has filled,
151- * and thus the search is only performed for the strings in the `testWithGroups(..)` predicate.
152- */
153- private State getAStateThatReachesAccept (
154- MatchedRegExp reg , int i , string str , boolean ignorePrefix
155- ) {
156- // base case, reaches an accepting state from the last state in `getAState(..)`
157- reg .testWithGroups ( str , ignorePrefix ) and
158- i = str .length ( ) - 1 and
159- result = getAState ( reg , i , str , ignorePrefix ) and
160- epsilonSucc * ( result ) = Accept ( _)
11+ private predicate isBadTagFilterCandidate (
12+ RootTerm root , string str , boolean ignorePrefix , boolean testWithGroups
13+ ) {
14+ // the regexp must mention "<" and ">" explicitly.
15+ forall ( string angleBracket | angleBracket = [ "<" , ">" ] |
16+ any ( RegExpConstant term | term .getValue ( ) .matches ( "%" + angleBracket + "%" ) ) .getRootTerm ( ) =
17+ root
18+ ) and
19+ ignorePrefix = true and
20+ (
21+ str = [ "<!-- foo -->" , "<!-- foo --!>" , "<!- foo ->" , "<foo>" , "<script>" ] and
22+ testWithGroups = true
16123 or
162- // recursive case. `next` is the next state to be matched after matching `prev`.
163- // this predicate is doing a backwards search, so `prev` is the result we are looking for.
164- exists ( State next , State prev , int fromIndex , int toIndex |
165- next = getAStateThatReachesAccept ( reg , toIndex , str , ignorePrefix ) and
166- next = getAStateAfterMatching ( reg , prev , str , toIndex , fromIndex , ignorePrefix ) and
167- i = fromIndex and
168- result = prev
169- )
170- }
171-
172- /** Gets the capture group number that `term` belongs to. */
173- private int group ( RegExpTerm term ) {
174- exists ( RegExpGroup grp | grp .getNumber ( ) = result | term .getParent * ( ) = grp )
175- }
176- }
177-
178- /** A class to test whether a regular expression matches certain HTML tags. */
179- class HtmlMatchingRegExp extends RegexpMatching:: MatchedRegExp {
180- HtmlMatchingRegExp ( ) {
181- // the regexp must mention "<" and ">" explicitly.
182- forall ( string angleBracket | angleBracket = [ "<" , ">" ] |
183- any ( RegExpConstant term | term .getValue ( ) .matches ( "%" + angleBracket + "%" ) ) .getRootTerm ( ) =
184- this
185- )
186- }
187-
188- override predicate testWithGroups ( string str , boolean ignorePrefix ) {
189- ignorePrefix = true and
190- str = [ "<!-- foo -->" , "<!-- foo --!>" , "<!- foo ->" , "<foo>" , "<script>" ]
191- }
192-
193- override predicate test ( string str , boolean ignorePrefix ) {
194- ignorePrefix = true and
19524 str =
19625 [
19726 "<!-- foo -->" , "<!- foo ->" , "<!-- foo --!>" , "<!-- foo\n -->" , "<script>foo</script>" ,
@@ -200,12 +29,25 @@ class HtmlMatchingRegExp extends RegexpMatching::MatchedRegExp {
20029 "<script src='foo'></script>" , "<SCRIPT>foo</SCRIPT>" , "<script\tsrc=\"foo\"/>" ,
20130 "<script\tsrc='foo'></script>" , "<sCrIpT>foo</ScRiPt>" , "<script src=\"foo\">foo</script >" ,
20231 "<script src=\"foo\">foo</script foo=\"bar\">" , "<script src=\"foo\">foo</script\t\n bar>"
203- ]
204- }
32+ ] and
33+ testWithGroups = false
34+ )
20535}
20636
207- /** DEPRECATED: Alias for HtmlMatchingRegExp */
208- deprecated class HTMLMatchingRegExp = HtmlMatchingRegExp ;
37+ /**
38+ * A regexp that matches some string from the `isBadTagFilterCandidate` predicate.
39+ */
40+ class HtmlMatchingRegExp extends RootTerm {
41+ HtmlMatchingRegExp ( ) { RegexpMatching< isBadTagFilterCandidate / 4 > :: matches ( this , _) }
42+
43+ /** Holds if this regexp matched `str`, where `str` is one of the string from `isBadTagFilterCandidate`. */
44+ predicate matches ( string str ) { RegexpMatching< isBadTagFilterCandidate / 4 > :: matches ( this , str ) }
45+
46+ /** Holds if this regexp fills capture group `g' when matching `str', where `str` is one of the string from `isBadTagFilterCandidate`. */
47+ predicate fillsCaptureGroup ( string str , int g ) {
48+ RegexpMatching< isBadTagFilterCandidate / 4 > :: fillsCaptureGroup ( this , str , g )
49+ }
50+ }
20951
21052/**
21153 * Holds if `regexp` matches some HTML tags, but misses some HTML tags that it should match.
0 commit comments