@@ -113,9 +113,7 @@ class RegExpTerm extends Locatable, @regexpterm {
113113 /**
114114 * Holds if this is the root term of a regular expression.
115115 */
116- predicate isRootTerm ( ) {
117- not getParent ( ) instanceof RegExpTerm
118- }
116+ predicate isRootTerm ( ) { not getParent ( ) instanceof RegExpTerm } // root = the parent is not itself a `RegExpTerm`
119117
120118 /**
121119 * Gets the outermost term of this regular expression.
@@ -130,19 +128,15 @@ class RegExpTerm extends Locatable, @regexpterm {
130128 /**
131129 * Holds if this term occurs as part of a regular expression literal.
132130 */
133- predicate isPartOfRegExpLiteral ( ) {
134- exists ( getLiteral ( ) )
135- }
131+ predicate isPartOfRegExpLiteral ( ) { exists ( getLiteral ( ) ) } // holds exactly when `getLiteral()` has a result
136132
137133 /**
138134 * Holds if this term occurs as part of a string literal.
139135 *
140136 * This predicate holds regardless of whether the string literal is actually
141137 * used as a regular expression. See `isUsedAsRegExp`.
142138 */
143- predicate isPartOfStringLiteral ( ) {
144- getRootTerm ( ) .getParent ( ) instanceof StringLiteral
145- }
139+ predicate isPartOfStringLiteral ( ) { getRootTerm ( ) .getParent ( ) instanceof StringLiteral } // inspects the parent of the root term, not this term's own parent
146140
147141 /**
148142 * Holds if this term is part of a regular expression literal, or a string literal
@@ -344,8 +338,7 @@ class RegExpAnchor extends RegExpTerm, @regexp_anchor {
344338 * ^
345339 * ```
346340 */
347- class RegExpCaret extends RegExpAnchor , @regexp_caret {
348- }
341+ class RegExpCaret extends RegExpAnchor , @regexp_caret { } // declares no members of its own
349342
350343/**
351344 * A dollar assertion `$` matching the end of a line.
@@ -356,8 +349,7 @@ class RegExpCaret extends RegExpAnchor, @regexp_caret {
356349 * $
357350 * ```
358351 */
359- class RegExpDollar extends RegExpAnchor , @regexp_dollar {
360- }
352+ class RegExpDollar extends RegExpAnchor , @regexp_dollar { } // declares no members of its own
361353
362354/**
363355 * A word boundary assertion.
@@ -940,3 +932,131 @@ private class StringRegExpPatternSource extends RegExpPatternSource {
940932
941933 override RegExpTerm getRegExpTerm ( ) { result = asExpr ( ) .( StringLiteral ) .asRegExp ( ) } // result is the `asRegExp()` view of the underlying string literal
942934}
935+
936+ module RegExp { // helpers for regexp flag strings, anchoring, wildcard-like terms, and sanitizer detection
937+ /** Gets the string `"?"` used to represent a regular expression whose flags are unknown. */
938+ string unknownFlag ( ) { result = "?" }
939+
940+ /** Holds if `flags` includes the `m` flag, that is, contains the character `m` at any position. Does not hold for the unknown flag `?`. */
941+ bindingset [ flags]
942+ predicate isMultiline ( string flags ) { flags .matches ( "%m%" ) }
943+
944+ /** Holds if `flags` includes the `g` flag, that is, contains the character `g` at any position. Does not hold for the unknown flag `?`. */
945+ bindingset [ flags]
946+ predicate isGlobal ( string flags ) { flags .matches ( "%g%" ) }
947+
948+ /** Holds if `flags` includes the `i` flag, that is, contains the character `i` at any position. Does not hold for the unknown flag `?`. */
949+ bindingset [ flags]
950+ predicate isIgnoreCase ( string flags ) { flags .matches ( "%i%" ) }
951+
952+ /** Holds if `flags` includes the `s` flag, that is, contains the character `s` at any position. Does not hold for the unknown flag `?`. */
953+ bindingset [ flags]
954+ predicate isDotAll ( string flags ) { flags .matches ( "%s%" ) }
955+
956+ /** Holds if `flags` includes the `m` flag or is the unknown flag `?`. */
957+ bindingset [ flags]
958+ predicate maybeMultiline ( string flags ) { flags = unknownFlag ( ) or isMultiline ( flags ) }
959+
960+ /** Holds if `flags` includes the `g` flag or is the unknown flag `?`. */
961+ bindingset [ flags]
962+ predicate maybeGlobal ( string flags ) { flags = unknownFlag ( ) or isGlobal ( flags ) }
963+
964+ /** Holds if `flags` includes the `i` flag or is the unknown flag `?`. */
965+ bindingset [ flags]
966+ predicate maybeIgnoreCase ( string flags ) { flags = unknownFlag ( ) or isIgnoreCase ( flags ) }
967+
968+ /** Holds if `flags` includes the `s` flag or is the unknown flag `?`. */
969+ bindingset [ flags]
970+ predicate maybeDotAll ( string flags ) { flags = unknownFlag ( ) or isDotAll ( flags ) }
971+
972+ /** Holds if `term` and all of its disjuncts are anchored on both ends. */
973+ predicate isFullyAnchoredTerm ( RegExpTerm term ) {
974+ exists ( RegExpSequence seq | term = seq | // a sequence whose first child is `^` and whose last child is `$`
975+ seq .getChild ( 0 ) instanceof RegExpCaret and
976+ seq .getLastChild ( ) instanceof RegExpDollar
977+ )
978+ or
979+ isFullyAnchoredTerm ( term .( RegExpGroup ) .getAChild ( ) ) // a group with a fully anchored child
980+ or
981+ isFullyAnchoredAlt ( term , term .getNumChild ( ) ) // an alternation: passing the child count requires every disjunct to be anchored
982+ }
983+
984+ /** Holds if the first `i` disjuncts of `term` are fully anchored. */
985+ private predicate isFullyAnchoredAlt ( RegExpAlt term , int i ) {
986+ isFullyAnchoredTerm ( term .getChild ( 0 ) ) and i = 1 // base case: only the first disjunct
987+ or
988+ isFullyAnchoredAlt ( term , i - 1 ) and // inductive step: the first `i - 1` disjuncts hold...
989+ isFullyAnchoredTerm ( term .getChild ( i - 1 ) ) // ...and so does the disjunct at index `i - 1`
990+ }
991+
992+ /**
993+ * Holds if `term` matches any character except for explicitly listed exceptions.
994+ *
995+ * For example, holds for `.`, `[^<>]`, or `\W`, but not for `[a-z]`, `\w`, or `[^\W\S]`.
996+ */
997+ predicate isWildcardLike ( RegExpTerm term ) {
998+ term instanceof RegExpDot
999+ or
1000+ term .( RegExpCharacterClassEscape ) .getValue ( ) .isUppercase ( ) // an escape whose value is upper case, e.g. `\W`
1001+ or
1002+ // [^a-z]: an inverted class none of whose children is an upper-case escape
1003+ exists ( RegExpCharacterClass cls | term = cls |
1004+ cls .isInverted ( ) and
1005+ not cls .getAChild ( ) .( RegExpCharacterClassEscape ) .getValue ( ) .isUppercase ( )
1006+ )
1007+ or
1008+ // [\W]: a non-inverted class with at least one upper-case escape among its children
1009+ exists ( RegExpCharacterClass cls | term = cls |
1010+ not cls .isInverted ( ) and
1011+ cls .getAChild ( ) .( RegExpCharacterClassEscape ) .getValue ( ) .isUppercase ( )
1012+ )
1013+ }
1014+
1015+ /**
1016+ * Holds if `term` is a generic sanitizer for strings that match (if `outcome` is true)
1017+ * or strings that don't match (if `outcome` is false).
1018+ *
1019+ * Specifically, whitelisting regexps such as `^(foo|bar)$` sanitize matches in the true case.
1020+ * Inverted character classes such as `[^a-z]` or `\W` sanitize matches in the false case.
1021+ */
1022+ predicate isGenericRegExpSanitizer ( RegExpTerm term , boolean outcome ) {
1023+ term .isRootTerm ( ) and
1024+ (
1025+ outcome = true and
1026+ isFullyAnchoredTerm ( term ) and
1027+ not isWildcardLike ( term .getAChild * ( ) ) // `getAChild*` covers `term` itself and all of its descendants
1028+ or
1029+ // Character set restrictions like `/[^a-z]/.test(x)` sanitize in the false case
1030+ outcome = false and
1031+ exists ( RegExpTerm root |
1032+ root = term
1033+ or
1034+ root = term .( RegExpGroup ) .getAChild ( ) // also look one level inside a top-level group
1035+ |
1036+ isWildcardLike ( root )
1037+ or
1038+ isWildcardLike ( root .( RegExpAlt ) .getAChild ( ) ) // a single wildcard-like disjunct is enough here
1039+ )
1040+ )
1041+ }
1042+
1043+ /**
1044+ * Gets the AST of a regular expression object that can flow to `node`.
1045+ */
1046+ RegExpTerm getRegExpObjectFromNode ( DataFlow:: Node node ) {
1047+ exists ( DataFlow:: RegExpCreationNode regexp |
1048+ regexp .getAReference ( ) .flowsTo ( node ) and
1049+ result = regexp .getRoot ( )
1050+ )
1051+ }
1052+
1053+ /**
1054+ * Gets the AST of a regular expression that can flow to `node`,
1055+ * including `RegExp` objects as well as strings interpreted as regular expressions.
1056+ */
1057+ RegExpTerm getRegExpFromNode ( DataFlow:: Node node ) {
1058+ result = getRegExpObjectFromNode ( node )
1059+ or
1060+ result = node .asExpr ( ) .( StringLiteral ) .asRegExp ( ) // only covers `node` being a string literal expression itself
1061+ }
1062+ }
0 commit comments