@@ -3,72 +3,6 @@ private import semmle.python.ApiGraphs
33// Need to import since frameworks can extend the abstract `RegExpInterpretation::Range`
44private import semmle.python.Frameworks
55private import semmle.python.Concepts as Concepts
6-
7- /**
8- * Gets the positional argument index containing the regular expression flags for the member of the
9- * `re` module with the name `name`.
10- */
11- private int re_member_flags_arg ( string name ) {
12- name = "compile" and result = 1
13- or
14- name = "search" and result = 2
15- or
16- name = "match" and result = 2
17- or
18- name = "split" and result = 3
19- or
20- name = "findall" and result = 2
21- or
22- name = "finditer" and result = 2
23- or
24- name = "sub" and result = 4
25- or
26- name = "subn" and result = 4
27- }
28-
29- /**
30- * Gets the names and corresponding API nodes of members of the `re` module that are likely to be
31- * methods taking regular expressions as arguments.
32- *
33- * This is a helper predicate that fixes a bad join order, and should not be inlined without checking
34- * that this is safe.
35- */
36- pragma [ nomagic]
37- private API:: Node relevant_re_member ( string name ) {
38- result = API:: moduleImport ( "re" ) .getMember ( name ) and
39- name != "escape"
40- }
41-
42- /**
43- * Holds if the expression `e` is used as a regex with the `re` module, with the regex-mode `mode` (if known).
44- * If regex mode is not known, `mode` will be `"None"`.
45- *
46- * This predicate has not done any data-flow tracking.
47- */
48- // TODO: This should only be used to get the `mode`, and nowhere else.
49- predicate used_as_regex_internal ( Expr e , string mode ) {
50- /* Call to re.xxx(regex, ... [mode]) */
51- exists ( DataFlow:: CallCfgNode call |
52- call instanceof Concepts:: RegexExecution and
53- e = call .( Concepts:: RegexExecution ) .getRegex ( ) .asExpr ( )
54- or
55- call .getArg ( 0 ) .asExpr ( ) = e and
56- call = relevant_re_member ( _) .getACall ( )
57- |
58- mode = "None"
59- or
60- exists ( DataFlow:: CallCfgNode callNode |
61- call = callNode and
62- mode =
63- mode_from_node ( [
64- callNode
65- .getArg ( re_member_flags_arg ( callNode .( DataFlow:: MethodCallNode ) .getMethodName ( ) ) ) ,
66- callNode .getArgByName ( "flags" )
67- ] )
68- )
69- )
70- }
71-
726private import regexp.internal.RegExpTracking as RegExpTracking
737private import semmle.python.Concepts
748private import semmle.python.regexp.RegexTreeView
@@ -81,49 +15,6 @@ RegExpTerm getTermForExecution(RegexExecution exec) {
8115 )
8216}
8317
84- /**
85- * Gets the canonical name for the API graph node corresponding to the `re` flag `flag`. For flags
86- * that have multiple names, we pick the long-form name as a canonical representative.
87- */
88- private string canonical_name ( API:: Node flag ) {
89- result in [ "ASCII" , "IGNORECASE" , "LOCALE" , "UNICODE" , "MULTILINE" , "TEMPLATE" ] and
90- flag = API:: moduleImport ( "re" ) .getMember ( [ result , result .prefix ( 1 ) ] )
91- or
92- flag = API:: moduleImport ( "re" ) .getMember ( [ "DOTALL" , "S" ] ) and result = "DOTALL"
93- or
94- flag = API:: moduleImport ( "re" ) .getMember ( [ "VERBOSE" , "X" ] ) and result = "VERBOSE"
95- }
96-
97- /**
98- * A type tracker for regular expression flag names. Holds if the result is a node that may refer
99- * to the `re` flag with the canonical name `flag_name`
100- */
101- private DataFlow:: TypeTrackingNode re_flag_tracker ( string flag_name , DataFlow:: TypeTracker t ) {
102- t .start ( ) and
103- exists ( API:: Node flag | flag_name = canonical_name ( flag ) and result = flag .asSource ( ) )
104- or
105- exists ( BinaryExprNode binop , DataFlow:: Node operand |
106- operand .getALocalSource ( ) = re_flag_tracker ( flag_name , t .continue ( ) ) and
107- operand .asCfgNode ( ) = binop .getAnOperand ( ) and
108- ( binop .getOp ( ) instanceof BitOr or binop .getOp ( ) instanceof Add ) and
109- result .asCfgNode ( ) = binop
110- )
111- or
112- exists ( DataFlow:: TypeTracker t2 | result = re_flag_tracker ( flag_name , t2 ) .track ( t2 , t ) )
113- }
114-
115- /**
116- * A type tracker for regular expression flag names. Holds if the result is a node that may refer
117- * to the `re` flag with the canonical name `flag_name`
118- */
119- private DataFlow:: Node re_flag_tracker ( string flag_name ) {
120- re_flag_tracker ( flag_name , DataFlow:: TypeTracker:: end ( ) ) .flowsTo ( result )
121- }
122-
123- /** Gets a regular expression mode flag associated with the given data flow node. */
124- // TODO: Move this into a RegexFlag module, along with related code?
125- string mode_from_node ( DataFlow:: Node node ) { node = re_flag_tracker ( result ) }
126-
12718/** Provides a class for modeling regular expression interpretations. */
12819module RegExpInterpretation {
12920 /**
@@ -150,6 +41,102 @@ deprecated class RegexString extends Regex {
15041 RegexString ( ) { this = RegExpTracking:: regExpSource ( _) .asExpr ( ) }
15142}
15243
44+ /** Utility predicates for finding the mode of a regex based on where it's used. */
45+ private module FindRegexMode {
46+ // TODO: Move this (and Regex) into a ParseRegExp file.
47+ /**
48+ * Gets the mode of the regex `regex` based on the context where it's used.
49+ * Does not find the mode if it's in a prefix inside the regex itself (see `Regex::getAMode`).
50+ */
51+ string getAMode ( Regex regex ) {
52+ exists ( DataFlow:: Node sink |
53+ sink = regex .getAUse ( ) and
54+ /* Call to re.xxx(regex, ... [mode]) */
55+ exists ( DataFlow:: CallCfgNode call |
56+ call instanceof Concepts:: RegexExecution and
57+ sink = call .( Concepts:: RegexExecution ) .getRegex ( )
58+ or
59+ call .getArg ( _) = sink and
60+ sink instanceof RegExpInterpretation:: Range
61+ |
62+ exists ( DataFlow:: CallCfgNode callNode |
63+ call = callNode and
64+ result =
65+ mode_from_node ( [
66+ callNode
67+ .getArg ( re_member_flags_arg ( callNode .( DataFlow:: MethodCallNode ) .getMethodName ( ) ) ) ,
68+ callNode .getArgByName ( "flags" )
69+ ] )
70+ )
71+ )
72+ )
73+ }
74+
75+ /**
76+ * Gets the positional argument index containing the regular expression flags for the member of the
77+ * `re` module with the name `name`.
78+ */
79+ private int re_member_flags_arg ( string name ) {
80+ name = "compile" and result = 1
81+ or
82+ name = "search" and result = 2
83+ or
84+ name = "match" and result = 2
85+ or
86+ name = "split" and result = 3
87+ or
88+ name = "findall" and result = 2
89+ or
90+ name = "finditer" and result = 2
91+ or
92+ name = "sub" and result = 4
93+ or
94+ name = "subn" and result = 4
95+ }
96+
97+ /**
98+ * Gets the canonical name for the API graph node corresponding to the `re` flag `flag`. For flags
99+ * that have multiple names, we pick the long-form name as a canonical representative.
100+ */
101+ private string canonical_name ( API:: Node flag ) {
102+ result in [ "ASCII" , "IGNORECASE" , "LOCALE" , "UNICODE" , "MULTILINE" , "TEMPLATE" ] and
103+ flag = API:: moduleImport ( "re" ) .getMember ( [ result , result .prefix ( 1 ) ] )
104+ or
105+ flag = API:: moduleImport ( "re" ) .getMember ( [ "DOTALL" , "S" ] ) and result = "DOTALL"
106+ or
107+ flag = API:: moduleImport ( "re" ) .getMember ( [ "VERBOSE" , "X" ] ) and result = "VERBOSE"
108+ }
109+
110+ /**
111+ * A type tracker for regular expression flag names. Holds if the result is a node that may refer
112+ * to the `re` flag with the canonical name `flag_name`, either directly or as a `|` or `+` expression that has such a node as an operand.
113+ */
114+ private DataFlow:: TypeTrackingNode re_flag_tracker ( string flag_name , DataFlow:: TypeTracker t ) {
115+ t .start ( ) and
116+ exists ( API:: Node flag | flag_name = canonical_name ( flag ) and result = flag .asSource ( ) )
117+ or
118+ exists ( BinaryExprNode binop , DataFlow:: Node operand |
119+ operand .getALocalSource ( ) = re_flag_tracker ( flag_name , t .continue ( ) ) and
120+ operand .asCfgNode ( ) = binop .getAnOperand ( ) and
121+ ( binop .getOp ( ) instanceof BitOr or binop .getOp ( ) instanceof Add ) and
122+ result .asCfgNode ( ) = binop
123+ )
124+ or
125+ exists ( DataFlow:: TypeTracker t2 | result = re_flag_tracker ( flag_name , t2 ) .track ( t2 , t ) )
126+ }
127+
128+ /**
129+ * Gets a data-flow node that may refer to the `re` flag with the canonical name `flag_name`,
130+ * i.e. a node that the completed type tracker for that flag flows to.
131+ */
132+ private DataFlow:: Node re_flag_tracker ( string flag_name ) {
133+ re_flag_tracker ( flag_name , DataFlow:: TypeTracker:: end ( ) ) .flowsTo ( result )
134+ }
135+
136+ /** Gets a regular expression mode flag associated with the given data flow node. */
137+ private string mode_from_node ( DataFlow:: Node node ) { node = re_flag_tracker ( result ) }
138+ }
139+
153140/** A StrConst used as a regular expression */
154141class Regex extends Expr {
155142 DataFlow:: Node sink ;
@@ -175,11 +162,7 @@ class Regex extends Expr {
175162 * VERBOSE
176163 */
177164 string getAMode ( ) {
178- exists ( string mode |
179- used_as_regex_internal ( sink .asExpr ( ) , mode ) and
180- result != "None" and
181- result = mode
182- )
165+ result = FindRegexMode:: getAMode ( this )
183166 or
184167 result = this .getModeFromPrefix ( )
185168 }
0 commit comments