1+ /**
2+ * INTERNAL. DO NOT USE.
3+ *
4+ * Provides predicates for resolving imports.
5+ */
6+
17private import python
28private import semmle.python.dataflow.new.DataFlow
39private import semmle.python.dataflow.new.internal.ImportStar
410private import semmle.python.dataflow.new.TypeTracker
11+ private import semmle.python.dataflow.new.internal.DataFlowPrivate
512
13+ /**
14+ * Python modules and the way imports are resolved are... complicated. Here's a crash course in how
15+ * it works, as well as some caveats to bear in mind when looking at the implementation in this
16+ * module.
17+ *
18+ * First, let's consider the humble `import` statement:
19+ * ```python
20+ * import foo
21+ * import bar.baz
22+ * import ham.eggs as spam
23+ * ```
24+ *
25+ * In the AST, all imports are aliased, as in the last import above. That is, `import foo` becomes
26+ * `import foo as foo`, and `import bar.baz` becomes `import bar as bar`. Note that `import` is
27+ * exclusively used to import modules -- if `eggs` is an attribute of the `ham` module (and not a
28+ * submodule of the `ham` package), then the third line above is an error.
29+ *
30+ * Next, we have the `from` statement. This one is a bit more complicated, but still has the same
31+ * aliasing desugaring as above applied to it. Thus, `from foo import bar` becomes
32+ * `from foo import bar as bar`.
33+ *
34+ * In general, `from foo import bar` can mean two different things:
35+ *
36+ * 1. If `foo` is a module, and `bar` is an attribute of `foo`, then `from foo import bar` imports
37+ * the attribute `bar` into the current module (binding it to the name `bar`).
 * 2. If `foo` is a package, and `bar` is already defined in `foo/__init__.py`,
 *    that value will be imported. If it is not defined, and `bar` is a submodule of `foo`, then
 *    the `bar` submodule is imported, bound as the attribute `bar` of the `foo` package, and
 *    that submodule is the value that gets imported.
 *    Note: We don't currently model whether the attribute is already defined in `__init__.py`,
 *    and always assume that the submodule will be used.
43+ *
44+ * Now, when it comes to how these imports are represented in the AST, things get a bit complicated.
45+ * First of all, both of the above forms of imports get mapped to the same kind of AST node:
46+ * `Import`. An `Import` node has a sequence of names, each of which is an `Alias` node. This `Alias`
47+ * node represents the `x as y` bit of each imported module.
48+ *
49+ * The same is true for `from` imports. So, how then do we distinguish between the two forms of
50+ * imports? The distinguishing feature is the left hand side of the `as` node. If the left hand side
51+ * is an `ImportExpr`, then it is a plain import. If it is an `ImportMember`, then it is a `from`
52+ * import. (And to confuse matters even more, this `ImportMember` contains another `ImportExpr` for
53+ * the bit between the `from` and `import` keywords.)
54+ *
55+ * Caveats:
56+ *
57+ * - A relative import of the form `from .foo import bar as baz` not only imports `bar` and binds it
58+ * to the name `baz`, but also imports `foo` and binds it to the name `foo`. This only happens with
59+ * relative imports. `from foo import bar as baz` only binds `bar` to `baz`.
60+ * - Modules may also be packages, so e.g. `import foo.bar` may import the `bar` submodule in the `foo`
61+ * package, or the `bar` subpackage of the `foo` package. The practical difference here is the name of
62+ * the module that is imported, as the package `foo.bar` will have the "name" `foo.bar.__init__`,
63+ * corresponding to the fact that the code that is executed is in the `__init__.py` file of the
64+ * `bar` subpackage.
65+ */
666module ImportResolution {
767 /**
868 * Holds if the module `m` defines a name `name` by assigning `defn` to it. This is an
969 * overapproximation, as `name` may not in fact be exported (e.g. by defining an `__all__` that does
1070 * not include `name`).
1171 */
72+ pragma [ nomagic]
1273 predicate module_export ( Module m , string name , DataFlow:: CfgNode defn ) {
1374 exists ( EssaVariable v |
1475 v .getName ( ) = name and
@@ -18,12 +79,223 @@ module ImportResolution {
1879 or
1980 defn .getNode ( ) = v .getDefinition ( ) .( ArgumentRefinement ) .getArgument ( )
2081 )
82+ or
83+ exists ( Alias a |
84+ defn .asExpr ( ) = [ a .getValue ( ) , a .getValue ( ) .( ImportMember ) .getModule ( ) ] and
85+ a .getAsname ( ) .( Name ) .getId ( ) = name and
86+ defn .getScope ( ) = m
87+ )
2188 }
2289
23- Module getModule ( DataFlow:: CfgNode node ) {
24- exists ( ModuleValue mv |
25- node .getNode ( ) .pointsTo ( mv ) and
26- result = mv .getScope ( )
/**
 * Holds if `name` appears as a string constant element of a sequence that is assigned to
 * `__all__` in the module `m`, i.e. `m` explicitly lists `name` as exported.
 *
 * Only the statically obvious form (a definition of `__all__` whose value is a sequence
 * of string constants) is recognised.
 */
private predicate all_mentions_name(Module m, string name) {
  exists(DefinitionNode allDef, SequenceNode elements, StrConst entry |
    allDef.getScope() = m and
    allDef.(NameNode).getId() = "__all__" and
    elements = allDef.getValue() and
    entry = elements.getAnElement().getNode() and
    name = entry.getText()
  )
}
102+
/**
 * Holds if the exports of module `m` cannot be read off a simple `__all__` definition.
 *
 * This is the case when `m` never defines `__all__` (so every name not starting with an
 * underscore is implicitly exported), or when `__all__` is handled in a way we do not
 * model (in which case we likewise pretend that all such names are exported).
 */
private predicate no_or_complicated_all(Module m) {
  // `__all__` is read somewhere in the module rather than only being stored to. This usually
  // means it is being mutated through `append` or `extend`, which we don't handle.
  exists(NameNode allUse | allUse.getScope() = m and allUse.getId() = "__all__" |
    allUse.isLoad()
  )
  or
  // `__all__` is assigned something that is not a sequence.
  exists(DefinitionNode allDef | allDef.getScope() = m and allDef.(NameNode).getId() = "__all__" |
    not allDef.getValue() instanceof SequenceNode
  )
  or
  // The module never defines `__all__` at all.
  not exists(DefinitionNode allDef | allDef.(NameNode).getId() = "__all__" |
    allDef.getScope() = m
  )
}
123+
/**
 * Holds if the module `m` may export the name `name`.
 *
 * Either `name` is listed in a simple `__all__` definition of `m`, or `m` has no usable
 * `__all__` and `name` is one of:
 * - the identifier of a name node in `m` that does not start with an underscore, or
 * - the `as`-name of an import alias whose value is in the scope `m`.
 */
private predicate potential_module_export(Module m, string name) {
  no_or_complicated_all(m) and
  (
    exists(Alias alias | alias.getValue().getScope() = m |
      name = alias.getAsname().(Name).getId()
    )
    or
    exists(NameNode node | node.getScope() = m and node.getId() = name |
      name.charAt(0) != "_"
    )
  )
  or
  all_mentions_name(m, name)
}
134+
/**
 * Holds if `reexporter` makes the module `reexported` available under the name
 * `reexported_name`: an immediate reference to `reexported` is what `reexporter` assigns
 * to `reexported_name`, and that name is potentially exported by `reexporter`.
 */
private predicate module_reexport(Module reexporter, string reexported_name, Module reexported) {
  potential_module_export(reexporter, reexported_name) and
  module_export(reexporter, reexported_name, getImmediateModuleReference(reexported))
}
146+
/**
 * Gets a reference to `sys.modules`, recognised syntactically as an access to the
 * attribute `modules` on a name spelled `sys`.
 */
private DataFlow::Node sys_modules_reference() {
  exists(DataFlow::AttrRef modulesAttr |
    modulesAttr.getObject().asExpr().(Name).getId() = "sys" and
    modulesAttr.getAttributeName() = "modules"
  |
    result = modulesAttr
  )
}
156+
/**
 * Gets a module that may have been registered in `sys.modules` under the key `name`,
 * i.e. a module that is referenced by the value stored in `sys.modules["<name>"] = ...`.
 */
private Module sys_modules_module_with_name(string name) {
  exists(SubscriptNode store, DataFlow::Node stored |
    // The subscripted object is `sys.modules` ...
    store.getObject() = sys_modules_reference().asCfgNode() and
    // ... indexed by the string constant `name` ...
    store.getIndex().getNode().(StrConst).getText() = name and
    // ... and the value being stored refers to the resulting module.
    store.(DefinitionNode).getValue() = stored.asCfgNode() and
    stored = getModuleReference(result)
  )
}
169+
/**
 * Gets the module that the `from ... import *` statement `i` imports from, that is, the
 * module whose file is the preferred one for the module name imported by `i`.
 */
Module getModuleImportedByImportStar(ImportStar i) {
  exists(string importedName | importedName = i.getImportedModuleName() |
    isPreferredModuleForName(result.getFile(), importedName)
  )
}
173+
/**
 * Gets a data-flow node that may refer to a module called `module_name`.
 *
 * Helper for `getImmediateModuleReference`. The name is kept as a plain string here
 * because, for an import such as `import foo`,
 * - `foo` may simply be the name of a module, or
 * - `foo` may be the name of a package (in which case its name is actually `foo.__init__`), or
 * - `foo` may be a name that has been added to `sys.modules` (in which case the actual name of
 *   the module can be anything, for instance `os.path` is either `posixpath` or `ntpath`).
 */
private DataFlow::Node getReferenceToModuleName(string module_name) {
  // The bound name (the right-hand side of the implicit or explicit `as`) of an import.
  exists(Alias alias |
    alias = any(Import imp).getAName() and
    result.asExpr() = alias.getAsname()
  |
    // Regular import statements, e.g.
    //   import foo             # implicitly `import foo as foo`
    //   import foo as foo_alias
    module_name = alias.getValue().(ImportExpr).getImportedModuleName()
    or
    // Modules (not attributes) imported via `from ... import ...` statements, e.g.
    //   from foo.bar import baz               # imports foo.bar.baz as baz
    //   from foo.bar import baz as baz_alias  # imports foo.bar.baz as baz_alias
    exists(ImportMember member | member = alias.getValue() |
      module_name =
        member.getModule().(ImportExpr).getImportedModuleName() + "." + member.getName()
    )
  )
  or
  // The module part of a `from ... import ...` statement, e.g. the `..foo.bar` in
  //   from ..foo.bar import baz  # ..foo.bar might point to, say, package.subpackage.foo.bar
  exists(ImportExpr source | source = any(ImportMember member).getModule() |
    result.asExpr() = source and
    module_name = source.getImportedModuleName()
  )
  or
  // For parity with the points-to based solution, the `ImportExpr` and `ImportMember` bits of
  // the above cases should _also_ point to the right modules.
  exists(ImportExpr imported | result.asExpr() = imported |
    module_name = imported.getImportedModuleName()
  )
  or
  exists(ImportMember member | result.asExpr() = member |
    module_name = member.getModule().(ImportExpr).getImportedModuleName() + "." + member.getName()
  )
}
216+
/**
 * Gets a dataflow node that is an immediate reference to the module `m`.
 *
 * A node is an immediate reference if one of the following holds:
 * - it is a reference to a module name (see `getReferenceToModuleName`) that resolves to `m`,
 *   either via the preferred file for that name (with or without a trailing `.__init__`) or
 *   via an entry stored in `sys.modules`;
 * - it reads an attribute `attr` on a reference to a package `p`, where `p.attr` is `m`;
 * - it reads an attribute on a reference to a module that reexports `m` under that name;
 * - it is the implicit SSA variable defined for a submodule at the entry of a package, and
 *   that submodule is `m`.
 *
 * Because of attribute lookups, this is mutually recursive with `getModuleReference`.
 */
DataFlow::Node getImmediateModuleReference(Module m) {
  exists(string module_name | result = getReferenceToModuleName(module_name) |
    // Depending on whether the referenced module is a package or not, we may need to add a
    // trailing `.__init__` to the module name.
    isPreferredModuleForName(m.getFile(), module_name + ["", ".__init__"])
    or
    // Module defined via `sys.modules`
    m = sys_modules_module_with_name(module_name)
  )
  or
  // Reading an attribute on a module may return a submodule (or subpackage).
  exists(DataFlow::AttrRead ar, Module p, string attr_name |
    ar.accesses(getModuleReference(p), attr_name) and
    result = ar
  |
    isPreferredModuleForName(m.getFile(), p.getPackageName() + "." + attr_name + ["", ".__init__"])
  )
  or
  // This is also true for attributes that come from reexports.
  exists(Module reexporter, string attr_name |
    result.(DataFlow::AttrRead).accesses(getModuleReference(reexporter), attr_name) and
    module_reexport(reexporter, attr_name, m)
  )
  or
  // Submodules that are implicitly defined with relative imports of the form `from .foo import ...`.
  // In practice, we create a definition for each module in a package, even if it is not imported.
  exists(string submodule, Module package, EssaVariable var |
    var = result.asVar() and
    // Tie the submodule name to the variable that `result` stands for. Without this,
    // `submodule` is unconstrained and the variable for one submodule would be treated as a
    // reference to every module of the package.
    submodule = var.getName() and
    SsaSource::init_module_submodule_defn(var.getSourceVariable(), package.getEntryNode()) and
    isPreferredModuleForName(m.getFile(),
      package.getPackageName() + "." + submodule + ["", ".__init__"])
  )
}
255+
/**
 * Join-order helper for `getModuleReference`.
 *
 * Holds if `node` is an immediate reference to the module `m`, is a `Name` expression
 * with identifier `name`, and has scope `s`.
 */
pragma[nomagic]
private predicate module_reference_in_scope(DataFlow::Node node, Scope s, string name, Module m) {
  pragma[only_bind_into](node) = getImmediateModuleReference(pragma[only_bind_into](m)) and
  exists(Name binding | binding = node.asExpr() | name = binding.getId()) and
  s = node.getScope()
}
263+
/**
 * Join-order helper for `getModuleReference`.
 *
 * Holds if `node` is a use of a `Name` expression with identifier `name` whose scope is `s`.
 */
pragma[nomagic]
private predicate module_name_in_scope(DataFlow::Node node, Scope s, string name) {
  exists(Name use | node.asExpr() = use |
    pragma[only_bind_into](use).isUse() and
    name = use.getId()
  ) and
  s = node.getScope()
}
273+
/**
 * Gets a reference to the module `m`. Besides immediate references, this follows local
 * flow steps, writes to and reads from module-level variables, and uses of a name that is
 * bound to the module in an enclosing scope.
 */
DataFlow::Node getModuleReference(Module m) {
  // Immediate references to the module.
  result = getImmediateModuleReference(m)
  or
  // Local flow forward from an earlier reference to the module.
  simpleLocalFlowStepForTypetracking(getModuleReference(m), result)
  or
  // Global flow: an earlier reference is written to a module variable that is read here.
  exists(DataFlow::ModuleVariableNode globalVar |
    globalVar.getAWrite() = getModuleReference(m) and
    result = globalVar.getARead()
  )
  or
  // A reference to a name that is bound to a module in an enclosing scope.
  exists(DataFlow::Node binding, Scope bindingScope, Scope readScope, string id |
    module_reference_in_scope(pragma[only_bind_into](binding),
      pragma[only_bind_into](bindingScope), pragma[only_bind_into](id),
      pragma[only_bind_into](m)) and
    module_name_in_scope(result, readScope, id) and
    readScope.getEnclosingScope*() = bindingScope
  )
}
299+
/** Gets the module that the control-flow-based data-flow node `node` refers to, if any. */
Module getModule(DataFlow::CfgNode node) {
  exists(DataFlow::Node ref | ref = getModuleReference(result) | node = ref)
}
29301}
0 commit comments