# -*- coding: utf-8; -*-
"""Compile macro-enabled code.
This is used by the import hooks in `mcpyrate.importer`.
Functions specific to multi-phase compilation live in `mcpyrate.multiphase`.
This module orchestrates all other transformations `mcpyrate` performs when
a module is imported.
We also provide a public API to compile and run macro-enabled code at run time.
"""
__all__ = ["expand", "compile",
"singlephase_expand",
"run", "create_module"]
import ast
import builtins
import importlib.util
import sys
from types import ModuleType, CodeType
from .dialects import DialectExpander
from .expander import find_macros, expand_macros
from .markers import check_no_markers_remaining
from . import multiphase
from .unparser import unparse
from .utils import gensym, getdocstring


def expand(source, filename, optimize=-1, self_module=None):
    """Expand macros and dialects, accounting for multi-phase compilation if needed.

    This is the top-level entry point that orchestrates all the transformations
    `mcpyrate` performs when a module is imported.

    `source`:       `str` or `bytes` containing Python source code, an `ast.Module`,
                    or a `list` of statement AST nodes.

                    If `source` is a `list`, it is automatically wrapped into
                    the `body` of a new `ast.Module`.

                    We always support macros, dialect AST transforms, dialect AST
                    postprocessors, and multi-phase compilation.

                    Obviously, we support dialect source transforms only when
                    `source` is an `str` or a `bytes`.

    `filename`:     Full path to the `.py` file being compiled.

    `optimize`:     Passed to Python's built-in `compile` function, as well as to
                    the multi-phase compiler. The multi-phase compiler uses the
                    `optimize` setting for the temporary higher-phase modules.

    `self_module`:  Absolute dotted module name of the module being compiled.
                    Needed for modules that request multi-phase compilation.
                    Ignored in single-phase compilation.

                    In multi-phase compilation, used for temporarily injecting
                    the temporary, higher-phase modules into `sys.modules`,
                    as well as resolving `__self__` in self-macro-imports
                    (`from __self__ import macros, ...`).

    Return value is the final expanded AST, ready for Python's built-in `compile`.

    If you don't care about the expanded AST, and just want the final bytecode,
    see `mcpyrate.compiler.compile`, which performs both steps. It takes the
    same parameters as `expand`.

    If you just want to run a macro-enabled code snippet you generated
    at run time, see `run`.
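
    For example, a minimal single-phase sketch (hedged; `source_text` is a
    hypothetical placeholder holding macro-enabled Python source)::

        from mcpyrate.compiler import expand

        expansion = expand(source_text, filename="example.py")
        # feed the expanded AST to Python's built-in `compile`
        code = compile(expansion, "example.py", mode="exec", dont_inherit=True)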

    **Notes**

    `ast.Module` is just the top-level AST node type produced by
    `ast.parse(..., mode="exec")`, which strictly speaking has nothing
    to do with modules (`types.ModuleType`, the type of thing that lives
    in `sys.modules`). Rather, `ast.Module` just represents a sequence of
    statements. The common factor is that the source code for a module is
    a sequence of statements.

    `mcpyrate` stirs this picture up a bit. We always parse in `"exec"` mode.
    Because macro bindings are established by macro-imports, the principle of
    least astonishment requires that macros are looked up in a module - and
    that module must be (or at least will have to become) available in
    `sys.modules`.

    Our `run` function uses an actual module as the namespace in which to run
    the code, instead of using a bare dict like the built-in `exec` does.
    This unifies the handling of run-time compiled and imported code, since
    importing a source file always produces a module.

    What this means in practice is that in `mcpyrate`, to run a code snippet
    generated at run time, there has to be a module to run the code in. The
    `run` function will auto-create one if needed, but in case you want more
    control, you can create one explicitly with `create_module`, and then pass
    that in to `run`.

    Note this implies that, if you wish, you can first define one module at
    run time that defines some macros, and then in another run-time code snippet,
    import those macros from that dynamically generated module. Just use
    `create_module` to create the first module, to give it a known dotted name
    in `sys.modules`::

        from mcpyrate.quotes import macros, q
        from mcpyrate.compiler import create_module, run

        mymacros = create_module("mymacros")
        with q as quoted:
            ...
        run(quoted, mymacros)

        with q as quoted:
            from mymacros import macros, ...
            ...
        module = run(quoted)

    If you're worried about module name collisions, you can use the default
    gensymmed name (or gensym your own custom one). But since the module name
    in an import statement must be literal, you'll then have to edit the second
    code snippet after it was generated (if you generated it via quasiquotes)
    to splice in the correct name for the first module to import the macros from::

        from mcpyrate.quotes import macros, q

        from mcpyrate import gensym
        from mcpyrate.compiler import create_module, run
        from mcpyrate.utils import rename

        modname = gensym("mymacros")
        mymacros = create_module(modname)
        with q as quoted:
            ...
        run(quoted, mymacros)

        with q as quoted:
            from _xxx_ import macros, ...
            ...
        rename("_xxx_", modname, quoted)
        module = run(quoted)
"""
if not isinstance(source, (str, bytes, ast.Module, list)):
raise TypeError(f"`source` must be Python source code (as `str` or `bytes`), an `ast.Module`, or a `list` of statement AST nodes; got {type(source)} with value {repr(source)}")
dexpander = DialectExpander(filename=filename)
if isinstance(source, (str, bytes)):
if isinstance(source, bytes):
text = importlib.util.decode_source(source) # uses the "coding" prop line like Python itself does
else:
text = source
# dialect source transforms (transpilers, surface syntax extensions, etc.)
text = dexpander.transform_source(text)
# produce initial AST (NOTE: this step produces source location info w.r.t. `text`)
try:
tree = ast.parse(text, filename=filename, mode="exec")
except Exception as err:
raise ImportError(f"Failed to parse {filename} as Python after applying all dialect source transformers.") from err
else: # `ast.Module` or a `list` of statement AST nodes
if isinstance(source, list): # convenience, not provided by built-in `compile`.
tree = ast.Module(body=source, type_ignores=[])
else:
tree = source
if not all(isinstance(x, ast.stmt) for x in tree.body):
invalid_inputs = [x for x in tree.body if not isinstance(x, ast.stmt)]
invalid_inputs_msg = ", ".join(repr(x) for x in invalid_inputs)
raise TypeError(f"module body has one or more elements that are not statement AST nodes: {invalid_inputs_msg}")
# AST transforms: dialects, macros
if not multiphase.ismultiphase(tree):
expansion = singlephase_expand(tree, filename=filename, self_module=self_module, dexpander=dexpander)
else:
if not self_module:
raise ValueError("`self_module` must be specified when multi-phase compiling.")
expansion = multiphase.multiphase_expand(tree, filename=filename, self_module=self_module, dexpander=dexpander, _optimize=optimize)
return expansion


# TODO: Pass through also `flags` and `dont_inherit`? (Need to thread them to the multi-phase compiler, too.
# TODO: This starts to look like a job for `unpythonic.dyn`; should we move it to `mcpyrate`?)
def compile(source, filename, optimize=-1, self_module=None):
"""[mcpyrate] Compile macro-enabled code.
Like the built-in `compile` function, but for macro-enabled code. Supports
macros, dialects, and multi-phase compilation.
Parameters are the same as for `expand`. This function is in fact a thin wrapper
that calls `expand`, and then passes the result to Python's built-in `compile`.
The main reason for its existence is to provide a near drop-in replacement for
the built-in `compile` for macro-enabled input.
Currently the API differs from the built-in `compile` in that:
- `mode` is always `"exec"`,
- `dont_inherit` is always `True`, and
- flags are not supported.
Return value is a code object, ready for `exec`.
**Notes**
If `source` is a *dynamically generated* AST value, it will be unparsed and
re-parsed (before expanding) to autogenerate source location info. This does
mean that `tree` must be able to make that roundtrip (so, e.g., no AST markers).
`source` is considered *dynamically generated* when there is no source file,
i.e. `filename` does not end in `.py`.
The generated location info corresponds to the output of `unparse(source)`
(with no options!), so if a run-time error occurs in that code, then to see
where that error occurred, you can e.g.::
from mcpyrate import unparse
for lineno, code in enumerate(unparse(source).split("\n"), start=1):
print(f"L{lineno:5d} {code}")
"""
code, _ignored_docstring = _compile(source, filename, optimize, self_module)
return code
def _compile(source, filename, optimize, self_module):
    # Note `source` can be source code or an AST.
    #
    # In case of an AST:
    #
    # If it came directly from a source file, it should already have its source
    # location info set correctly when we reach this. However, if it didn't
    # come from a file, such as when we `run` a dynamically generated AST value,
    # it might not.
    #
    # (That happens especially if that run-time AST value was generated by
    # quasiquotes. Quoted code snippets carry no location info, because that
    # is appropriate for their usual use case, in macro output.)
    #
    # But what is the appropriate source location?
    #
    # The problem is, Python wasn't designed for the use scenario where an AST
    # is assembled from pieces defined in several locations (possibly each in a
    # different source file), then passed around, and finally compiled and run
    # at yet another location (possibly in yet another source file).
    #
    # The AST only has `lineno` and `col_offset`. It has no `filename`, so it
    # assumes that an AST comes from a single source file. `types.CodeType` has
    # `co_filename` (only one per code object, so it makes the same
    # assumption), `co_firstlineno`, and `co_lnotab` (up to Python 3.9) or
    # `co_lines` (Python 3.10+).
    #
    # https://greentreesnakes.readthedocs.io/en/latest/tofrom.html#fix-locations
    # https://docs.python.org/3/library/types.html#types.CodeType
    # https://docs.python.org/3/library/inspect.html#types-and-members
    # https://github.com/python/cpython/blob/master/Objects/lnotab_notes.txt
    #
    # We could fill in dummy source locations into the AST nodes, like this:
    #
    #     from .astfixers import fix_locations
    #     fake_lineno = 9999
    #     fake_col_offset = 9999
    #     reference_node = ast.Constant(value=None, lineno=fake_lineno, col_offset=fake_col_offset)
    #     fix_locations(expansion, reference_node, mode="reference")
    #
    # But it's better to just unparse the input tree, so we get behavior
    # similar to when `compile` is called with a source code string as input.
    # Source locations will then correspond to the text of that string
    # (which the user can easily re-obtain by `unparse(source)`).
    if not (filename and filename.endswith(".py")):
        if isinstance(source, (ast.AST, list)):
            source = unparse(source)

    expansion = expand(source, filename=filename, self_module=self_module, optimize=optimize)
    assert isinstance(expansion, ast.Module)  # we always parse in `"exec"` mode
    docstring = getdocstring(expansion.body)
    code = builtins.compile(expansion, filename, mode="exec", dont_inherit=True, optimize=optimize)
    return code, docstring


def singlephase_expand(tree, *, filename, self_module, dexpander):
    """Expand dialects and macros in `tree`. Single phase only.

    This is a low-level function; you likely want `expand` instead. If you
    really do need something like this, but have a multi-phase `tree`, use
    `mcpyrate.multiphase.multiphase_expand` instead.

    Primarily meant to be called with `tree` the AST of a module that
    uses macros, but works with any `tree` that has a `body` attribute,
    where that `body` is a `list` of statement AST nodes.

    `filename`:     Full path to the `.py` file being compiled.

    `self_module`:  Passed in by the multi-phase compiler when it compiles an individual phase
                    using this function. Used for resolving `__self__` in self-macro-imports
                    (`from __self__ import macros, ...`).

                    Ignored in single-phase compilation.

    `dexpander`:    The `DialectExpander` instance to use for dialect AST transforms.
                    If not provided, dialect processing is skipped.

    Return value is the expanded `tree`.
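
    A minimal sketch (hedged; `source_text` is a hypothetical placeholder
    holding macro-enabled Python source)::

        import ast
        from mcpyrate.dialects import DialectExpander
        from mcpyrate.compiler import singlephase_expand

        tree = ast.parse(source_text, filename="example.py", mode="exec")
        expansion = singlephase_expand(tree, filename="example.py",
                                       self_module=None,
                                       dexpander=DialectExpander(filename="example.py"))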
"""
if dexpander:
tree, dialect_instances = dexpander.transform_ast(tree)
module_macro_bindings = find_macros(tree, filename=filename, self_module=self_module)
expansion = expand_macros(tree, bindings=module_macro_bindings, filename=filename)
if dexpander:
expansion = dexpander.postprocess_ast(expansion, dialect_instances)
check_no_markers_remaining(expansion, filename=filename)
return expansion


# --------------------------------------------------------------------------------
# Convenience functions for compiling and running macro-enabled code snippets at run time.

def run(source, module=None, optimize=-1):
"""Compile and run macro-enabled code at run time.
This behaves, for macro-enabled code, somewhat like the built-in `exec` for
regular code, but instead of a dictionary, we take in an optional module.
The module acts pretty much just as a namespace, though it is a
`types.ModuleType` instance, and does usually live in `sys.modules`.
This unifies the handling of run-time compiled and imported code,
since importing a source file always produces a module.
`source` supports the same formats as in `expand`, plus passthrough
for an already compiled code object that represents a module
(i.e. the output of our `compile`).
If `source` is not yet compiled, and the first statement in it is a static
string (i.e. no f-strings or string arithmetic), it is assigned to the
docstring of the module the code runs in. Otherwise the module docstring
is set to `None`.
The `module` parameter allows to run more code in the context of an
existing module. It can be a dotted name (looked up in `sys.modules`)
or a `types.ModuleType` object (such as returned by this function).
If `module is None`, a new module is created with autogenerated unique
values for `__name__` and `__file__`.
If you need to create a new module, but with a specific dotted name in
`sys.modules` and/or filename, call `create_module` first, and then
pass in the result here as `module`.
If you want to use a temporary module, but always the same one (e.g.
if you run a lot of snippets and worry about adding lots of keys to
`sys.modules`), you can `gensym` a name for it, pass that to
`create_module`, and then pass the resulting module here.
When a new module is created, it is inserted into `sys.modules` **before**
the code runs. If you need to remove it from there later, the key is
`module.__name__`, as usual. (Note that removing it from `sys.modules`
will not remove it from its parent package's namespace, if any. A module
has a parent package if the module's dotted name contains at least one dot.)
Return value is the module, after the code has been `exec`'d in its `__dict__`.
Examples::
from mcpyrate.quotes import macros, q
import copy
from mcpyrate.compiler import run, create_module
from mcpyrate import gensym
with q as quoted:
'''This quoted snippet is considered by `run` as a module.
You can put a module docstring here if you want.
This code can use macros and multi-phase compilation.
To do that, you have to import the macros (and/or enable
the multi-phase compiler) here, at the top level of the
quoted snippet.
'''
x = 21
module = run(quoted) # run in a new module, don't care about name
assert module.x == 21
assert module.__doc__.startswith("This")
with q as quoted:
x = 2 * x
run(quoted, module) # run in the namespace of an existing module
assert module.x == 42
# run in a module with a custom dotted name and filename
mymodule = create_module("mymod", filename="some descriptive string")
with q as quoted:
x = 17
run(quoted, mymodule)
assert mymodule.x == 17
# run in a temporary module, but always use the same one
tempmodule = create_module(gensym("temporary_module"))
for _ in range(10000):
run(quoted, tempmodule)
# how to safely reset a temporary module between runs,
# preserving metadata such as `__name__` and `__file__`.
tempmodule = create_module(gensym("temporary_module"))
metadata = copy.copy(tempmodule.__dict__)
def reset():
tempmodule.__dict__.clear()
tempmodule.__dict__.update(metadata)
for _ in range(10000):
reset()
run(quoted, tempmodule)
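
    One more sketch (hedged; `"mymod2"` is a hypothetical name): since `module`
    may also be given as a dotted name already present in `sys.modules`, the
    `mymodule` example above can equivalently be written as::

        import sys

        create_module("mymod2", filename="some other descriptive string")
        with q as quoted:
            x = 17
        run(quoted, "mymod2")  # dotted name, looked up in `sys.modules`
        assert sys.modules["mymod2"].x == 17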
"""
if module is not None and not isinstance(module, (ModuleType, str)):
raise TypeError(f"`module` must be a `types.ModuleType`, a dotted name as `str`, or `None`; got {type(module)} with value {repr(module)}")
if module is None:
module = create_module()
elif isinstance(module, str):
dotted_name = module
try:
module = sys.modules[dotted_name]
except KeyError:
err = ModuleNotFoundError(f"Module '{dotted_name}' not found in `sys.modules`")
err.__suppress_context__ = True
raise err
filename = module.__file__
self_module = module.__name__
if isinstance(source, CodeType): # already compiled?
code = source
module.__doc__ = None
else:
code, docstring = _compile(source, filename=filename, self_module=self_module, optimize=optimize)
module.__doc__ = docstring
exec(code, module.__dict__)
return module


def create_module(dotted_name=None, filename=None, *, update_parent=True):
    """Create a new blank module at run time, insert it into `sys.modules`, and return it.

    This is a utility function that closely emulates what Python's standard
    importer does. It fills in some attributes of the module, and inserts the
    new module into `sys.modules`. Used by `run` when no module is given.

    However, this does not care whether a module by the given dotted name is already
    in `sys.modules`; if so, its entry will be overwritten.

    `dotted_name`:  Fully qualified name of the module, for `sys.modules`. Optional.
                    Used as the `__name__` attribute of the module. If not provided,
                    a unique placeholder name will be auto-generated.

                    If `dotted_name` has at least one dot in it, the parent package
                    for the new module must already exist in `sys.modules`. The new
                    module's `__package__` attribute is set to the dotted name of
                    the parent.

                    If `dotted_name` has no dots in it, the new module is a top-level
                    module; its `__package__` attribute is set to `None`.

    `filename`:     Full path to the `.py` file the module represents, if applicable.
                    Otherwise some descriptive string is recommended. Optional.
                    Used as the `__file__` attribute of the module. If not provided,
                    a description will be auto-generated (based on `dotted_name` if
                    that was provided).

    `update_parent`: bool, whether to honor Python's package semantics.
                    This parameter is used only when `dotted_name` has at least
                    one dot in it.

                    If `update_parent=True`, the new module is added to its
                    parent's namespace, like Python's importer would do.
                    Almost always, this is the right thing to do to achieve
                    least astonishment.

                    If `update_parent=False`, the parent module is not touched.
                    This is occasionally useful, to avoid causing any changes to
                    program state outside the new module object, while allowing
                    the execution of code that uses relative imports in the context
                    of the new module.

                    An example of when it is the right thing to **not** honor
                    package semantics can be found in the multi-phase compiler.
                    It must avoid updating parent modules when compiling a temporary
                    higher-phase module, so that any existing references in other
                    modules (to an old but complete version of the module being
                    compiled) will not be clobbered with ones pointing to the
                    temporary module (that is incomplete, because the module
                    being compiled hasn't reached phase 0 yet).
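
    A small usage sketch (hedged; the names `mypkg` and `mypkg.tools` are
    hypothetical)::

        import sys
        from mcpyrate.compiler import create_module

        pkg = create_module("mypkg")        # top-level; `__package__` is `None`
        sub = create_module("mypkg.tools")  # parent must already be in `sys.modules`
        assert sys.modules["mypkg.tools"] is sub
        assert pkg.tools is sub             # with `update_parent=True` (the default)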
"""
if dotted_name:
if not isinstance(dotted_name, str):
raise TypeError(f"`dotted_name` must be an `str`, got {type(dotted_name)} with value {repr(dotted_name)}")
path = dotted_name.split(".")
if not all(component.isidentifier() for component in path):
raise TypeError(f"each component of `dotted_name` must be a valid identifier`, got {repr(dotted_name)}")
if filename and not isinstance(filename, str):
raise TypeError(f"`filename` must be an `str`, got {type(filename)} with value {repr(filename)}")
uuid = gensym("")
if not filename:
if dotted_name:
filename = f"<dynamically created module '{dotted_name}'>"
else:
filename = f"<dynamically created module {uuid}>"
dotted_name = dotted_name or f"dynamically_created_module_{uuid}"
# Look at the definition of `types.ModuleType` for available attributes.
#
# We always populate `__name__` and `__file__`, and when applicable, `__package__`.
#
# `__loader__` and `__spec__` are left to the default value `None`, because
# those don't make sense for a dynamically created module.
#
# `__doc__` can be filled later (by `run`, if that is used); we don't have the AST yet.
#
# `__dict__` is left at the default value, the empty dictionary. It is filled later,
# when some code is executed in this module.
#
module = ModuleType(dotted_name)
module.__name__ = dotted_name
module.__file__ = filename
# Manage the package abstraction, like the importer does - with the difference that we
# shouldn't import parent packages here. To keep things simple, we only allow creating
# a module with dots in the name if its parent package already exists in `sys.modules`.
if dotted_name.find(".") != -1:
packagename, finalcomponent = dotted_name.rsplit(".", maxsplit=1)
package = sys.modules.get(packagename, None)
if not package:
raise ModuleNotFoundError(f"while dynamically creating module '{dotted_name}': its parent package '{packagename}' not found in `sys.modules`")
module.__package__ = packagename
if update_parent:
# The standard importer adds submodules to the package namespace, so we should too.
# http://python-notes.curiousefficiency.org/en/latest/python_concepts/import_traps.html
setattr(package, finalcomponent, module)
sys.modules[dotted_name] = module
return module