CPython
This document is a crash course on how the Python interpreter CPython (written in C) works. The source code is available here: https://github.com/python/cpython
Objects
In Python, every variable contains a pointer to an object. An object is a struct.
Example of a bigint
Writting the Python code
x = 5
means that internally x
is a PyLongObject*
, that is a pointer to a PyLongObject
which stores data to manage the memory, plus the data 5.
Example of a list
Writting the Python code
L = [5, 2]
means that L
is a PyListObject*
, i.e. a pointer to a PyListObject
.
Example of a tuple
Writting the Python code
L = (5, 2)
means that L
is a pointer to a PyTupleObject
.
Common data to any data in Python: PyObject
Many structs (like PyLongObject
, PyListObject
, etc.) actually extends the basic struct PyObject
.
typedef struct _object PyObject;
It means that structs (like PyLongObject
, PyListObject
, etc.) always contain the data for a PyObject, stored in a field called ob_base
.
/* PyObject_HEAD defines the initial segment of every PyObject. */
#define PyObject_HEAD PyObject ob_base;
struct _object {
// ob_tid stores the thread id (or zero). It is also used by the GC and the
// trashcan mechanism as a linked list pointer and by the GC to store the
// computed "gc_refs" refcount.
uintptr_t ob_tid;
uint16_t _padding;
PyMutex ob_mutex; // per-object lock
uint8_t ob_gc_bits; // gc-related state
uint32_t ob_ref_local; // local reference count
Py_ssize_t ob_ref_shared; // shared (atomic) reference count
PyTypeObject *ob_type;
};
ob_ref_local
is the number of pointers pointing to the current object.
- The field
ob_type
contains the type of the current object.ob_type
is also an object of typePyTypeObject
which can be casted intoPyObject
.
Implementation of the Python keyword is
https://github.com/python/cpython/blob/main/Include/object.h#L166
// Test if the 'x' object is the 'y' object, the same as "x is y" in Python.
PyAPI_FUNC(int) Py_Is(PyObject *x, PyObject *y);
#define Py_Is(x, y) ((x) == (y))
Big int in Python
Arbitrary big integers are represented by an object of C type PyLongObject
:
https://github.com/python/cpython/blob/main/Include/pytypedefs.h#L19
typedef struct _longobject PyLongObject;
https://github.com/python/cpython/blob/main/Include/cpython/longintrepr.h
struct _longobject {
PyObject_HEAD
_PyLongValue long_value;
};
PyObject_HEAD
adds the necesarry fields that are common to any Python object- long_value stores the content
typedef struct _PyLongValue {
uintptr_t lv_tag; /* Number of digits, sign and flags */
digit ob_digit[1];
} _PyLongValue;
-
uintptr_t
est un type entier non signé qui a assez de bits pour représenter un pointeur. Il est défini dans<stdint.h>
-
digit
est un type de "chiffre" (généralement entre 0 et $2^{30}-1$) -
The array
ob_digit
is of length 1 by default (when the user just write_PyLongValue v
) and it can be of any size when the struct is allocated on the heap withmalloc
(see the creation of such an object here https://github.com/python/cpython/blob/main/Objects/longobject.c#L155)
Reading lv_tag
Getting the sign
static inline bool
_PyLong_IsZero(const PyLongObject *op)
{
return (op->long_value.lv_tag & SIGN_MASK) == SIGN_ZERO;
}
static inline bool
_PyLong_IsNegative(const PyLongObject *op)
{
return (op->long_value.lv_tag & SIGN_MASK) == SIGN_NEGATIVE;
}
static inline bool
_PyLong_IsPositive(const PyLongObject *op)
{
return (op->long_value.lv_tag & SIGN_MASK) == 0;
}
Getting the number of digits
For get the number of digits, we read the field lv_tag
appropriately:
https://github.com/python/cpython/blob/main/Include/internal/pycore_long.h#L227
static inline Py_ssize_t
_PyLong_DigitCount(const PyLongObject *op)
{
assert(PyLong_Check(op));
return (Py_ssize_t)(op->long_value.lv_tag >> NON_SIZE_BITS);
}
Implementation
https://github.com/python/cpython/blob/main/Objects/longobject.c
Addition
https://github.com/python/cpython/blob/main/Objects/longobject.c#L3750
static PyLongObject *
long_add(PyLongObject *a, PyLongObject *b)
{
...
}
Multiplication
https://github.com/python/cpython/blob/main/Objects/longobject.c#L4289
static PyLongObject*
long_mul(PyLongObject *a, PyLongObject *b)
{
/* fast path for single-digit multiplication */
if (_PyLong_BothAreCompact(a, b)) {
stwodigits v = medium_value(a) * medium_value(b);
return _PyLong_FromSTwoDigits(v);
}
PyLongObject *z = k_mul(a, b);
/* Negate if exactly one of the inputs is negative. */
if (!_PyLong_SameSign(a, b) && z) {
_PyLong_Negate(&z);
}
return z;
}
Type
Remember the field PyTypeObject *ob_type;
of any PyObject
object.
https://github.com/python/cpython/blob/main/Doc/includes/typestruct.h#L1
typedef struct _typeobject {
PyObject_VAR_HEAD
const char *tp_name; /* For printing, in format "<module>.<name>" */
Py_ssize_t tp_basicsize, tp_itemsize; /* For allocation */
/* Methods to implement standard operations */
destructor tp_dealloc;
Py_ssize_t tp_vectorcall_offset;
getattrfunc tp_getattr;
setattrfunc tp_setattr;
PyAsyncMethods *tp_as_async; /* formerly known as tp_compare (Python 2)
or tp_reserved (Python 3) */
reprfunc tp_repr;
/* Method suites for standard classes */
PyNumberMethods *tp_as_number;
PySequenceMethods *tp_as_sequence;
PyMappingMethods *tp_as_mapping;
/* More standard operations (here for binary compatibility) */
hashfunc tp_hash;
ternaryfunc tp_call;
reprfunc tp_str;
getattrofunc tp_getattro;
setattrofunc tp_setattro;
/* Functions to access object as input/output buffer */
PyBufferProcs *tp_as_buffer;
/* Flags to define presence of optional/expanded features */
unsigned long tp_flags;
const char *tp_doc; /* Documentation string */
/* Assigned meaning in release 2.0 */
/* call function for all accessible objects */
traverseproc tp_traverse;
/* delete references to contained objects */
inquiry tp_clear;
/* Assigned meaning in release 2.1 */
/* rich comparisons */
richcmpfunc tp_richcompare;
/* weak reference enabler */
Py_ssize_t tp_weaklistoffset;
/* Iterators */
getiterfunc tp_iter;
iternextfunc tp_iternext;
/* Attribute descriptor and subclassing stuff */
struct PyMethodDef *tp_methods;
struct PyMemberDef *tp_members;
struct PyGetSetDef *tp_getset;
// Strong reference on a heap type, borrowed reference on a static type
struct _typeobject *tp_base;
PyObject *tp_dict;
descrgetfunc tp_descr_get;
descrsetfunc tp_descr_set;
Py_ssize_t tp_dictoffset;
initproc tp_init;
allocfunc tp_alloc;
newfunc tp_new;
freefunc tp_free; /* Low-level free-memory routine */
inquiry tp_is_gc; /* For PyObject_IS_GC */
PyObject *tp_bases;
PyObject *tp_mro; /* method resolution order */
PyObject *tp_cache;
PyObject *tp_subclasses;
PyObject *tp_weaklist;
destructor tp_del;
/* Type attribute cache version tag. Added in version 2.6 */
unsigned int tp_version_tag;
destructor tp_finalize;
vectorcallfunc tp_vectorcall;
/* bitset of which type-watchers care about this type */
unsigned char tp_watched;
} PyTypeObject;
And here is the type of any object type: PyType_Type
. It means that if t
is a type, then t.ob_type == &PyType_Type
.
https://github.com/python/cpython/blob/main/Objects/typeobject.c#L6294C1-L6339C3
PyTypeObject PyType_Type = {
PyVarObject_HEAD_INIT(&PyType_Type, 0)
"type", /* tp_name */
sizeof(PyHeapTypeObject), /* tp_basicsize */
sizeof(PyMemberDef), /* tp_itemsize */
type_dealloc, /* tp_dealloc */
offsetof(PyTypeObject, tp_vectorcall), /* tp_vectorcall_offset */
0, /* tp_getattr */
0, /* tp_setattr */
0, /* tp_as_async */
type_repr, /* tp_repr */
&type_as_number, /* tp_as_number */
0, /* tp_as_sequence */
0, /* tp_as_mapping */
0, /* tp_hash */
type_call, /* tp_call */
0, /* tp_str */
_Py_type_getattro, /* tp_getattro */
type_setattro, /* tp_setattro */
0, /* tp_as_buffer */
Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC |
Py_TPFLAGS_BASETYPE | Py_TPFLAGS_TYPE_SUBCLASS |
Py_TPFLAGS_HAVE_VECTORCALL |
Py_TPFLAGS_ITEMS_AT_END, /* tp_flags */
type_doc, /* tp_doc */
type_traverse, /* tp_traverse */
type_clear, /* tp_clear */
0, /* tp_richcompare */
offsetof(PyTypeObject, tp_weaklist), /* tp_weaklistoffset */
0, /* tp_iter */
0, /* tp_iternext */
type_methods, /* tp_methods */
type_members, /* tp_members */
type_getsets, /* tp_getset */
0, /* tp_base */
0, /* tp_dict */
0, /* tp_descr_get */
0, /* tp_descr_set */
offsetof(PyTypeObject, tp_dict), /* tp_dictoffset */
type_init, /* tp_init */
0, /* tp_alloc */
type_new, /* tp_new */
PyObject_GC_Del, /* tp_free */
type_is_gc, /* tp_is_gc */
.tp_vectorcall = type_vectorcall,
};
Lists
https://github.com/python/cpython/blob/main/Include/cpython/listobject.h#L22
typedef struct {
PyObject_VAR_HEAD
/* Vector of pointers to list elements. list[0] is ob_item[0], etc. */
PyObject **ob_item;
/* ob_item contains space for 'allocated' elements. The number
* currently in use is ob_size.
* Invariants:
* 0 <= ob_size <= allocated
* len(list) == ob_size
* ob_item == NULL implies ob_size == allocated == 0
* list.sort() temporarily sets allocated to -1 to detect mutations.
*
* Items must normally not be NULL, except during construction when
* the list is not yet visible outside the function that builds it.
*/
Py_ssize_t allocated;
} PyListObject;
Tuples
typedef struct {
PyObject_VAR_HEAD
/* ob_item contains space for 'ob_size' elements.
Items must normally not be NULL, except during construction when
the tuple is not yet visible outside the function that builds it. */
PyObject *ob_item[1];
} PyTupleObject;