CPython

This document is a crash course on how the Python interpreter CPython (written in C) works. The source code is available here: https://github.com/python/cpython

Objects

In Python, every variable contains a pointer to an object. An object is a struct.

Example of a bigint

Writting the Python code

x = 5

means that internally x is a PyLongObject*, that is a pointer to a PyLongObject which stores data to manage the memory, plus the data 5.

Example of a list

Writting the Python code

L = [5, 2]

means that L is a PyListObject*, i.e. a pointer to a PyListObject.

Example of a tuple

Writting the Python code

L = (5, 2)

means that L is a pointer to a PyTupleObject.

Common data to any data in Python: PyObject

Many structs (like PyLongObject, PyListObject, etc.) actually extends the basic struct PyObject.

typedef struct _object PyObject;

It means that structs (like PyLongObject, PyListObject, etc.) always contain the data for a PyObject, stored in a field called ob_base.

/* PyObject_HEAD defines the initial segment of every PyObject. */
#define PyObject_HEAD                   PyObject ob_base;
struct _object {
    // ob_tid stores the thread id (or zero). It is also used by the GC and the
    // trashcan mechanism as a linked list pointer and by the GC to store the
    // computed "gc_refs" refcount.
    uintptr_t ob_tid;
    uint16_t _padding;
    PyMutex ob_mutex;           // per-object lock
    uint8_t ob_gc_bits;         // gc-related state
    uint32_t ob_ref_local;      // local reference count
    Py_ssize_t ob_ref_shared;   // shared (atomic) reference count
    PyTypeObject *ob_type;
};
  • ob_ref_local is the number of pointers pointing to the current object.

  • The field ob_type contains the type of the current object. ob_type is also an object of type PyTypeObject which can be casted into PyObject.

Implementation of the Python keyword is

https://github.com/python/cpython/blob/main/Include/object.h#L166

// Test if the 'x' object is the 'y' object, the same as "x is y" in Python.
PyAPI_FUNC(int) Py_Is(PyObject *x, PyObject *y);
#define Py_Is(x, y) ((x) == (y))

Big int in Python

Arbitrary big integers are represented by an object of C type PyLongObject:

https://github.com/python/cpython/blob/main/Include/pytypedefs.h#L19

typedef struct _longobject PyLongObject;

https://github.com/python/cpython/blob/main/Include/cpython/longintrepr.h

struct _longobject {
    PyObject_HEAD
    _PyLongValue long_value;
};
  • PyObject_HEAD adds the necesarry fields that are common to any Python object
  • long_value stores the content
typedef struct _PyLongValue {
    uintptr_t lv_tag; /* Number of digits, sign and flags */
    digit ob_digit[1];
} _PyLongValue;
  • uintptr_t est un type entier non signé qui a assez de bits pour représenter un pointeur. Il est défini dans <stdint.h>

  • digit est un type de "chiffre" (généralement entre 0 et $2^{30}-1$)

  • The array ob_digit is of length 1 by default (when the user just write _PyLongValue v) and it can be of any size when the struct is allocated on the heap with malloc (see the creation of such an object here https://github.com/python/cpython/blob/main/Objects/longobject.c#L155)

Reading lv_tag

Getting the sign

static inline bool
_PyLong_IsZero(const PyLongObject *op)
{
    return (op->long_value.lv_tag & SIGN_MASK) == SIGN_ZERO;
}

static inline bool
_PyLong_IsNegative(const PyLongObject *op)
{
    return (op->long_value.lv_tag & SIGN_MASK) == SIGN_NEGATIVE;
}

static inline bool
_PyLong_IsPositive(const PyLongObject *op)
{
    return (op->long_value.lv_tag & SIGN_MASK) == 0;
}

Getting the number of digits

For get the number of digits, we read the field lv_tag appropriately:

https://github.com/python/cpython/blob/main/Include/internal/pycore_long.h#L227

static inline Py_ssize_t
_PyLong_DigitCount(const PyLongObject *op)
{
    assert(PyLong_Check(op));
    return (Py_ssize_t)(op->long_value.lv_tag >> NON_SIZE_BITS);
}

Implementation

https://github.com/python/cpython/blob/main/Objects/longobject.c

Addition

https://github.com/python/cpython/blob/main/Objects/longobject.c#L3750

static PyLongObject *
long_add(PyLongObject *a, PyLongObject *b)
{

    ...

}

Multiplication

https://github.com/python/cpython/blob/main/Objects/longobject.c#L4289

static PyLongObject*
long_mul(PyLongObject *a, PyLongObject *b)
{
    /* fast path for single-digit multiplication */
    if (_PyLong_BothAreCompact(a, b)) {
        stwodigits v = medium_value(a) * medium_value(b);
        return _PyLong_FromSTwoDigits(v);
    }

    PyLongObject *z = k_mul(a, b);
    /* Negate if exactly one of the inputs is negative. */
    if (!_PyLong_SameSign(a, b) && z) {
        _PyLong_Negate(&z);
    }
    return z;
}

Type

Remember the field PyTypeObject *ob_type; of any PyObject object.

https://github.com/python/cpython/blob/main/Doc/includes/typestruct.h#L1

typedef struct _typeobject {
    PyObject_VAR_HEAD
    const char *tp_name; /* For printing, in format "<module>.<name>" */
    Py_ssize_t tp_basicsize, tp_itemsize; /* For allocation */

    /* Methods to implement standard operations */

    destructor tp_dealloc;
    Py_ssize_t tp_vectorcall_offset;
    getattrfunc tp_getattr;
    setattrfunc tp_setattr;
    PyAsyncMethods *tp_as_async; /* formerly known as tp_compare (Python 2)
                                    or tp_reserved (Python 3) */
    reprfunc tp_repr;

    /* Method suites for standard classes */

    PyNumberMethods *tp_as_number;
    PySequenceMethods *tp_as_sequence;
    PyMappingMethods *tp_as_mapping;

    /* More standard operations (here for binary compatibility) */

    hashfunc tp_hash;
    ternaryfunc tp_call;
    reprfunc tp_str;
    getattrofunc tp_getattro;
    setattrofunc tp_setattro;

    /* Functions to access object as input/output buffer */
    PyBufferProcs *tp_as_buffer;

    /* Flags to define presence of optional/expanded features */
    unsigned long tp_flags;

    const char *tp_doc; /* Documentation string */

    /* Assigned meaning in release 2.0 */
    /* call function for all accessible objects */
    traverseproc tp_traverse;

    /* delete references to contained objects */
    inquiry tp_clear;

    /* Assigned meaning in release 2.1 */
    /* rich comparisons */
    richcmpfunc tp_richcompare;

    /* weak reference enabler */
    Py_ssize_t tp_weaklistoffset;

    /* Iterators */
    getiterfunc tp_iter;
    iternextfunc tp_iternext;

    /* Attribute descriptor and subclassing stuff */
    struct PyMethodDef *tp_methods;
    struct PyMemberDef *tp_members;
    struct PyGetSetDef *tp_getset;
    // Strong reference on a heap type, borrowed reference on a static type
    struct _typeobject *tp_base;
    PyObject *tp_dict;
    descrgetfunc tp_descr_get;
    descrsetfunc tp_descr_set;
    Py_ssize_t tp_dictoffset;
    initproc tp_init;
    allocfunc tp_alloc;
    newfunc tp_new;
    freefunc tp_free; /* Low-level free-memory routine */
    inquiry tp_is_gc; /* For PyObject_IS_GC */
    PyObject *tp_bases;
    PyObject *tp_mro; /* method resolution order */
    PyObject *tp_cache;
    PyObject *tp_subclasses;
    PyObject *tp_weaklist;
    destructor tp_del;

    /* Type attribute cache version tag. Added in version 2.6 */
    unsigned int tp_version_tag;

    destructor tp_finalize;
    vectorcallfunc tp_vectorcall;

    /* bitset of which type-watchers care about this type */
    unsigned char tp_watched;
} PyTypeObject;

And here is the type of any object type: PyType_Type. It means that if t is a type, then t.ob_type == &PyType_Type.

https://github.com/python/cpython/blob/main/Objects/typeobject.c#L6294C1-L6339C3

PyTypeObject PyType_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "type",                                     /* tp_name */
    sizeof(PyHeapTypeObject),                   /* tp_basicsize */
    sizeof(PyMemberDef),                        /* tp_itemsize */
    type_dealloc,                               /* tp_dealloc */
    offsetof(PyTypeObject, tp_vectorcall),      /* tp_vectorcall_offset */
    0,                                          /* tp_getattr */
    0,                                          /* tp_setattr */
    0,                                          /* tp_as_async */
    type_repr,                                  /* tp_repr */
    &type_as_number,                            /* tp_as_number */
    0,                                          /* tp_as_sequence */
    0,                                          /* tp_as_mapping */
    0,                                          /* tp_hash */
    type_call,                                  /* tp_call */
    0,                                          /* tp_str */
    _Py_type_getattro,                          /* tp_getattro */
    type_setattro,                              /* tp_setattro */
    0,                                          /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC |
    Py_TPFLAGS_BASETYPE | Py_TPFLAGS_TYPE_SUBCLASS |
    Py_TPFLAGS_HAVE_VECTORCALL |
    Py_TPFLAGS_ITEMS_AT_END,                    /* tp_flags */
    type_doc,                                   /* tp_doc */
    type_traverse,                              /* tp_traverse */
    type_clear,                                 /* tp_clear */
    0,                                          /* tp_richcompare */
    offsetof(PyTypeObject, tp_weaklist),        /* tp_weaklistoffset */
    0,                                          /* tp_iter */
    0,                                          /* tp_iternext */
    type_methods,                               /* tp_methods */
    type_members,                               /* tp_members */
    type_getsets,                               /* tp_getset */
    0,                                          /* tp_base */
    0,                                          /* tp_dict */
    0,                                          /* tp_descr_get */
    0,                                          /* tp_descr_set */
    offsetof(PyTypeObject, tp_dict),            /* tp_dictoffset */
    type_init,                                  /* tp_init */
    0,                                          /* tp_alloc */
    type_new,                                   /* tp_new */
    PyObject_GC_Del,                            /* tp_free */
    type_is_gc,                                 /* tp_is_gc */
    .tp_vectorcall = type_vectorcall,
};

Lists

https://github.com/python/cpython/blob/main/Include/cpython/listobject.h#L22

typedef struct {
    PyObject_VAR_HEAD
    /* Vector of pointers to list elements.  list[0] is ob_item[0], etc. */
    PyObject **ob_item;

    /* ob_item contains space for 'allocated' elements.  The number
     * currently in use is ob_size.
     * Invariants:
     *     0 <= ob_size <= allocated
     *     len(list) == ob_size
     *     ob_item == NULL implies ob_size == allocated == 0
     * list.sort() temporarily sets allocated to -1 to detect mutations.
     *
     * Items must normally not be NULL, except during construction when
     * the list is not yet visible outside the function that builds it.
     */
    Py_ssize_t allocated;
} PyListObject;

Tuples

typedef struct {
    PyObject_VAR_HEAD
    /* ob_item contains space for 'ob_size' elements.
       Items must normally not be NULL, except during construction when
       the tuple is not yet visible outside the function that builds it. */
    PyObject *ob_item[1];
} PyTupleObject;