Skip to content

Commit 2566b74

Browse files
methanemerwok
andauthored
gh-81283: compiler: remove indent from docstring (#106411)
Co-authored-by: Éric <merwok@netwok.org>
1 parent bbf6297 commit 2566b74

File tree

9 files changed

+246
-30
lines changed

9 files changed

+246
-30
lines changed

Doc/whatsnew/3.13.rst

+7
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,13 @@ Other Language Changes
7979
* Allow the *count* argument of :meth:`str.replace` to be a keyword.
8080
(Contributed by Hugo van Kemenade in :gh:`106487`.)
8181

82+
* The compiler now strips indents from docstrings.
83+
This will reduce the size of :term:`bytecode cache <bytecode>` (e.g. ``.pyc`` file).
84+
For example, cache file size for ``sqlalchemy.orm.session`` in SQLAlchemy 2.0
85+
is reduced by about 5%.
86+
This change will affect tools using docstrings, like :mod:`doctest`.
87+
(Contributed by Inada Naoki in :gh:`81283`.)
88+
8289
New Modules
8390
===========
8491

Include/internal/pycore_compile.h

+2
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,8 @@ int _PyCompile_ConstCacheMergeOne(PyObject *const_cache, PyObject **obj);
9191

9292
/* Access compiler internals for unit testing */
9393

94+
PyAPI_FUNC(PyObject*) _PyCompile_CleanDoc(PyObject *doc);
95+
9496
PyAPI_FUNC(PyObject*) _PyCompile_CodeGen(
9597
PyObject *ast,
9698
PyObject *filename,

Lib/inspect.py

+22-23
Original file line numberDiff line numberDiff line change
@@ -881,29 +881,28 @@ def cleandoc(doc):
881881
882882
Any whitespace that can be uniformly removed from the second line
883883
onwards is removed."""
884-
try:
885-
lines = doc.expandtabs().split('\n')
886-
except UnicodeError:
887-
return None
888-
else:
889-
# Find minimum indentation of any non-blank lines after first line.
890-
margin = sys.maxsize
891-
for line in lines[1:]:
892-
content = len(line.lstrip())
893-
if content:
894-
indent = len(line) - content
895-
margin = min(margin, indent)
896-
# Remove indentation.
897-
if lines:
898-
lines[0] = lines[0].lstrip()
899-
if margin < sys.maxsize:
900-
for i in range(1, len(lines)): lines[i] = lines[i][margin:]
901-
# Remove any trailing or leading blank lines.
902-
while lines and not lines[-1]:
903-
lines.pop()
904-
while lines and not lines[0]:
905-
lines.pop(0)
906-
return '\n'.join(lines)
884+
lines = doc.expandtabs().split('\n')
885+
886+
# Find minimum indentation of any non-blank lines after first line.
887+
margin = sys.maxsize
888+
for line in lines[1:]:
889+
content = len(line.lstrip(' '))
890+
if content:
891+
indent = len(line) - content
892+
margin = min(margin, indent)
893+
# Remove indentation.
894+
if lines:
895+
lines[0] = lines[0].lstrip(' ')
896+
if margin < sys.maxsize:
897+
for i in range(1, len(lines)):
898+
lines[i] = lines[i][margin:]
899+
# Remove any trailing or leading blank lines.
900+
while lines and not lines[-1]:
901+
lines.pop()
902+
while lines and not lines[0]:
903+
lines.pop(0)
904+
return '\n'.join(lines)
905+
907906

908907
def getfile(object):
909908
"""Work out which source or compiled file an object was defined in."""

Lib/test/test_doctest.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -1287,14 +1287,14 @@ def optionflags(): r"""
12871287
treated as equal:
12881288
12891289
>>> def f(x):
1290-
... '>>> print(1, 2, 3)\n 1 2\n 3'
1290+
... '\n>>> print(1, 2, 3)\n 1 2\n 3'
12911291
12921292
>>> # Without the flag:
12931293
>>> test = doctest.DocTestFinder().find(f)[0]
12941294
>>> doctest.DocTestRunner(verbose=False).run(test)
12951295
... # doctest: +ELLIPSIS
12961296
**********************************************************************
1297-
File ..., line 2, in f
1297+
File ..., line 3, in f
12981298
Failed example:
12991299
print(1, 2, 3)
13001300
Expected:

Lib/test/test_inspect.py

+33-2
Original file line numberDiff line numberDiff line change
@@ -596,9 +596,40 @@ def test_finddoc(self):
596596
self.assertEqual(finddoc(int.from_bytes), int.from_bytes.__doc__)
597597
self.assertEqual(finddoc(int.real), int.real.__doc__)
598598

599+
cleandoc_testdata = [
600+
# first line should have different margin
601+
(' An\n indented\n docstring.', 'An\nindented\n docstring.'),
602+
# trailing whitespace is not removed.
603+
(' An \n \n indented \n docstring. ',
604+
'An \n \nindented \n docstring. '),
605+
# NUL is not termination.
606+
('doc\0string\n\n second\0line\n third\0line\0',
607+
'doc\0string\n\nsecond\0line\nthird\0line\0'),
608+
# first line is lstrip()-ped. other lines are kept when there is no margin.
609+
(' ', ''),
610+
# compiler.cleandoc() doesn't strip leading/trailing newlines
611+
# to keep maximum backward compatibility.
612+
# inspect.cleandoc() removes them.
613+
('\n\n\n first paragraph\n\n second paragraph\n\n',
614+
'\n\n\nfirst paragraph\n\n second paragraph\n\n'),
615+
(' \n \n \n ', '\n \n \n '),
616+
]
617+
599618
def test_cleandoc(self):
600-
self.assertEqual(inspect.cleandoc('An\n indented\n docstring.'),
601-
'An\nindented\ndocstring.')
619+
func = inspect.cleandoc
620+
for i, (input, expected) in enumerate(self.cleandoc_testdata):
621+
# only inspect.cleandoc() strips leading/trailing \n
622+
expected = expected.strip('\n')
623+
with self.subTest(i=i):
624+
self.assertEqual(func(input), expected)
625+
626+
@cpython_only
627+
def test_c_cleandoc(self):
628+
import _testinternalcapi
629+
func = _testinternalcapi.compiler_cleandoc
630+
for i, (input, expected) in enumerate(self.cleandoc_testdata):
631+
with self.subTest(i=i):
632+
self.assertEqual(func(input), expected)
602633

603634
def test_getcomments(self):
604635
self.assertEqual(inspect.getcomments(mod), '# line 1\n')
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
The compiler now strips indents from docstrings. It reduces ``pyc`` file size by 5%
2+
when the module is heavily documented. This change affects ``__doc__``, so
3+
tools like doctest will be affected.

Modules/_testinternalcapi.c

+19-1
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
#include "pycore_atomic_funcs.h" // _Py_atomic_int_get()
1616
#include "pycore_bitutils.h" // _Py_bswap32()
1717
#include "pycore_bytesobject.h" // _PyBytes_Find()
18-
#include "pycore_compile.h" // _PyCompile_CodeGen, _PyCompile_OptimizeCfg, _PyCompile_Assemble
18+
#include "pycore_compile.h" // _PyCompile_CodeGen, _PyCompile_OptimizeCfg, _PyCompile_Assemble, _PyCompile_CleanDoc
1919
#include "pycore_ceval.h" // _PyEval_AddPendingCall
2020
#include "pycore_fileutils.h" // _Py_normpath
2121
#include "pycore_frame.h" // _PyInterpreterFrame
@@ -704,6 +704,23 @@ set_eval_frame_record(PyObject *self, PyObject *list)
704704
Py_RETURN_NONE;
705705
}
706706

707+
/*[clinic input]
708+
709+
_testinternalcapi.compiler_cleandoc -> object
710+
711+
doc: unicode
712+
713+
C implementation of inspect.cleandoc().
714+
[clinic start generated code]*/
715+
716+
static PyObject *
717+
_testinternalcapi_compiler_cleandoc_impl(PyObject *module, PyObject *doc)
718+
/*[clinic end generated code: output=2dd203a80feff5bc input=2de03fab931d9cdc]*/
719+
{
720+
return _PyCompile_CleanDoc(doc);
721+
}
722+
723+
707724
/*[clinic input]
708725
709726
_testinternalcapi.compiler_codegen -> object
@@ -1448,6 +1465,7 @@ static PyMethodDef module_functions[] = {
14481465
{"DecodeLocaleEx", decode_locale_ex, METH_VARARGS},
14491466
{"set_eval_frame_default", set_eval_frame_default, METH_NOARGS, NULL},
14501467
{"set_eval_frame_record", set_eval_frame_record, METH_O, NULL},
1468+
_TESTINTERNALCAPI_COMPILER_CLEANDOC_METHODDEF
14511469
_TESTINTERNALCAPI_COMPILER_CODEGEN_METHODDEF
14521470
_TESTINTERNALCAPI_OPTIMIZE_CFG_METHODDEF
14531471
_TESTINTERNALCAPI_ASSEMBLE_CODE_OBJECT_METHODDEF

Modules/clinic/_testinternalcapi.c.h

+60-1
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Python/compile.c

+98-1
Original file line numberDiff line numberDiff line change
@@ -1704,10 +1704,16 @@ compiler_body(struct compiler *c, location loc, asdl_stmt_seq *stmts)
17041704
if (c->c_optimize < 2) {
17051705
docstring = _PyAST_GetDocString(stmts);
17061706
if (docstring) {
1707+
PyObject *cleandoc = _PyCompile_CleanDoc(docstring);
1708+
if (cleandoc == NULL) {
1709+
return ERROR;
1710+
}
17071711
i = 1;
17081712
st = (stmt_ty)asdl_seq_GET(stmts, 0);
17091713
assert(st->kind == Expr_kind);
1710-
VISIT(c, expr, st->v.Expr.value);
1714+
location loc = LOC(st->v.Expr.value);
1715+
ADDOP_LOAD_CONST(c, loc, cleandoc);
1716+
Py_DECREF(cleandoc);
17111717
RETURN_IF_ERROR(compiler_nameop(c, NO_LOCATION, &_Py_ID(__doc__), Store));
17121718
}
17131719
}
@@ -2252,11 +2258,19 @@ compiler_function_body(struct compiler *c, stmt_ty s, int is_async, Py_ssize_t f
22522258
/* if not -OO mode, add docstring */
22532259
if (c->c_optimize < 2) {
22542260
docstring = _PyAST_GetDocString(body);
2261+
if (docstring) {
2262+
docstring = _PyCompile_CleanDoc(docstring);
2263+
if (docstring == NULL) {
2264+
compiler_exit_scope(c);
2265+
return ERROR;
2266+
}
2267+
}
22552268
}
22562269
if (compiler_add_const(c->c_const_cache, c->u, docstring ? docstring : Py_None) < 0) {
22572270
compiler_exit_scope(c);
22582271
return ERROR;
22592272
}
2273+
Py_XDECREF(docstring);
22602274

22612275
c->u->u_metadata.u_argcount = asdl_seq_LEN(args->args);
22622276
c->u->u_metadata.u_posonlyargcount = asdl_seq_LEN(args->posonlyargs);
@@ -7967,6 +7981,89 @@ cfg_to_instructions(cfg_builder *g)
79677981
return NULL;
79687982
}
79697983

7984+
// C implementation of inspect.cleandoc()
7985+
//
7986+
// Difference from inspect.cleandoc():
7987+
// - Do not remove leading and trailing blank lines to keep lineno.
7988+
PyObject *
7989+
_PyCompile_CleanDoc(PyObject *doc)
7990+
{
7991+
doc = PyObject_CallMethod(doc, "expandtabs", NULL);
7992+
if (doc == NULL) {
7993+
return NULL;
7994+
}
7995+
7996+
Py_ssize_t doc_size;
7997+
const char *doc_utf8 = PyUnicode_AsUTF8AndSize(doc, &doc_size);
7998+
if (doc_utf8 == NULL) {
7999+
Py_DECREF(doc);
8000+
return NULL;
8001+
}
8002+
const char *p = doc_utf8;
8003+
const char *pend = p + doc_size;
8004+
8005+
// First pass: find minimum indentation of any non-blank lines
8006+
// after first line.
8007+
while (p < pend && *p++ != '\n') {
8008+
}
8009+
8010+
Py_ssize_t margin = PY_SSIZE_T_MAX;
8011+
while (p < pend) {
8012+
const char *s = p;
8013+
while (*p == ' ') p++;
8014+
if (p < pend && *p != '\n') {
8015+
margin = Py_MIN(margin, p - s);
8016+
}
8017+
while (p < pend && *p++ != '\n') {
8018+
}
8019+
}
8020+
if (margin == PY_SSIZE_T_MAX) {
8021+
margin = 0;
8022+
}
8023+
8024+
// Second pass: write cleandoc into buff.
8025+
8026+
// copy first line without leading spaces.
8027+
p = doc_utf8;
8028+
while (*p == ' ') {
8029+
p++;
8030+
}
8031+
if (p == doc_utf8 && margin == 0 ) {
8032+
// doc is already clean.
8033+
return doc;
8034+
}
8035+
8036+
char *buff = PyMem_Malloc(doc_size);
8037+
char *w = buff;
8038+
8039+
while (p < pend) {
8040+
int ch = *w++ = *p++;
8041+
if (ch == '\n') {
8042+
break;
8043+
}
8044+
}
8045+
8046+
// copy subsequent lines without margin.
8047+
while (p < pend) {
8048+
for (Py_ssize_t i = 0; i < margin; i++, p++) {
8049+
if (*p != ' ') {
8050+
assert(*p == '\n' || *p == '\0');
8051+
break;
8052+
}
8053+
}
8054+
while (p < pend) {
8055+
int ch = *w++ = *p++;
8056+
if (ch == '\n') {
8057+
break;
8058+
}
8059+
}
8060+
}
8061+
8062+
Py_DECREF(doc);
8063+
return PyUnicode_FromStringAndSize(buff, w - buff);
8064+
}
8065+
8066+
79708067
PyObject *
79718068
_PyCompile_CodeGen(PyObject *ast, PyObject *filename, PyCompilerFlags *pflags,
79728069
int optimize, int compile_mode)

0 commit comments

Comments
 (0)