Skip to content

Commit e4d2fb2

Browse files
authored
[3.12] gh-112943: Correctly compute end offsets for multiline tokens in the tokenize module (GH-112949) (#112957)
(cherry picked from commit a135a6d)
1 parent f3933d4 commit e4d2fb2

File tree

5 files changed

+25
-6
lines changed

5 files changed

+25
-6
lines changed

Lib/test/test_tokenize.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -620,6 +620,16 @@ def test_string(self):
620620
OP '}' (3, 0) (3, 1)
621621
FSTRING_MIDDLE '__' (3, 1) (3, 3)
622622
FSTRING_END "'" (3, 3) (3, 4)
623+
""")
624+
625+
self.check_tokenize("""\
626+
'''Autorzy, którzy tą jednostkę mają wpisani jako AKTUALNA -- czyli
627+
aktualni pracownicy, obecni pracownicy'''
628+
""", """\
629+
INDENT ' ' (1, 0) (1, 4)
630+
STRING "'''Autorzy, którzy tą jednostkę mają wpisani jako AKTUALNA -- czyli\\n aktualni pracownicy, obecni pracownicy'''" (1, 4) (2, 45)
631+
NEWLINE '\\n' (2, 45) (2, 46)
632+
DEDENT '' (3, 0) (3, 0)
623633
""")
624634

625635
def test_function(self):
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
Correctly compute end column offsets for multiline tokens in the
2+
:mod:`tokenize` module. Patch by Pablo Galindo.

Parser/pegen.c

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -18,12 +18,8 @@ _PyPegen_interactive_exit(Parser *p)
1818
}
1919

2020
Py_ssize_t
21-
_PyPegen_byte_offset_to_character_offset(PyObject *line, Py_ssize_t col_offset)
21+
_PyPegen_byte_offset_to_character_offset_raw(const char* str, Py_ssize_t col_offset)
2222
{
23-
const char *str = PyUnicode_AsUTF8(line);
24-
if (!str) {
25-
return -1;
26-
}
2723
Py_ssize_t len = strlen(str);
2824
if (col_offset > len + 1) {
2925
col_offset = len + 1;
@@ -93,6 +89,16 @@ _PyPegen_calculate_display_width(PyObject *line, Py_ssize_t character_offset)
9389
return width;
9490
}
9591

92+
Py_ssize_t
93+
_PyPegen_byte_offset_to_character_offset(PyObject *line, Py_ssize_t col_offset)
94+
{
95+
const char *str = PyUnicode_AsUTF8(line);
96+
if (!str) {
97+
return -1;
98+
}
99+
return _PyPegen_byte_offset_to_character_offset_raw(str, col_offset);
100+
}
101+
96102
// Here, mark is the start of the node, while p->mark is the end.
97103
// If node==NULL, they should be the same.
98104
int

Parser/pegen.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -151,6 +151,7 @@ expr_ty _PyPegen_name_token(Parser *p);
151151
expr_ty _PyPegen_number_token(Parser *p);
152152
void *_PyPegen_string_token(Parser *p);
153153
Py_ssize_t _PyPegen_byte_offset_to_character_offset(PyObject *line, Py_ssize_t col_offset);
154+
Py_ssize_t _PyPegen_byte_offset_to_character_offset_raw(const char*, Py_ssize_t col_offset);
154155
Py_ssize_t _PyPegen_calculate_display_width(PyObject *segment, Py_ssize_t character_offset);
155156

156157
// Error handling functions and APIs

Python/Python-tokenize.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -224,7 +224,7 @@ tokenizeriter_next(tokenizeriterobject *it)
224224
col_offset = _PyPegen_byte_offset_to_character_offset(line, token.start - line_start);
225225
}
226226
if (token.end != NULL && token.end >= it->tok->line_start) {
227-
end_col_offset = _PyPegen_byte_offset_to_character_offset(line, token.end - it->tok->line_start);
227+
end_col_offset = _PyPegen_byte_offset_to_character_offset_raw(it->tok->line_start, token.end - it->tok->line_start);
228228
}
229229

230230
if (it->tok->tok_extra_tokens) {

0 commit comments

Comments
 (0)