Skip to content

Commit ef658dc

Browse files
committed
speed up urlize matching
1 parent eeca0fe commit ef658dc

File tree

2 files changed

+66
-51
lines changed

2 files changed

+66
-51
lines changed

CHANGES.rst

+10
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,15 @@
11
.. currentmodule:: jinja2
22

3+
Version 2.11.3
4+
--------------
5+
6+
Unreleased
7+
8+
- Improve the speed of the ``urlize`` filter by reducing regex
9+
backtracking. Email matching requires a word character at the start
10+
of the domain part, and only word characters in the TLD. :pr:`1343`
11+
12+
313
Version 2.11.2
414
--------------
515

src/jinja2/utils.py

+56-51
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@
66
from collections import deque
77
from random import choice
88
from random import randrange
9+
from string import ascii_letters as _letters
10+
from string import digits as _digits
911
from threading import Lock
1012

1113
from markupsafe import escape
@@ -16,20 +18,6 @@
1618
from ._compat import text_type
1719
from ._compat import url_quote
1820

19-
_word_split_re = re.compile(r"(\s+)")
20-
_punctuation_re = re.compile(
21-
"^(?P<lead>(?:%s)*)(?P<middle>.*?)(?P<trail>(?:%s)*)$"
22-
% (
23-
"|".join(map(re.escape, ("(", "<", "&lt;"))),
24-
"|".join(map(re.escape, (".", ",", ")", ">", "\n", "&gt;"))),
25-
)
26-
)
27-
_simple_email_re = re.compile(r"^\S+@[a-zA-Z0-9._-]+\.[a-zA-Z0-9._-]+$")
28-
_striptags_re = re.compile(r"(<!--.*?-->|<[^>]*>)")
29-
_entity_re = re.compile(r"&([^;]+);")
30-
_letters = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
31-
_digits = "0123456789"
32-
3321
# special singleton representing missing values for the runtime
3422
missing = type("MissingType", (), {"__repr__": lambda x: "missing"})()
3523

@@ -210,48 +198,65 @@ def urlize(text, trim_url_limit=None, rel=None, target=None):
210198
and (x[:limit] + (len(x) >= limit and "..." or ""))
211199
or x
212200
)
213-
words = _word_split_re.split(text_type(escape(text)))
201+
words = re.split(r"(\s+)", text_type(escape(text)))
214202
rel_attr = rel and ' rel="%s"' % text_type(escape(rel)) or ""
215203
target_attr = target and ' target="%s"' % escape(target) or ""
216204

217205
for i, word in enumerate(words):
218-
match = _punctuation_re.match(word)
206+
head, middle, tail = "", word, ""
207+
match = re.match(r"^([(<]|&lt;)+", middle)
208+
219209
if match:
220-
lead, middle, trail = match.groups()
221-
if middle.startswith("www.") or (
222-
"@" not in middle
223-
and not middle.startswith("http://")
224-
and not middle.startswith("https://")
225-
and len(middle) > 0
226-
and middle[0] in _letters + _digits
227-
and (
228-
middle.endswith(".org")
229-
or middle.endswith(".net")
230-
or middle.endswith(".com")
231-
)
232-
):
233-
middle = '<a href="http://%s"%s%s>%s</a>' % (
234-
middle,
235-
rel_attr,
236-
target_attr,
237-
trim_url(middle),
238-
)
239-
if middle.startswith("http://") or middle.startswith("https://"):
240-
middle = '<a href="%s"%s%s>%s</a>' % (
241-
middle,
242-
rel_attr,
243-
target_attr,
244-
trim_url(middle),
245-
)
246-
if (
247-
"@" in middle
248-
and not middle.startswith("www.")
249-
and ":" not in middle
250-
and _simple_email_re.match(middle)
251-
):
252-
middle = '<a href="mailto:%s">%s</a>' % (middle, middle)
253-
if lead + middle + trail != word:
254-
words[i] = lead + middle + trail
210+
head = match.group()
211+
middle = middle[match.end() :]
212+
213+
# Unlike head, which is anchored to the start of the string,
214+
# need to check that the string ends with any of the characters
215+
# before trying to match all of them, to avoid backtracking.
216+
if middle.endswith((")", ">", ".", ",", "\n", "&gt;")):
217+
match = re.search(r"([)>.,\n]|&gt;)+$", middle)
218+
219+
if match:
220+
tail = match.group()
221+
middle = middle[: match.start()]
222+
223+
if middle.startswith("www.") or (
224+
"@" not in middle
225+
and not middle.startswith("http://")
226+
and not middle.startswith("https://")
227+
and len(middle) > 0
228+
and middle[0] in _letters + _digits
229+
and (
230+
middle.endswith(".org")
231+
or middle.endswith(".net")
232+
or middle.endswith(".com")
233+
)
234+
):
235+
middle = '<a href="http://%s"%s%s>%s</a>' % (
236+
middle,
237+
rel_attr,
238+
target_attr,
239+
trim_url(middle),
240+
)
241+
242+
if middle.startswith("http://") or middle.startswith("https://"):
243+
middle = '<a href="%s"%s%s>%s</a>' % (
244+
middle,
245+
rel_attr,
246+
target_attr,
247+
trim_url(middle),
248+
)
249+
250+
if (
251+
"@" in middle
252+
and not middle.startswith("www.")
253+
and ":" not in middle
254+
and re.match(r"^\S+@\w[\w.-]*\.\w+$", middle)
255+
):
256+
middle = '<a href="mailto:%s">%s</a>' % (middle, middle)
257+
258+
words[i] = head + middle + tail
259+
255260
return u"".join(words)
256261

257262

0 commit comments

Comments
 (0)