|
6 | 6 | from collections import deque
|
7 | 7 | from random import choice
|
8 | 8 | from random import randrange
|
| 9 | +from string import ascii_letters as _letters |
| 10 | +from string import digits as _digits |
9 | 11 | from threading import Lock
|
10 | 12 |
|
11 | 13 | from markupsafe import escape
|
|
16 | 18 | from ._compat import text_type
|
17 | 19 | from ._compat import url_quote
|
18 | 20 |
|
19 |
| -_word_split_re = re.compile(r"(\s+)") |
20 |
| -_punctuation_re = re.compile( |
21 |
| - "^(?P<lead>(?:%s)*)(?P<middle>.*?)(?P<trail>(?:%s)*)$" |
22 |
| - % ( |
23 |
| - "|".join(map(re.escape, ("(", "<", "<"))), |
24 |
| - "|".join(map(re.escape, (".", ",", ")", ">", "\n", ">"))), |
25 |
| - ) |
26 |
| -) |
27 |
| -_simple_email_re = re.compile(r"^\S+@[a-zA-Z0-9._-]+\.[a-zA-Z0-9._-]+$") |
28 |
| -_striptags_re = re.compile(r"(<!--.*?-->|<[^>]*>)") |
29 |
| -_entity_re = re.compile(r"&([^;]+);") |
30 |
| -_letters = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ" |
31 |
| -_digits = "0123456789" |
32 |
| - |
33 | 21 | # special singleton representing missing values for the runtime
|
34 | 22 | missing = type("MissingType", (), {"__repr__": lambda x: "missing"})()
|
35 | 23 |
|
@@ -210,48 +198,65 @@ def urlize(text, trim_url_limit=None, rel=None, target=None):
|
210 | 198 | and (x[:limit] + (len(x) >= limit and "..." or ""))
|
211 | 199 | or x
|
212 | 200 | )
|
213 |
| - words = _word_split_re.split(text_type(escape(text))) |
| 201 | + words = re.split(r"(\s+)", text_type(escape(text))) |
214 | 202 | rel_attr = rel and ' rel="%s"' % text_type(escape(rel)) or ""
|
215 | 203 | target_attr = target and ' target="%s"' % escape(target) or ""
|
216 | 204 |
|
217 | 205 | for i, word in enumerate(words):
|
218 |
| - match = _punctuation_re.match(word) |
| 206 | + head, middle, tail = "", word, "" |
| 207 | + match = re.match(r"^([(<]|<)+", middle) |
| 208 | + |
219 | 209 | if match:
|
220 |
| - lead, middle, trail = match.groups() |
221 |
| - if middle.startswith("www.") or ( |
222 |
| - "@" not in middle |
223 |
| - and not middle.startswith("http://") |
224 |
| - and not middle.startswith("https://") |
225 |
| - and len(middle) > 0 |
226 |
| - and middle[0] in _letters + _digits |
227 |
| - and ( |
228 |
| - middle.endswith(".org") |
229 |
| - or middle.endswith(".net") |
230 |
| - or middle.endswith(".com") |
231 |
| - ) |
232 |
| - ): |
233 |
| - middle = '<a href="http://%s"%s%s>%s</a>' % ( |
234 |
| - middle, |
235 |
| - rel_attr, |
236 |
| - target_attr, |
237 |
| - trim_url(middle), |
238 |
| - ) |
239 |
| - if middle.startswith("http://") or middle.startswith("https://"): |
240 |
| - middle = '<a href="%s"%s%s>%s</a>' % ( |
241 |
| - middle, |
242 |
| - rel_attr, |
243 |
| - target_attr, |
244 |
| - trim_url(middle), |
245 |
| - ) |
246 |
| - if ( |
247 |
| - "@" in middle |
248 |
| - and not middle.startswith("www.") |
249 |
| - and ":" not in middle |
250 |
| - and _simple_email_re.match(middle) |
251 |
| - ): |
252 |
| - middle = '<a href="mailto:%s">%s</a>' % (middle, middle) |
253 |
| - if lead + middle + trail != word: |
254 |
| - words[i] = lead + middle + trail |
| 210 | + head = match.group() |
| 211 | + middle = middle[match.end() :] |
| 212 | + |
| 213 | + # Unlike head, which is matched anchored to the start of the string, |
| 214 | + # first check that the string ends with one of the trailing characters |
| 215 | + # before trying to match all of them, to avoid backtracking. |
| 216 | + if middle.endswith((")", ">", ".", ",", "\n", ">")): |
| 217 | + match = re.search(r"([)>.,\n]|>)+$", middle) |
| 218 | + |
| 219 | + if match: |
| 220 | + tail = match.group() |
| 221 | + middle = middle[: match.start()] |
| 222 | + |
| 223 | + if middle.startswith("www.") or ( |
| 224 | + "@" not in middle |
| 225 | + and not middle.startswith("http://") |
| 226 | + and not middle.startswith("https://") |
| 227 | + and len(middle) > 0 |
| 228 | + and middle[0] in _letters + _digits |
| 229 | + and ( |
| 230 | + middle.endswith(".org") |
| 231 | + or middle.endswith(".net") |
| 232 | + or middle.endswith(".com") |
| 233 | + ) |
| 234 | + ): |
| 235 | + middle = '<a href="http://%s"%s%s>%s</a>' % ( |
| 236 | + middle, |
| 237 | + rel_attr, |
| 238 | + target_attr, |
| 239 | + trim_url(middle), |
| 240 | + ) |
| 241 | + |
| 242 | + if middle.startswith("http://") or middle.startswith("https://"): |
| 243 | + middle = '<a href="%s"%s%s>%s</a>' % ( |
| 244 | + middle, |
| 245 | + rel_attr, |
| 246 | + target_attr, |
| 247 | + trim_url(middle), |
| 248 | + ) |
| 249 | + |
| 250 | + if ( |
| 251 | + "@" in middle |
| 252 | + and not middle.startswith("www.") |
| 253 | + and ":" not in middle |
| 254 | + and re.match(r"^\S+@\w[\w.-]*\.\w+$", middle) |
| 255 | + ): |
| 256 | + middle = '<a href="mailto:%s">%s</a>' % (middle, middle) |
| 257 | + |
| 258 | + words[i] = head + middle + tail |
| 259 | + |
255 | 260 | return u"".join(words)
|
256 | 261 |
|
257 | 262 |
|
|
0 commit comments