Skip to content

Commit c5eee91

Browse files
committed
gh-121267: Improve performance of tarfile (#121267)
In its default write mode, tarfile spends much of its time resolving UIDs into user names and GIDs into group names. Caching these mappings yields a significant speedup: in my simple benchmark [1], the extra caching makes tarfile 8x faster. [1] https://gist.github.com/jforberg/86af759c796199740c31547ae828aef2
1 parent e245ed7 commit c5eee91

File tree

2 files changed

+21
-8
lines changed

2 files changed

+21
-8
lines changed

Lib/tarfile.py

Lines changed: 19 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1727,6 +1727,8 @@ def __init__(self, name=None, mode="r", fileobj=None, format=None,
17271727
# current position in the archive file
17281728
self.inodes = {} # dictionary caching the inodes of
17291729
# archive members already added
1730+
self.uname_cache = {} # Cached mappings of uid -> uname, gid -> gname
1731+
self.gname_cache = {}
17301732

17311733
try:
17321734
if self.mode == "r":
@@ -2105,16 +2107,25 @@ def gettarinfo(self, name=None, arcname=None, fileobj=None):
21052107
tarinfo.mtime = statres.st_mtime
21062108
tarinfo.type = type
21072109
tarinfo.linkname = linkname
2110+
2111+
# Calls to pwd.getpwuid() and grp.getgrgid() tend to be expensive. To
2112+
# speed things up, cache the resolved usernames and group names.
21082113
if pwd:
2109-
try:
2110-
tarinfo.uname = pwd.getpwuid(tarinfo.uid)[0]
2111-
except KeyError:
2112-
pass
2114+
if not tarinfo.uid in self.uname_cache:
2115+
try:
2116+
self.uname_cache[tarinfo.uid] = pwd.getpwuid(tarinfo.uid)[0]
2117+
except KeyError:
2118+
pass
2119+
2120+
tarinfo.uname = self.uname_cache.get(tarinfo.uid, None)
21132121
if grp:
2114-
try:
2115-
tarinfo.gname = grp.getgrgid(tarinfo.gid)[0]
2116-
except KeyError:
2117-
pass
2122+
if not tarinfo.gid in self.gname_cache:
2123+
try:
2124+
self.gname_cache[tarinfo.gid] = grp.getgrgid(tarinfo.gid)[0]
2125+
except KeyError:
2126+
pass
2127+
2128+
tarinfo.gname = self.gname_cache.get(tarinfo.gid, None)
21182129

21192130
if type in (CHRTYPE, BLKTYPE):
21202131
if hasattr(os, "major") and hasattr(os, "minor"):
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
Improve the performance of tarfile when writing files, by caching user names
2+
and group names.

0 commit comments

Comments
 (0)