From c5eee91d78377aa6fecc44d3cfe06cd1dfebb589 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johan=20F=C3=B6rberg?= Date: Tue, 2 Jul 2024 15:35:20 +0200 Subject: [PATCH 1/6] gh-121267: Improve performance of tarfile (#121267) Tarfile in the default write mode spends much of its time resolving UIDs into usernames and GIDs into group names. By caching these mappings, a significant speedup can be achieved. In my simple benchmark[1], this extra caching speeds up tarfile by 8x. [1] https://gist.github.com/jforberg/86af759c796199740c31547ae828aef2 --- Lib/tarfile.py | 27 +++++++++++++------ ...-07-02-15-56-42.gh-issue-121267.yFBWkh.rst | 2 ++ 2 files changed, 21 insertions(+), 8 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2024-07-02-15-56-42.gh-issue-121267.yFBWkh.rst diff --git a/Lib/tarfile.py b/Lib/tarfile.py index d5d8a469779f50..5f66e9b606ff02 100644 --- a/Lib/tarfile.py +++ b/Lib/tarfile.py @@ -1727,6 +1727,8 @@ def __init__(self, name=None, mode="r", fileobj=None, format=None, # current position in the archive file self.inodes = {} # dictionary caching the inodes of # archive members already added + self.uname_cache = {} # Cached mappings of uid -> uname, gid -> gname + self.gname_cache = {} try: if self.mode == "r": @@ -2105,16 +2107,25 @@ def gettarinfo(self, name=None, arcname=None, fileobj=None): tarinfo.mtime = statres.st_mtime tarinfo.type = type tarinfo.linkname = linkname + + # Calls to pwd.getpwuid() and grp.getgrgid() tend to be expensive. To + # speed things up, cache the resolved usernames and group names. if pwd: - try: - tarinfo.uname = pwd.getpwuid(tarinfo.uid)[0] - except KeyError: - pass + if not tarinfo.uid in self.uname_cache: + try: + self.uname_cache[tarinfo.uid] = pwd.getpwuid(tarinfo.uid)[0] + except KeyError: + pass + + tarinfo.uname = self.uname_cache.get(tarinfo.uid, None) if grp: - try: - tarinfo.gname = grp.getgrgid(tarinfo.gid)[0] - except KeyError: - pass + if not tarinfo.gid in self.gname_cache: + try: + self.gname_cache[tarinfo.gid] = grp.getgrgid(tarinfo.gid)[0] + except KeyError: + pass + + tarinfo.gname = self.gname_cache.get(tarinfo.gid, None) if type in (CHRTYPE, BLKTYPE): if hasattr(os, "major") and hasattr(os, "minor"): diff --git a/Misc/NEWS.d/next/Library/2024-07-02-15-56-42.gh-issue-121267.yFBWkh.rst b/Misc/NEWS.d/next/Library/2024-07-02-15-56-42.gh-issue-121267.yFBWkh.rst new file mode 100644 index 00000000000000..ca18bf37471bad --- /dev/null +++ b/Misc/NEWS.d/next/Library/2024-07-02-15-56-42.gh-issue-121267.yFBWkh.rst @@ -0,0 +1,2 @@ +Improve the performance of tarfile when writing files, by caching user names +and group names. From 2e72fc7be202c371732eb0763a0f3bd0f57bc127 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johan=20F=C3=B6rberg?= Date: Wed, 3 Jul 2024 22:13:52 +0200 Subject: [PATCH 2/6] Apply suggestions from gaogaotiantian (1/2) Co-authored-by: Tian Gao --- Lib/tarfile.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/Lib/tarfile.py b/Lib/tarfile.py index 5f66e9b606ff02..08d8c6453f295e 100644 --- a/Lib/tarfile.py +++ b/Lib/tarfile.py @@ -2111,13 +2111,12 @@ def gettarinfo(self, name=None, arcname=None, fileobj=None): # Calls to pwd.getpwuid() and grp.getgrgid() tend to be expensive. To # speed things up, cache the resolved usernames and group names. if pwd: - if not tarinfo.uid in self.uname_cache: + if tarinfo.uid not in self.uname_cache: try: self.uname_cache[tarinfo.uid] = pwd.getpwuid(tarinfo.uid)[0] except KeyError: - pass - - tarinfo.uname = self.uname_cache.get(tarinfo.uid, None) + self.uname_cache[tarinfo.uid] = None + tarinfo.uname = self.uname_cache[tarinfo.uid] if grp: if not tarinfo.gid in self.gname_cache: try: From dc117ae6c90cf36b82239c8c41e838bf5050f527 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johan=20F=C3=B6rberg?= Date: Wed, 3 Jul 2024 22:14:03 +0200 Subject: [PATCH 3/6] Apply suggestions from gaogaotiantian (2/2) Co-authored-by: Tian Gao --- Lib/tarfile.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/Lib/tarfile.py b/Lib/tarfile.py index 08d8c6453f295e..aa328dcd726aa2 100644 --- a/Lib/tarfile.py +++ b/Lib/tarfile.py @@ -2118,13 +2118,12 @@ def gettarinfo(self, name=None, arcname=None, fileobj=None): self.uname_cache[tarinfo.uid] = None tarinfo.uname = self.uname_cache[tarinfo.uid] if grp: - if not tarinfo.gid in self.gname_cache: + if tarinfo.gid not in self.gname_cache: try: self.gname_cache[tarinfo.gid] = grp.getgrgid(tarinfo.gid)[0] except KeyError: - pass - - tarinfo.gname = self.gname_cache.get(tarinfo.gid, None) + self.gname_cache[tarinfo.gid] = None + tarinfo.gname = self.gname_cache[tarinfo.gid] if type in (CHRTYPE, BLKTYPE): if hasattr(os, "major") and hasattr(os, "minor"): From f8bd4531cbe2b59c66809f61f64304ed94861052 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johan=20F=C3=B6rberg?= Date: Mon, 8 Jul 2024 15:00:31 +0200 Subject: [PATCH 4/6] Apply suggestions from picnixz (1/2) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Bénédikt Tran <10796600+picnixz@users.noreply.github.com> --- .../next/Library/2024-07-02-15-56-42.gh-issue-121267.yFBWkh.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Misc/NEWS.d/next/Library/2024-07-02-15-56-42.gh-issue-121267.yFBWkh.rst b/Misc/NEWS.d/next/Library/2024-07-02-15-56-42.gh-issue-121267.yFBWkh.rst index ca18bf37471bad..9e52405c15a82d 100644 --- a/Misc/NEWS.d/next/Library/2024-07-02-15-56-42.gh-issue-121267.yFBWkh.rst +++ b/Misc/NEWS.d/next/Library/2024-07-02-15-56-42.gh-issue-121267.yFBWkh.rst @@ -1,2 +1,2 @@ -Improve the performance of tarfile when writing files, by caching user names +Improve the performance of :mod:`tarfile` when writing files, by caching user names and group names. From 454f94a27349ea97ec1d1e8364700ec22ff9bc21 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johan=20F=C3=B6rberg?= Date: Mon, 8 Jul 2024 15:12:57 +0200 Subject: [PATCH 5/6] Apply suggestions from picnixz (2/2) --- Lib/tarfile.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/Lib/tarfile.py b/Lib/tarfile.py index aa328dcd726aa2..fbd22764f9ff78 100644 --- a/Lib/tarfile.py +++ b/Lib/tarfile.py @@ -1727,8 +1727,8 @@ def __init__(self, name=None, mode="r", fileobj=None, format=None, # current position in the archive file self.inodes = {} # dictionary caching the inodes of # archive members already added - self.uname_cache = {} # Cached mappings of uid -> uname, gid -> gname - self.gname_cache = {} + self._unames = {} # Cached mappings of uid -> uname + self._gnames = {} # Cached mappings of gid -> gname try: if self.mode == "r": @@ -2111,19 +2111,19 @@ def gettarinfo(self, name=None, arcname=None, fileobj=None): # Calls to pwd.getpwuid() and grp.getgrgid() tend to be expensive. To # speed things up, cache the resolved usernames and group names. if pwd: - if tarinfo.uid not in self.uname_cache: + if tarinfo.uid not in self._unames: try: - self.uname_cache[tarinfo.uid] = pwd.getpwuid(tarinfo.uid)[0] + self._unames[tarinfo.uid] = pwd.getpwuid(tarinfo.uid)[0] except KeyError: - self.uname_cache[tarinfo.uid] = None - tarinfo.uname = self.uname_cache[tarinfo.uid] + self._unames[tarinfo.uid] = None + tarinfo.uname = self._unames[tarinfo.uid] if grp: - if tarinfo.gid not in self.gname_cache: + if tarinfo.gid not in self._gnames: try: - self.gname_cache[tarinfo.gid] = grp.getgrgid(tarinfo.gid)[0] + self._gnames[tarinfo.gid] = grp.getgrgid(tarinfo.gid)[0] except KeyError: - self.gname_cache[tarinfo.gid] = None - tarinfo.gname = self.gname_cache[tarinfo.gid] + self._gnames[tarinfo.gid] = None + tarinfo.gname = self._gnames[tarinfo.gid] if type in (CHRTYPE, BLKTYPE): if hasattr(os, "major") and hasattr(os, "minor"): From 8cf9e54c1cb98ef336829856b59bab1a03d86717 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johan=20F=C3=B6rberg?= Date: Mon, 26 Aug 2024 16:56:54 +0200 Subject: [PATCH 6/6] Apply suggestion from hauntsaninja --- Lib/tarfile.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Lib/tarfile.py b/Lib/tarfile.py index d723c72b7d2f47..6940d8e3f8a5da 100644 --- a/Lib/tarfile.py +++ b/Lib/tarfile.py @@ -2115,14 +2115,14 @@ def gettarinfo(self, name=None, arcname=None, fileobj=None): try: self._unames[tarinfo.uid] = pwd.getpwuid(tarinfo.uid)[0] except KeyError: - self._unames[tarinfo.uid] = None + self._unames[tarinfo.uid] = '' tarinfo.uname = self._unames[tarinfo.uid] if grp: if tarinfo.gid not in self._gnames: try: self._gnames[tarinfo.gid] = grp.getgrgid(tarinfo.gid)[0] except KeyError: - self._gnames[tarinfo.gid] = None + self._gnames[tarinfo.gid] = '' tarinfo.gname = self._gnames[tarinfo.gid] if type in (CHRTYPE, BLKTYPE):