Implement cache for pwd.getpwuid() and grp.getgrgid()
author: Thomas Jarosch <thomas.jarosch@intra2net.com>
Thu, 23 Jun 2016 08:08:16 +0000 (10:08 +0200)
committer: Thomas Jarosch <thomas.jarosch@intra2net.com>
Thu, 23 Jun 2016 09:17:07 +0000 (11:17 +0200)
Those functions always parse /etc/passwd, and we
look up the owner for each file we back up.

This change is only relevant when creating full backups.
The speedup with ~1,000,000 emails is 11%.

deltatar/tarfile.py

index f63b6ad..8ff3d9d 100644 (file)
@@ -1747,6 +1747,9 @@ class TarFile(object):
                                 # if you manage lots of files and don't want
                                 # to have high memory usage
 
+    cache_uid2user = {}         # cache to avoid getpwuid calls. It always parses /etc/passwd.
+    cache_gid2group = {}        # same cache for groups
+
     def __init__(self, name=None, mode="r", fileobj=None, format=None,
             tarinfo=None, dereference=None, ignore_zeros=None, encoding=None,
             errors="surrogateescape", pax_headers=None, debug=None,
@@ -2256,15 +2259,27 @@ class TarFile(object):
         tarinfo.type = type
         tarinfo.linkname = linkname
         if pwd:
-            try:
-                tarinfo.uname = pwd.getpwuid(tarinfo.uid)[0]
-            except KeyError:
-                pass
+            if tarinfo.uid in self.cache_uid2user:
+                tarinfo.uname = self.cache_uid2user[tarinfo.uid]
+            else:
+                try:
+                    tarinfo.uname = pwd.getpwuid(tarinfo.uid)[0]
+                    self.cache_uid2user[tarinfo.uid] = tarinfo.uname
+                except KeyError:
+                    # remember user does not exist:
+                    # same default value as in tarinfo class
+                    self.cache_uid2user[tarinfo.uid] = ""
         if grp:
-            try:
-                tarinfo.gname = grp.getgrgid(tarinfo.gid)[0]
-            except KeyError:
-                pass
+            if tarinfo.gid in self.cache_gid2group:
+                tarinfo.gname = self.cache_gid2group[tarinfo.gid]
+            else:
+                try:
+                    tarinfo.gname = grp.getgrgid(tarinfo.gid)[0]
+                    self.cache_gid2group[tarinfo.gid] = tarinfo.gname
+                except KeyError:
+                    # remember group does not exist:
+                    # same default value as in tarinfo class
+                    self.cache_gid2group[tarinfo.gid] = ""
 
         if type in (CHRTYPE, BLKTYPE):
             if hasattr(os, "major") and hasattr(os, "minor"):