Merge branch 'log-parser'

[pyi2ncommon] / src / log_read.py
diff --git a/src/log_read.py b/src/log_read.py

index 1d8bef4..724f4e1 100644 (file)
--- a/src/log_read.py
+++ b/src/log_read.py
@@ -35,7 +35,7 @@ Runs stat in a loop to find out whether file size has changed. Then reads the
 new data and forwards that
 
 .. todo:: Want to also use lsof to find out whether file/pipe/socket was
-          closed, so can return from read loop
+          closed, so can automatically return from read loop.
 
 :py:class:`LineReader` takes output of :py:class:`IterativeReader` and returns
 it line-wise as is normal for log files
@@ -50,12 +50,12 @@ INTERFACE
 ------------------------------------------------------
 
 """
-
 import os
 import os.path
 import re
 from warnings import warn
 import logging
+from contextlib import contextmanager
 from .iter_helpers import zip_longest
 from .type_helpers import is_str_or_byte, is_file_obj
 
@@ -70,6 +70,11 @@ def true_func(_):
     return True
 
 
+def false_func(_):
+    """Replacement for :py:func:`check_is_used`. Returns `False` always."""
+    return False
+
+
 def check_is_used(file_handle):
     """
     Check whether file is being written to.
@@ -130,7 +135,7 @@ class IterativeReader(object):
     implement a different :py:meth:`prepare_result` method.
     """
 
-    def __init__(self, sources, descs=None, return_when_done=False):
+    def __init__(self, sources, descs=None, keep_watching=False):
         """
         Create a reader; do some basic checks on args.
 
@@ -145,10 +150,10 @@ class IterativeReader(object):
                       a single source, then descs is also converted to a list
                       of length 1. If not given (i.e. None), will use
                       :py:func:`create_description` to guess descriptions
-        :param bool return_when_done: ignore file_handle if no-one is writing
-                                      to it any more. Return from iterator when
-                                      all watched files are done (not
-                                      implemented yet)
+        :param bool keep_watching: keep watching file that is not changing in
+                                   size. Need to manually tell whether file
+                                   is being written to or not since auto-detect
+                                   is not implemented yet.
         :raises: OSError when testing fstat on source
         """
         if not sources:
@@ -209,10 +214,11 @@ class IterativeReader(object):
         self.last_sizes = [0 for _ in self.file_objs]
         self.ignore = [False for _ in self.file_objs]
 
-        if return_when_done:
-            self.is_used_func = check_is_used
-        else:
+        if keep_watching:
             self.is_used_func = true_func
+        else:
+            self.is_used_func = false_func
+        # use some day: self.is_used_func = check_is_used
 
         for obj, file_handle, description in \
                 zip(self.file_objs, self.file_handles, self.descriptions):
@@ -232,10 +238,13 @@ class IterativeReader(object):
         """
         Continue reading from sources, yield results.
 
-        yields result of :py:meth:`prepare_result`, which depends on what sub
-        class you called this function from.
+        yields result of :py:meth:`prepare_result`, which depends on what
+        subclass you called this function from.
         """
         while True:
+            if all(self.ignore):
+                break
+
             for idx, (obj, file_handle, description, last_size, do_ignore) in \
                     enumerate(zip(self.file_objs, self.file_handles,
                                   self.descriptions, self.last_sizes,
@@ -249,10 +258,6 @@ class IterativeReader(object):
                 # compare to old size
                 if new_size == last_size:
                     if not self.is_used_func(file_handle):
-                        warn('no one is writing to {0} / {1} -- '
-                             'stop watching it!'
-                             .format(file_handle, description),
-                             category=LogReadWarning)
                         self.ignore[idx] = True
                 else:
                     if new_size < last_size:  # happened at start of some tests
@@ -275,8 +280,7 @@ class IterativeReader(object):
                         new_data = str(ude)
 
                     # post-processing
-                    to_yield = self.prepare_result(description, new_data, idx)
-                    for result in to_yield:
+                    for result in self.prepare_result(description, new_data, idx):
                         yield result
 
                     # prepare next iteration
@@ -289,21 +293,18 @@ class IterativeReader(object):
         Intended for overwriting in subclasses.
 
         This function is called from __iter__ for each new data that becomes
-        available. It has to return some iterable whose entries are yielded
-        from iteration over objects of this class.
+        available. It has to provide results which are forwarded to caller.
 
-        This base implementation just returns its input in a list, so new data
-        is yielded from __iter__ as-is. Subclass implementations can also yield
-        tuples.
+        This base implementation just yields its input, so new data is yielded
+        from `__iter__` as-is.
 
         :param str description: Description of source of lines, one of
                                 :py:data:`self.descriptions`
         :param str data: Text data read from source
         :param idx: Index of data source
-        :returns: [(description, data, idx], same as input
-        :rtype [(str, str, int)]
+        :returns: nothing, but yields (description, data, idx), same as input
         """
-        return [(description, data, idx), ]
+        yield description, data, idx
 
 
 #: characters to `rstrip()` from end of complete lines
@@ -336,11 +337,10 @@ class LineReader(IterativeReader):
         """
         all_data = self.line_buffers[idx] + new_data
         self.line_buffers[idx] = ''
-        result = []
         should_be_no_new_lines = False
         for line in all_data.splitlines(True):
             if line[-1] in LINE_SPLITTERS:
-                result.append((description, line.rstrip(LINE_SPLITTERS), idx))
+                yield description, line.rstrip(LINE_SPLITTERS), idx
             elif should_be_no_new_lines:
                 # self-check
                 raise ValueError('Programming error: something went wrong with '
@@ -349,8 +349,6 @@ class LineReader(IterativeReader):
                 self.line_buffers[idx] = line
                 should_be_no_new_lines = True  # (this should be the last)
 
-        return result
-
 
 class LogParser(LineReader):
     """
@@ -358,8 +356,18 @@ class LogParser(LineReader):
 
     Requires a pattern for log lines, auto-detection is not implemented yet.
 
-    Iteration returns re.match result or -- if matching failed -- the original
-    raw line.
+    Iteration returns :py:func:`re.match` result or -- if matching failed --
+    None. The latest unparsed line is available as `self.last_unparsed_line`.
+    Usage recommendation:
+
+        with open(log_file_name, 'rt') as file_handle:
+            parser = log_read.LogParser(file_handle, pattern=my_pattern)
+            for _, data, _ in parser:
+                if data is None:
+                    print(f'Failed to parse line {parser.last_unparsed_line}')
+                    continue
+                line_parts = data.groupdict()
+                ...do stuff with line_parts...
     """
 
     def __init__(self, log_file, pattern=None):
@@ -374,6 +382,7 @@ class LogParser(LineReader):
         super(LogParser, self).__init__(log_file)
 
         self.pattern = pattern
+        self.last_unparsed_line = ''
 
     def prepare_result(self, *args):
         """
@@ -389,8 +398,48 @@ class LogParser(LineReader):
         # let super class split data into lines
         for description, raw_line, idx in \
                 super(LogParser, self).prepare_result(*args):
-            result = re.match(self.pattern, raw_line)
-            if result:
-                return description, result, idx
+            matches = re.match(self.pattern, raw_line)
+            if matches:
+                yield description, matches, idx
             else:
-                return description, raw_line, idx
+                self.last_unparsed_line = raw_line
+                yield description, None, idx
+
+    @classmethod
+    @contextmanager
+    def create_for(cls, filename, *args, **kwargs):
+        """
+        Open single file, yield LogParser. Ensures file is closed afterwards.
+
+        This allows opening a file and creating a LogParser for it in one line:
+
+            with LogParser.create_for('/var/log/messages', SYS_LOG_PATTERN) as parser:
+                for _, matches, _ in parser:
+                    try:
+                        print(matches.groupdict())
+                    except Exception:
+                        print(f'UNPARSED: {parser.last_unparsed_line}')
+
+        :param str filename: something that :py:func:`open` accepts
+        :param args: Forwarded to constructor
+        :param kwargs: Forwarded to constructor
+        """
+        with open(filename) as file_handle:
+            yield cls(file_handle, *args, **kwargs)
+
+
+################################################################################
+# PATTERNS FOR FREQUENT LOG FILES
+################################################################################
+
+# pattern of squid proxy logs. group names are best guesses
+PROXY_LOG_PATTERN = \
+    r'\s*(?P<timestamp>\d+\.\d+\.\d+\s+\d+:\d+:\d+|\d+\.\d+)\s+(?P<size1>\d+)\s+' \
+    + r'(?P<ip>\d+\.\d+\.\d+\.\d+)\s+(?P<status_text>[A-Z_]+)/(?P<status_code>\d+)\s+' \
+    + r'(?P<size2>\d+)\s+(?P<command>\S+)\s+(?P<url>\S+)\s+(?P<user>\S+)\s+' \
+    + r'(?P<action>[A-Z_]+)/(?P<origin>\S+)\s+(?P<mimetype>\S+)\s+(?P<unknown>.*)\s*'
+
+# pattern for linux system logs (usually "messages" or "syslog", also "maillog")
+SYS_LOG_PATTERN = \
+    r'\s*(?P<timestamp>\w{3} +\d{2} \d{2}:\d{2}:\d{2}) (?P<hostname>\S+) ' \
+    + r'(?P<procname>[^\[\]:]+)(?:\[(?P<pid>\d+)\])?: (?P<message>.*)'