developer.intra2net.com Git - pyi2ncommon/blob - src/log_read.py

   1 # The software in this package is distributed under the GNU General
   2 # Public License version 2 (with a special exception described below).
   3 #
   4 # A copy of GNU General Public License (GPL) is included in this distribution,
   5 # in the file COPYING.GPL.
   6 #
   7 # As a special exception, if other files instantiate templates or use macros
   8 # or inline functions from this file, or you compile this file and link it
   9 # with other works to produce a work based on this file, this file
  10 # does not by itself cause the resulting work to be covered
  11 # by the GNU General Public License.
  12 #
  13 # However the source code for this file must still be made available
  14 # in accordance with section (3) of the GNU General Public License.
  15 #
  16 # This exception does not invalidate any other reasons why a work based
  17 # on this file might be covered by the GNU General Public License.
  18 #
  19 # Copyright (c) 2016-2018 Intra2net AG <info@intra2net.com>
  20
  21 """
  22
  23 SUMMARY
  24 ------------------------------------------------------
  25 Iterative reading of log files, similar to shell command `tail -f`.
  26
  27 Copyright: Intra2net AG
  28
  29
  30 CONTENTS
  31 ------------------------------------------------------
  32
  33 Basic Functionality (class :py:class:`IterativeReader`):
  34 Runs stat in a loop to find out whether file size has changed. Then reads the
  35 new data and forwards that
  36
  37 .. todo:: Want to also use lsof to find out whether file/pipe/socket was
  38           closed, so can automatically return from read loop.
  39
  40 :py:class:`LineReader` takes output of :py:class:`IterativeReader` and returns
  41 it line-wise as is normal for log files
  42
  43 :py:class:`LogParser` takes those lines and tries to parse them into fields
  44 like date, time, module name, urgency and message.
  45
  46 .. todo:: auto-detect log line layout
  47
  48
  49 INTERFACE
  50 ------------------------------------------------------
  51
  52 """
  53 import os
  54 import os.path
  55 import re
  56 from warnings import warn
  57 import logging
  58 from contextlib import contextmanager
  59 from .iter_helpers import zip_longest
  60 from .type_helpers import is_str_or_byte, is_file_obj
  61
  62
  63 class LogReadWarning(UserWarning):
  64     """Warnings issued by classes in this module."""
  65     pass
  66
  67
  68 def true_func(_):
  69     """Replacement for :py:func:`check_is_used`. Returns `True` always."""
  70     return True
  71
  72
  73 def false_func(_):
  74     """Replacement for :py:func:`check_is_used`. Returns `False` always."""
  75     return False
  76
  77
  78 def check_is_used(file_handle):
  79     """
  80     Check whether file is being written to.
  81
  82     To be implemented, e.g. using lsof.
  83
  84     If beneficial could also easily supply python file object as arg.
  85
  86     :param int file_handle: OS-level file descriptor
  87     """
  88     raise NotImplementedError(file_handle)
  89
  90
  91 #: counter for unknown sources in :py:func:`create_description`
  92 _create_description_unknown_counter = 0
  93
  94
  95 def create_description(file_obj, file_handle):
  96     """
  97     Create some description for given file-like object / file descriptor.
  98
  99     :param file_obj: file-like object
 100     :param int file_handle: os-level file descriptor
 101     :returns: Short description for file-like object
 102     :rtype: string
 103     """
 104     global _create_description_unknown_counter
 105
 106     try:
 107         desc = file_obj.name
 108         if desc:
 109             return desc
 110     except AttributeError:
 111         pass
 112
 113     if file_handle is not None:
 114         return 'file{0}'.format(file_handle)
 115     else:
 116         _create_description_unknown_counter += 1
 117         return 'unknown{0}'.format(_create_description_unknown_counter)
 118
 119
 120 #: error message for IterativeReader constructor
 121 _STR_ERR = 'not accepting file name "{0}" since cannot guarantee closing ' \
 122            'files --> use with open(file_name)!'
 123
 124
 125 class IterativeReader(object):
 126     """
 127     Read continuously from a given file.
 128
 129     Use `os.stat(file_obj.fileno()).st_size` as measure whether file has
 130     changed or not; Always reads as much data as possible.
 131
 132     Does not care about closing files, so does not accept file names.
 133
 134     This is the base for class :py:class:`LineReader` that just has to
 135     implement a different :py:meth:`prepare_result` method.
 136     """
 137
 138     def __init__(self, sources, descs=None, keep_watching=False):
 139         """
 140         Create a reader; do some basic checks on args.
 141
 142         :param sources: iterable over sources. Sources can be opened file
 143                         objects or read-opened os-level file descriptors.
 144                         Calling code has to ensure they are closed properly, so
 145                         best use this within a "with open(file_name) as
 146                         file_handle:"-context. If sources is a single file
 147                         obj/descriptor, both source and desc will be converted
 148                         to lists of length 1
 149         :param descs: can be anything of same length as sources. If sources is
 150                       a single source, then descs is also converted to a list
 151                       of length 1. If not given (i.e. None), will use
 152                       :py:func:`create_description` to guess descriptions
 153         :param bool keep_watching: keep watching file that is not changing in
 154                                    size. Need to manually tell whether file
 155                                    is being written to or not since auto-detect
 156                                    is not implemented yet.
 157         :raises: OSError when testing fstat on source
 158         """
 159         if not sources:
 160             raise ValueError('need at least some source!')
 161         elif is_str_or_byte(sources):
 162             raise ValueError(_STR_ERR.format(sources))
 163         elif is_file_obj(sources) or isinstance(sources, int):
 164             source_input = [sources, ]
 165             desc_input = [descs, ]
 166         else:
 167             source_input = sources  # assume some iterable
 168             desc_input = descs
 169
 170         # now divide sources into os-level file descriptors for os.fstat,
 171         # and file objects for read()
 172         self.file_objs = []
 173         self.file_handles = []          # file descriptOR, not descriptION
 174         for source in source_input:
 175             if is_file_obj(source):
 176                 self.file_objs.append(source)
 177                 self.file_handles.append(source.fileno())
 178             elif isinstance(source, int):
 179                 self.file_objs.append(os.fdopen(source))
 180                 self.file_handles.append(source)
 181             elif is_str_or_byte(source):
 182                 raise ValueError(_STR_ERR.format(source))
 183             else:
 184                 raise ValueError('source {0} is neither file obj nor file '
 185                                  'descriptor!')
 186
 187             # try to fstat the new file descriptor just for testing
 188             os.fstat(self.file_handles[-1])
 189
 190         # guess descriptions if not given
 191         if not desc_input:
 192             self.descriptions = [create_description(obj, file_handle)
 193                                  for obj, file_handle
 194                                  in zip(self.file_objs, self.file_handles)]
 195         else:
 196             try:
 197                 if len(desc_input) != len(self.file_objs):
 198                     raise ValueError('need same number of sources and '
 199                                      'descriptions!')
 200             except TypeError:
 201                 pass  # desc_input is generator or so
 202
 203             self.descriptions = []
 204             for obj, file_handle, description in \
 205                     zip_longest(self.file_objs, self.file_handles, desc_input):
 206                 if obj is None:
 207                     raise ValueError('more descriptions than sources!')
 208                 elif description is None:
 209                     self.descriptions.append(create_description(obj,
 210                                                                 file_handle))
 211                 else:
 212                     self.descriptions.append(description)
 213
 214         self.last_sizes = [0 for _ in self.file_objs]
 215         self.ignore = [False for _ in self.file_objs]
 216
 217         if keep_watching:
 218             self.is_used_func = true_func
 219         else:
 220             self.is_used_func = false_func
 221         # use some day: self.is_used_func = check_is_used
 222
 223         for obj, file_handle, description in \
 224                 zip(self.file_objs, self.file_handles, self.descriptions):
 225             logging.debug('log_read initialized with file descriptor {0}, '
 226                           'file obj {1}, description "{2}"'
 227                           .format(file_handle, obj, description))
 228
 229     def n_sources(self):
 230         """Return number of sources given to constructor."""
 231         return len(self.file_objs)
 232
 233     def n_active_sources(self):
 234         """Return number of sources we are actually watching."""
 235         return len(self.ignore) - sum(self.ignore)
 236
 237     def __iter__(self):
 238         """
 239         Continue reading from sources, yield results.
 240
 241         yields result of :py:meth:`prepare_result`, which depends on what
 242         subclass you called this function from.
 243         """
 244         while True:
 245             if all(self.ignore):
 246                 break
 247
 248             for idx, (obj, file_handle, description, last_size, do_ignore) in \
 249                     enumerate(zip(self.file_objs, self.file_handles,
 250                                   self.descriptions, self.last_sizes,
 251                                   self.ignore)):
 252                 if do_ignore:
 253                     continue
 254
 255                 # get new file size
 256                 new_size = os.fstat(file_handle).st_size
 257
 258                 # compare to old size
 259                 if new_size == last_size:
 260                     if not self.is_used_func(file_handle):
 261                         self.ignore[idx] = True
 262                 else:
 263                     if new_size < last_size:  # happened at start of some tests
 264                         warn('{0} / {1} has become smaller ({2} --> {3})! '
 265                              .format(obj, description, last_size, new_size)
 266                              + 'Maybe you are reading from a half-initialized '
 267                              + 'file?',
 268                              category=LogReadWarning)
 269                     try:
 270                         new_data = obj.read()
 271                     except OSError as ose:    # includes IOErrors
 272                         warn('io error reading from {0} / {1}: {2})'
 273                              .format(obj, description, ose),
 274                              category=LogReadWarning)
 275                         new_data = str(ose)
 276                     except UnicodeDecodeError as ude:
 277                         warn('unicode error reading from {0} / {1}: {2}'
 278                              .format(obj, description, ude),
 279                              category=LogReadWarning)
 280                         new_data = str(ude)
 281
 282                     # post-processing
 283                     for result in self.prepare_result(description, new_data, idx):
 284                         yield result
 285
 286                     # prepare next iteration
 287                     self.last_sizes[idx] = new_size
 288
 289     def prepare_result(self, description, data, idx):
 290         """
 291         From raw new data create some yield-able results.
 292
 293         Intended for overwriting in subclasses.
 294
 295         This function is called from __iter__ for each new data that becomes
 296         available. It has to provide results which are forwarded to caller.
 297
 298         This base implementation just yields its input, so new data is yielded
 299         from `__iter__` as-is.
 300
 301         :param str description: Description of source of lines, one of
 302                                 :py:data:`self.descriptions`
 303         :param str data: Text data read from source
 304         :param idx: Index of data source
 305         :returns: nothing but yields [(description, data, idx], same as input
 306         """
 307         yield description, data, idx
 308
 309
 310 #: characters to `rstrip()` from end of complete lines
 311 LINE_SPLITTERS = '\n\r'
 312
 313
 314 class LineReader(IterativeReader):
 315     """
 316     An :py:class:`IterativeReader` that returns new data line-wise.
 317
 318     This means buffering partial line data.
 319     """
 320
 321     def __init__(self, *args, **kwargs):
 322         """Create an :py:class:`IterativeReader and buffers for sources."""
 323         super(LineReader, self).__init__(*args, **kwargs)
 324         self.line_buffers = ['' for _ in range(self.n_sources())]
 325
 326     def prepare_result(self, description, new_data, idx):
 327         """
 328         Take raw new data and split it into lines.
 329
 330         If line is not complete, then buffer it.
 331
 332         Args: see super class method :py:meth:`IterativeReader.prepare_result`
 333         :returns: list of 3-tuples `(description, line, idx)` where
 334                   `description` and `idx` are same as args, and `line` is
 335                   without trailing newline characters
 336         :rtype: [(str, str, int)]
 337         """
 338         all_data = self.line_buffers[idx] + new_data
 339         self.line_buffers[idx] = ''
 340         should_be_no_new_lines = False
 341         for line in all_data.splitlines(True):
 342             if line[-1] in LINE_SPLITTERS:
 343                 yield description, line.rstrip(LINE_SPLITTERS), idx
 344             elif should_be_no_new_lines:
 345                 # self-check
 346                 raise ValueError('Programming error: something went wrong with '
 347                                  'line splitting/buffering.')
 348             else:
 349                 self.line_buffers[idx] = line
 350                 should_be_no_new_lines = True  # (this should be the last)
 351
 352
 353 class LogParser(LineReader):
 354     """
 355     Takes lines from :py:class:`LineReader` and parses their contents.
 356
 357     Requires a pattern for log lines, auto-detection is not implemented yet.
 358
 359     Iteration returns :py:class:`re.match` result or -- if matching failed --
 360     None. The latest unparsed line is available as `self.last_unparsed_line`.
 361     Usage recommendation:
 362
 363         with open(log_file_name, 'rt') as file_handle:
 364             parser = log_read.LogParser(file_handle, pattern=my_pattern):
 365             for _, data, _ in parser:
 366                 if data is None:
 367                     print(f'Failed to parse line {parser.last_unparsed_line}')
 368                     continue
 369                 line_parts = data.groupdict()
 370                 ...do stuff with line_parts...
 371     """
 372
 373     def __init__(self, log_file, pattern=None):
 374         """
 375         Create a LogParser.
 376
 377         :param str log_file: name of log file to parse (required!)
 378         :param pattern: regexp to split log lines; None (default) to return
 379                         line as they are
 380         :type pattern: str or None (default)
 381         """
 382         super(LogParser, self).__init__(log_file)
 383
 384         self.pattern = pattern
 385         self.last_unparsed_line = ''
 386
 387     def prepare_result(self, *args):
 388         """
 389         Try parsing lines.
 390
 391         Args: see super class method :py:meth:`IterativeReader.prepare_result`
 392         :returns: 3-tuples `(description, line, idx)` where `description` and
 393                   `idx` are same as input args and `line` is either a
 394                   :py:class:`re.Match` if line matched :py:data:`self.pattern`
 395                   or just str if line did not match.
 396         :rtype: [(str, :py:class:`re.Match` OR str, int)]
 397         """
 398         # let super class split data into lines
 399         for description, raw_line, idx in \
 400                 super(LogParser, self).prepare_result(*args):
 401             matches = re.match(self.pattern, raw_line)
 402             if matches:
 403                 yield description, matches, idx
 404             else:
 405                 self.last_unparsed_line = raw_line
 406                 yield description, None, idx
 407
 408     @classmethod
 409     @contextmanager
 410     def create_for(cls, filename, *args, **kwargs):
 411         """
 412         Open single file, yield LogParser. Ensures file is closed afterwards.
 413
 414         This allows opening file and creation LogParser for it to one line:
 415
 416             with LogParser.create_for('/var/log/messages', SYS_LOG_PATTERN) as parser:
 417                 for _, matches, _ in parser:
 418                     try:
 419                         print(matches.groupdict())
 420                     except Exception:
 421                         print(f'UNPARSED: {parser.last_unparsed_line}')
 422
 423         :param str filename: something that :py:meth:`open` accepts
 424         :param args: Forwarded to constructor
 425         :param kwargs: Forwarded to constructor
 426         """
 427         with open(filename) as file_handle:
 428             yield cls(file_handle, *args, **kwargs)
 429
 430
 431 ################################################################################
 432 # PATTERNS FOR FREQUENT LOG FILES
 433 ################################################################################
 434
 435 # pattern of squid proxy logs. group names are best guesses
 436 PROXY_LOG_PATTERN = \
 437     r'\s*(?P<timestamp>\d+\.\d+\.\d+\s+\d+:\d+:\d+|\d+\.\d+)\s+(?P<size1>\d+)\s+' \
 438     + r'(?P<ip>\d+\.\d+\.\d+\.\d+)\s+(?P<status_text>[A-Z_]+)/(?P<status_code>\d+)\s+' \
 439     + r'(?P<size2>\d+)\s+(?P<command>\S+)\s+(?P<url>\S+)\s+(?P<user>\S+)\s+' \
 440     + r'(?P<action>[A-Z_]+)/(?P<origin>\S+)\s+(?P<mimetype>\S+)\s+(?P<unknown>.*)\s*'
 441
 442 # pattern for linux system logs (usually "messages" or "syslog" also "maillog"
 443 SYS_LOG_PATTERN = \
 444     r'\s*(?P<timestamp>\w{3} +\d{2} \d{2}:\d{2}:\d{2}) (?P<hostname>\S+) ' \
 445     + r'(?P<procname>[^\[\]:]+)(?:\[(?P<pid>\d+)\])?: (?P<message>.*)'