developer.intra2net.com Git - pyi2ncommon/blob - src/log_read.py

   1 # The software in this package is distributed under the GNU General
   2 # Public License version 2 (with a special exception described below).
   3 #
   4 # A copy of GNU General Public License (GPL) is included in this distribution,
   5 # in the file COPYING.GPL.
   6 #
   7 # As a special exception, if other files instantiate templates or use macros
   8 # or inline functions from this file, or you compile this file and link it
   9 # with other works to produce a work based on this file, this file
  10 # does not by itself cause the resulting work to be covered
  11 # by the GNU General Public License.
  12 #
  13 # However the source code for this file must still be made available
  14 # in accordance with section (3) of the GNU General Public License.
  15 #
  16 # This exception does not invalidate any other reasons why a work based
  17 # on this file might be covered by the GNU General Public License.
  18 #
  19 # Copyright (c) 2016-2018 Intra2net AG <info@intra2net.com>
  20
  21 """
  22 Iterative reading of log files, similar to shell command `tail -f`.
  23
  24 Copyright: Intra2net AG
  25
  26 Basic Functionality (class :py:class:`IterativeReader`):
  27 Runs stat in a loop to find out whether file size has changed. Then reads the
  28 new data and forwards that
  29
  30 .. todo:: Want to also use lsof to find out whether file/pipe/socket was
  31           closed, so can automatically return from read loop.
  32
  33 :py:class:`LineReader` takes output of :py:class:`IterativeReader` and returns
  34 it line-wise as is normal for log files
  35
  36 :py:class:`LogParser` takes those lines and tries to parse them into fields
  37 like date, time, module name, urgency and message.
  38
  39 .. todo:: auto-detect log line layout
  40 """
  41 import os
  42 import os.path
  43 import re
  44 from warnings import warn
  45 import logging
  46 from contextlib import contextmanager
  47 from .iter_helpers import zip_longest
  48 from .type_helpers import is_str_or_byte, is_file_obj
  49
  50
  51 class LogReadWarning(UserWarning):
  52     """Warnings issued by classes in this module."""
  53     pass
  54
  55
  56 def true_func(_):
  57     """Replacement for :py:func:`check_is_used`. Returns `True` always."""
  58     return True
  59
  60
  61 def false_func(_):
  62     """Replacement for :py:func:`check_is_used`. Returns `False` always."""
  63     return False
  64
  65
  66 def check_is_used(file_handle):
  67     """
  68     Check whether file is being written to.
  69
  70     To be implemented, e.g. using lsof.
  71
  72     If beneficial could also easily supply python file object as arg.
  73
  74     :param int file_handle: OS-level file descriptor
  75     """
  76     raise NotImplementedError(file_handle)
  77
  78
  79 #: counter for unknown sources in :py:func:`create_description`
  80 _create_description_unknown_counter = 0
  81
  82
  83 def create_description(file_obj, file_handle):
  84     """
  85     Create some description for given file-like object / file descriptor.
  86
  87     :param file_obj: file-like object
  88     :param int file_handle: os-level file descriptor
  89     :returns: Short description for file-like object
  90     :rtype: string
  91     """
  92     global _create_description_unknown_counter
  93
  94     try:
  95         desc = file_obj.name
  96         if desc:
  97             return desc
  98     except AttributeError:
  99         pass
 100
 101     if file_handle is not None:
 102         return 'file{0}'.format(file_handle)
 103     else:
 104         _create_description_unknown_counter += 1
 105         return 'unknown{0}'.format(_create_description_unknown_counter)
 106
 107
 108 #: error message for IterativeReader constructor
 109 _STR_ERR = 'not accepting file name "{0}" since cannot guarantee closing ' \
 110            'files --> use with open(file_name)!'
 111
 112
 113 class IterativeReader(object):
 114     """
 115     Read continuously from a given file.
 116
 117     Use `os.stat(file_obj.fileno()).st_size` as measure whether file has
 118     changed or not; Always reads as much data as possible.
 119
 120     Does not care about closing files, so does not accept file names.
 121
 122     This is the base for class :py:class:`LineReader` that just has to
 123     implement a different :py:meth:`prepare_result` method.
 124     """
 125
 126     def __init__(self, sources, descs=None, keep_watching=False):
 127         """
 128         Create a reader; do some basic checks on args.
 129
 130         :param sources: iterable over sources. Sources can be opened file
 131                         objects or read-opened os-level file descriptors.
 132                         Calling code has to ensure they are closed properly, so
 133                         best use this within a "with open(file_name) as
 134                         file_handle:"-context. If sources is a single file
 135                         obj/descriptor, both source and desc will be converted
 136                         to lists of length 1
 137         :param descs: can be anything of same length as sources. If sources is
 138                       a single source, then descs is also converted to a list
 139                       of length 1. If not given (i.e. None), will use
 140                       :py:func:`create_description` to guess descriptions
 141         :param bool keep_watching: keep watching file that is not changing in
 142                                    size. Need to manually tell whether file
 143                                    is being written to or not since auto-detect
 144                                    is not implemented yet.
 145         :raises: OSError when testing fstat on source
 146         """
 147         if not sources:
 148             raise ValueError('need at least some source!')
 149         elif is_str_or_byte(sources):
 150             raise ValueError(_STR_ERR.format(sources))
 151         elif is_file_obj(sources) or isinstance(sources, int):
 152             source_input = [sources, ]
 153             desc_input = [descs, ]
 154         else:
 155             source_input = sources  # assume some iterable
 156             desc_input = descs
 157
 158         # now divide sources into os-level file descriptors for os.fstat,
 159         # and file objects for read()
 160         self.file_objs = []
 161         self.file_handles = []          # file descriptOR, not descriptION
 162         for source in source_input:
 163             if is_file_obj(source):
 164                 self.file_objs.append(source)
 165                 self.file_handles.append(source.fileno())
 166             elif isinstance(source, int):
 167                 self.file_objs.append(os.fdopen(source))
 168                 self.file_handles.append(source)
 169             elif is_str_or_byte(source):
 170                 raise ValueError(_STR_ERR.format(source))
 171             else:
 172                 raise ValueError('source {0} is neither file obj nor file '
 173                                  'descriptor!')
 174
 175             # try to fstat the new file descriptor just for testing
 176             os.fstat(self.file_handles[-1])
 177
 178         # guess descriptions if not given
 179         if not desc_input:
 180             self.descriptions = [create_description(obj, file_handle)
 181                                  for obj, file_handle
 182                                  in zip(self.file_objs, self.file_handles)]
 183         else:
 184             try:
 185                 if len(desc_input) != len(self.file_objs):
 186                     raise ValueError('need same number of sources and '
 187                                      'descriptions!')
 188             except TypeError:
 189                 pass  # desc_input is generator or so
 190
 191             self.descriptions = []
 192             for obj, file_handle, description in \
 193                     zip_longest(self.file_objs, self.file_handles, desc_input):
 194                 if obj is None:
 195                     raise ValueError('more descriptions than sources!')
 196                 elif description is None:
 197                     self.descriptions.append(create_description(obj,
 198                                                                 file_handle))
 199                 else:
 200                     self.descriptions.append(description)
 201
 202         self.last_sizes = [0 for _ in self.file_objs]
 203         self.ignore = [False for _ in self.file_objs]
 204
 205         if keep_watching:
 206             self.is_used_func = true_func
 207         else:
 208             self.is_used_func = false_func
 209         # use some day: self.is_used_func = check_is_used
 210
 211         for obj, file_handle, description in \
 212                 zip(self.file_objs, self.file_handles, self.descriptions):
 213             logging.debug('log_read initialized with file descriptor {0}, '
 214                           'file obj {1}, description "{2}"'
 215                           .format(file_handle, obj, description))
 216
 217     def n_sources(self):
 218         """Return number of sources given to constructor."""
 219         return len(self.file_objs)
 220
 221     def n_active_sources(self):
 222         """Return number of sources we are actually watching."""
 223         return len(self.ignore) - sum(self.ignore)
 224
 225     def __iter__(self):
 226         """
 227         Continue reading from sources, yield results.
 228
 229         yields result of :py:meth:`prepare_result`, which depends on what
 230         subclass you called this function from.
 231         """
 232         while True:
 233             if all(self.ignore):
 234                 break
 235
 236             for idx, (obj, file_handle, description, last_size, do_ignore) in \
 237                     enumerate(zip(self.file_objs, self.file_handles,
 238                                   self.descriptions, self.last_sizes,
 239                                   self.ignore)):
 240                 if do_ignore:
 241                     continue
 242
 243                 # get new file size
 244                 new_size = os.fstat(file_handle).st_size
 245
 246                 # compare to old size
 247                 if new_size == last_size:
 248                     if not self.is_used_func(file_handle):
 249                         self.ignore[idx] = True
 250                 else:
 251                     if new_size < last_size:  # happened at start of some tests
 252                         warn('{0} / {1} has become smaller ({2} --> {3})! '
 253                              .format(obj, description, last_size, new_size)
 254                              + 'Maybe you are reading from a half-initialized '
 255                              + 'file?',
 256                              category=LogReadWarning)
 257                     try:
 258                         new_data = obj.read()
 259                     except OSError as ose:    # includes IOErrors
 260                         warn('io error reading from {0} / {1}: {2})'
 261                              .format(obj, description, ose),
 262                              category=LogReadWarning)
 263                         new_data = str(ose)
 264                     except UnicodeDecodeError as ude:
 265                         warn('unicode error reading from {0} / {1}: {2}'
 266                              .format(obj, description, ude),
 267                              category=LogReadWarning)
 268                         new_data = str(ude)
 269
 270                     # post-processing
 271                     for result in self.prepare_result(description, new_data, idx):
 272                         yield result
 273
 274                     # prepare next iteration
 275                     self.last_sizes[idx] = new_size
 276
 277     def prepare_result(self, description, data, idx):
 278         """
 279         From raw new data create some yield-able results.
 280
 281         Intended for overwriting in subclasses.
 282
 283         This function is called from `__iter__` for each new data that becomes
 284         available. It has to provide results which are forwarded to caller.
 285
 286         This base implementation just yields its input, so new data is yielded
 287         from `__iter__` as-is.
 288
 289         :param str description: Description of source of lines, one of
 290                                 :py:data:`self.descriptions`
 291         :param str data: Text data read from source
 292         :param idx: Index of data source
 293         :returns: nothing but yields [(description, data, idx)], same as input
 294         """
 295         yield description, data, idx
 296
 297
 298 #: characters to `rstrip()` from end of complete lines
 299 LINE_SPLITTERS = '\n\r'
 300
 301
 302 class LineReader(IterativeReader):
 303     """
 304     An :py:class:`IterativeReader` that returns new data line-wise.
 305
 306     This means buffering partial line data.
 307     """
 308
 309     def __init__(self, *args, **kwargs):
 310         """Create an :py:class:`IterativeReader and buffers for sources."""
 311         super(LineReader, self).__init__(*args, **kwargs)
 312         self.line_buffers = ['' for _ in range(self.n_sources())]
 313
 314     def prepare_result(self, description, new_data, idx):
 315         """
 316         Take raw new data and split it into lines.
 317
 318         If line is not complete, then buffer it.
 319
 320         Args: see super class method :py:meth:`IterativeReader.prepare_result`
 321
 322         :returns: list of 3-tuples `(description, line, idx)` where
 323                   `description` and `idx` are same as args, and `line` is
 324                   without trailing newline characters
 325         :rtype: [(str, str, int)]
 326         """
 327         all_data = self.line_buffers[idx] + new_data
 328         self.line_buffers[idx] = ''
 329         should_be_no_new_lines = False
 330         for line in all_data.splitlines(True):
 331             if line[-1] in LINE_SPLITTERS:
 332                 yield description, line.rstrip(LINE_SPLITTERS), idx
 333             elif should_be_no_new_lines:
 334                 # self-check
 335                 raise ValueError('Programming error: something went wrong with '
 336                                  'line splitting/buffering.')
 337             else:
 338                 self.line_buffers[idx] = line
 339                 should_be_no_new_lines = True  # (this should be the last)
 340
 341
 342 class LogParser(LineReader):
 343     """
 344     Takes lines from :py:class:`LineReader` and parses their contents.
 345
 346     Requires a pattern for log lines, auto-detection is not implemented yet.
 347
 348     Iteration returns :py:class:`re.match` result or -- if matching failed --
 349     None. The latest unparsed line is available as `self.last_unparsed_line`.
 350     Usage recommendation::
 351
 352         with open(log_file_name, 'rt') as file_handle:
 353             parser = log_read.LogParser(file_handle, pattern=my_pattern):
 354             for _, data, _ in parser:
 355                 if data is None:
 356                     print(f'Failed to parse line {parser.last_unparsed_line}')
 357                     continue
 358                 line_parts = data.groupdict()
 359                 ...do stuff with line_parts...
 360     """
 361
 362     def __init__(self, log_file, pattern=None):
 363         """
 364         Create a LogParser.
 365
 366         :param str log_file: name of log file to parse (required!)
 367         :param pattern: regexp to split log lines; None (default) to return
 368                         line as they are
 369         :type pattern: str or None (default)
 370         """
 371         super(LogParser, self).__init__(log_file)
 372
 373         self.pattern = pattern
 374         self.last_unparsed_line = ''
 375
 376     def prepare_result(self, *args):
 377         """
 378         Try parsing lines.
 379
 380         Args: see super class method :py:meth:`IterativeReader.prepare_result`
 381
 382         :returns: 3-tuples `(description, line, idx)` where `description` and
 383                   `idx` are same as input args and `line` is either a
 384                   :py:class:`re.Match` if line matched :py:data:`self.pattern`
 385                   or just str if line did not match.
 386         :rtype: [(str, :py:class:`re.Match` OR str, int)]
 387         """
 388         # let super class split data into lines
 389         for description, raw_line, idx in \
 390                 super(LogParser, self).prepare_result(*args):
 391             matches = re.match(self.pattern, raw_line)
 392             if matches:
 393                 yield description, matches, idx
 394             else:
 395                 self.last_unparsed_line = raw_line
 396                 yield description, None, idx
 397
 398     @classmethod
 399     @contextmanager
 400     def create_for(cls, filename, *args, **kwargs):
 401         """
 402         Open single file, yield LogParser. Ensures file is closed afterwards.
 403
 404         This allows opening file and creation LogParser for it to one line::
 405
 406             with LogParser.create_for('/var/log/messages', SYS_LOG_PATTERN) as parser:
 407                 for _, matches, _ in parser:
 408                     try:
 409                         print(matches.groupdict())
 410                     except Exception:
 411                         print(f'UNPARSED: {parser.last_unparsed_line}')
 412
 413         :param str filename: something that :py:meth:`open` accepts
 414         :param args: Forwarded to constructor
 415         :param kwargs: Forwarded to constructor
 416         """
 417         with open(filename) as file_handle:
 418             yield cls(file_handle, *args, **kwargs)
 419
 420
 421 ################################################################################
 422 # PATTERNS FOR FREQUENT LOG FILES
 423 ################################################################################
 424
 425 # pattern of squid proxy logs. group names are best guesses
 426 PROXY_LOG_PATTERN = \
 427     r'\s*(?P<timestamp>\d+\.\d+\.\d+\s+\d+:\d+:\d+|\d+\.\d+)\s+(?P<size1>\d+)\s+' \
 428     + r'(?P<ip>\d+\.\d+\.\d+\.\d+)\s+(?P<status_text>[A-Z_]+)/(?P<status_code>\d+)\s+' \
 429     + r'(?P<size2>\d+)\s+(?P<command>\S+)\s+(?P<url>\S+)\s+(?P<user>\S+)\s+' \
 430     + r'(?P<action>[A-Z_]+)/(?P<origin>\S+)\s+(?P<mimetype>\S+)\s+(?P<unknown>.*)\s*'
 431
 432 # pattern for linux system logs (usually "messages" or "syslog" also "maillog"
 433 SYS_LOG_PATTERN = \
 434     r'\s*(?P<timestamp>\w{3} +\d{2} \d{2}:\d{2}:\d{2}) (?P<hostname>\S+) ' \
 435     + r'(?P<procname>[^\[\]:]+)(?:\[(?P<pid>\d+)\])?: (?P<message>.*)'