| 1 | # The software in this package is distributed under the GNU General |
| 2 | # Public License version 2 (with a special exception described below). |
| 3 | # |
| 4 | # A copy of GNU General Public License (GPL) is included in this distribution, |
| 5 | # in the file COPYING.GPL. |
| 6 | # |
| 7 | # As a special exception, if other files instantiate templates or use macros |
| 8 | # or inline functions from this file, or you compile this file and link it |
| 9 | # with other works to produce a work based on this file, this file |
| 10 | # does not by itself cause the resulting work to be covered |
| 11 | # by the GNU General Public License. |
| 12 | # |
| 13 | # However the source code for this file must still be made available |
| 14 | # in accordance with section (3) of the GNU General Public License. |
| 15 | # |
| 16 | # This exception does not invalidate any other reasons why a work based |
| 17 | # on this file might be covered by the GNU General Public License. |
| 18 | # |
| 19 | # Copyright (c) 2016-2018 Intra2net AG <info@intra2net.com> |
| 20 | |
| 21 | """ |
| 22 | Iterative reading of log files, similar to shell command `tail -f`. |
| 23 | |
| 24 | Copyright: Intra2net AG |
| 25 | |
| 26 | Basic Functionality (class :py:class:`IterativeReader`): |
Runs stat in a loop to find out whether the file size has changed, then reads
the new data and forwards it.
| 29 | |
.. todo:: Want to also use lsof to find out whether file/pipe/socket was
   closed, so we can automatically return from the read loop.
| 32 | |
:py:class:`LineReader` takes the output of :py:class:`IterativeReader` and
returns it line-wise, as is usual for log files.
| 35 | |
| 36 | :py:class:`LogParser` takes those lines and tries to parse them into fields |
| 37 | like date, time, module name, urgency and message. |
| 38 | |
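Example of following a growing log file line by line (a minimal sketch;
``my_log.txt`` is a placeholder name)::

    with open('my_log.txt', 'rt') as file_handle:
        for description, line, _ in LineReader(file_handle):
            print('{0}: {1}'.format(description, line))
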
| 39 | .. todo:: auto-detect log line layout |
| 40 | """ |
| 41 | import os |
| 42 | import os.path |
| 43 | import re |
| 44 | from warnings import warn |
| 45 | import logging |
| 46 | from contextlib import contextmanager |
| 47 | from .iter_helpers import zip_longest |
| 48 | from .type_helpers import is_str_or_byte, is_file_obj |
| 49 | |
| 50 | |
| 51 | class LogReadWarning(UserWarning): |
| 52 | """Warnings issued by classes in this module.""" |
| 53 | pass |
| 54 | |
| 55 | |
| 56 | def true_func(_): |
| 57 | """Replacement for :py:func:`check_is_used`. Returns `True` always.""" |
| 58 | return True |
| 59 | |
| 60 | |
| 61 | def false_func(_): |
| 62 | """Replacement for :py:func:`check_is_used`. Returns `False` always.""" |
| 63 | return False |
| 64 | |
| 65 | |
| 66 | def check_is_used(file_handle): |
| 67 | """ |
| 68 | Check whether file is being written to. |
| 69 | |
| 70 | To be implemented, e.g. using lsof. |
| 71 | |
    If beneficial, the Python file object could also be supplied as an
    argument.
| 73 | |
| 74 | :param int file_handle: OS-level file descriptor |
| 75 | """ |
| 76 | raise NotImplementedError(file_handle) |
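

def _check_is_used_lsof_sketch(file_handle):
    """
    Sketch of a possible :py:func:`check_is_used` implementation (Linux only).

    Not used anywhere yet; kept only as a starting point. Assumes ``lsof`` is
    installed and that the descriptor refers to a regular file. Returns True
    if any process other than our own has the file open, which approximates
    "is still being written to".

    :param int file_handle: OS-level file descriptor
    """
    import subprocess   # local import since this is only a sketch

    # resolve the descriptor to a path via /proc (Linux-specific)
    path = os.readlink('/proc/self/fd/{0}'.format(file_handle))
    try:
        # -t: terse output, print only PIDs of processes using the file
        output = subprocess.check_output(['lsof', '-t', path])
    except subprocess.CalledProcessError:
        return False   # non-zero exit status: no process has the file open
    pids = set(int(pid) for pid in output.decode().split())
    pids.discard(os.getpid())   # we hold the descriptor ourselves
    return bool(pids)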
| 77 | |
| 78 | |
| 79 | #: counter for unknown sources in :py:func:`create_description` |
| 80 | _create_description_unknown_counter = 0 |
| 81 | |
| 82 | |
| 83 | def create_description(file_obj, file_handle): |
| 84 | """ |
| 85 | Create some description for given file-like object / file descriptor. |
| 86 | |
| 87 | :param file_obj: file-like object |
| 88 | :param int file_handle: os-level file descriptor |
| 89 | :returns: Short description for file-like object |
    :rtype: str
| 91 | """ |
| 92 | global _create_description_unknown_counter |
| 93 | |
| 94 | try: |
| 95 | desc = file_obj.name |
| 96 | if desc: |
| 97 | return desc |
| 98 | except AttributeError: |
| 99 | pass |
| 100 | |
| 101 | if file_handle is not None: |
| 102 | return 'file{0}'.format(file_handle) |
| 103 | else: |
| 104 | _create_description_unknown_counter += 1 |
| 105 | return 'unknown{0}'.format(_create_description_unknown_counter) |
| 106 | |
| 107 | |
| 108 | #: error message for IterativeReader constructor |
_STR_ERR = 'not accepting file name "{0}" since we cannot guarantee closing ' \
           'files --> use "with open(file_name)" instead!'
| 111 | |
| 112 | |
| 113 | class IterativeReader(object): |
| 114 | """ |
| 115 | Read continuously from a given file. |
| 116 | |
    Uses `os.fstat(file_obj.fileno()).st_size` to determine whether a file
    has changed; always reads as much data as possible.
| 119 | |
| 120 | Does not care about closing files, so does not accept file names. |
| 121 | |
| 122 | This is the base for class :py:class:`LineReader` that just has to |
| 123 | implement a different :py:meth:`prepare_result` method. |
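
    A short usage sketch (``some_file.txt`` is a placeholder; iteration ends
    once the file stops growing since `keep_watching` defaults to False)::

        with open('some_file.txt', 'rt') as file_handle:
            for description, new_data, idx in IterativeReader(file_handle):
                print(new_data)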
| 124 | """ |
| 125 | |
| 126 | def __init__(self, sources, descs=None, keep_watching=False): |
| 127 | """ |
| 128 | Create a reader; do some basic checks on args. |
| 129 | |
| 130 | :param sources: iterable over sources. Sources can be opened file |
| 131 | objects or read-opened os-level file descriptors. |
| 132 | Calling code has to ensure they are closed properly, so |
| 133 | best use this within a "with open(file_name) as |
| 134 | file_handle:"-context. If sources is a single file |
| 135 | obj/descriptor, both source and desc will be converted |
| 136 | to lists of length 1 |
| 137 | :param descs: can be anything of same length as sources. If sources is |
| 138 | a single source, then descs is also converted to a list |
| 139 | of length 1. If not given (i.e. None), will use |
| 140 | :py:func:`create_description` to guess descriptions |
| 141 | :param bool keep_watching: keep watching file that is not changing in |
| 142 | size. Need to manually tell whether file |
| 143 | is being written to or not since auto-detect |
| 144 | is not implemented yet. |
| 145 | :raises: OSError when testing fstat on source |
| 146 | """ |
| 147 | if not sources: |
| 148 | raise ValueError('need at least some source!') |
| 149 | elif is_str_or_byte(sources): |
| 150 | raise ValueError(_STR_ERR.format(sources)) |
| 151 | elif is_file_obj(sources) or isinstance(sources, int): |
| 152 | source_input = [sources, ] |
| 153 | desc_input = [descs, ] |
| 154 | else: |
| 155 | source_input = sources # assume some iterable |
| 156 | desc_input = descs |
| 157 | |
| 158 | # now divide sources into os-level file descriptors for os.fstat, |
| 159 | # and file objects for read() |
| 160 | self.file_objs = [] |
| 161 | self.file_handles = [] # file descriptOR, not descriptION |
| 162 | for source in source_input: |
| 163 | if is_file_obj(source): |
| 164 | self.file_objs.append(source) |
| 165 | self.file_handles.append(source.fileno()) |
| 166 | elif isinstance(source, int): |
| 167 | self.file_objs.append(os.fdopen(source)) |
| 168 | self.file_handles.append(source) |
| 169 | elif is_str_or_byte(source): |
| 170 | raise ValueError(_STR_ERR.format(source)) |
| 171 | else: |
                raise ValueError('source {0} is neither file obj nor file '
                                 'descriptor!'.format(source))
| 174 | |
| 175 | # try to fstat the new file descriptor just for testing |
| 176 | os.fstat(self.file_handles[-1]) |
| 177 | |
| 178 | # guess descriptions if not given |
| 179 | if not desc_input: |
| 180 | self.descriptions = [create_description(obj, file_handle) |
| 181 | for obj, file_handle |
| 182 | in zip(self.file_objs, self.file_handles)] |
| 183 | else: |
| 184 | try: |
| 185 | if len(desc_input) != len(self.file_objs): |
| 186 | raise ValueError('need same number of sources and ' |
| 187 | 'descriptions!') |
| 188 | except TypeError: |
                pass  # desc_input is a generator or similar
| 190 | |
| 191 | self.descriptions = [] |
| 192 | for obj, file_handle, description in \ |
| 193 | zip_longest(self.file_objs, self.file_handles, desc_input): |
| 194 | if obj is None: |
| 195 | raise ValueError('more descriptions than sources!') |
| 196 | elif description is None: |
| 197 | self.descriptions.append(create_description(obj, |
| 198 | file_handle)) |
| 199 | else: |
| 200 | self.descriptions.append(description) |
| 201 | |
| 202 | self.last_sizes = [0 for _ in self.file_objs] |
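        # per-source flag, set to True in __iter__ once a source stops
        # growing and is_used_func says it is no longer being written to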
| 203 | self.ignore = [False for _ in self.file_objs] |
| 204 | |
| 205 | if keep_watching: |
| 206 | self.is_used_func = true_func |
| 207 | else: |
| 208 | self.is_used_func = false_func |
| 209 | # use some day: self.is_used_func = check_is_used |
| 210 | |
| 211 | for obj, file_handle, description in \ |
| 212 | zip(self.file_objs, self.file_handles, self.descriptions): |
| 213 | logging.debug('log_read initialized with file descriptor {0}, ' |
| 214 | 'file obj {1}, description "{2}"' |
| 215 | .format(file_handle, obj, description)) |
| 216 | |
| 217 | def n_sources(self): |
| 218 | """Return number of sources given to constructor.""" |
| 219 | return len(self.file_objs) |
| 220 | |
| 221 | def n_active_sources(self): |
| 222 | """Return number of sources we are actually watching.""" |
| 223 | return len(self.ignore) - sum(self.ignore) |
| 224 | |
| 225 | def __iter__(self): |
| 226 | """ |
| 227 | Continue reading from sources, yield results. |
| 228 | |
        Yields results of :py:meth:`prepare_result`, which depend on the
        subclass this method is called from.
| 231 | """ |
| 232 | while True: |
| 233 | if all(self.ignore): |
| 234 | break |
| 235 | |
| 236 | for idx, (obj, file_handle, description, last_size, do_ignore) in \ |
| 237 | enumerate(zip(self.file_objs, self.file_handles, |
| 238 | self.descriptions, self.last_sizes, |
| 239 | self.ignore)): |
| 240 | if do_ignore: |
| 241 | continue |
| 242 | |
| 243 | # get new file size |
| 244 | new_size = os.fstat(file_handle).st_size |
| 245 | |
| 246 | # compare to old size |
| 247 | if new_size == last_size: |
| 248 | if not self.is_used_func(file_handle): |
| 249 | self.ignore[idx] = True |
| 250 | else: |
| 251 | if new_size < last_size: # happened at start of some tests |
| 252 | warn('{0} / {1} has become smaller ({2} --> {3})! ' |
| 253 | .format(obj, description, last_size, new_size) |
| 254 | + 'Maybe you are reading from a half-initialized ' |
| 255 | + 'file?', |
| 256 | category=LogReadWarning) |
| 257 | try: |
| 258 | new_data = obj.read() |
| 259 | except OSError as ose: # includes IOErrors |
                        warn('I/O error reading from {0} / {1}: {2}'
| 261 | .format(obj, description, ose), |
| 262 | category=LogReadWarning) |
| 263 | new_data = str(ose) |
| 264 | except UnicodeDecodeError as ude: |
| 265 | warn('unicode error reading from {0} / {1}: {2}' |
| 266 | .format(obj, description, ude), |
| 267 | category=LogReadWarning) |
| 268 | new_data = str(ude) |
| 269 | |
| 270 | # post-processing |
| 271 | for result in self.prepare_result(description, new_data, idx): |
| 272 | yield result |
| 273 | |
| 274 | # prepare next iteration |
| 275 | self.last_sizes[idx] = new_size |
| 276 | |
| 277 | def prepare_result(self, description, data, idx): |
| 278 | """ |
| 279 | From raw new data create some yield-able results. |
| 280 | |
| 281 | Intended for overwriting in subclasses. |
| 282 | |
        This function is called from `__iter__` for each chunk of new data
        that becomes available. It has to provide results which are forwarded
        to the caller.
| 285 | |
| 286 | This base implementation just yields its input, so new data is yielded |
| 287 | from `__iter__` as-is. |
| 288 | |
| 289 | :param str description: Description of source of lines, one of |
| 290 | :py:data:`self.descriptions` |
| 291 | :param str data: Text data read from source |
| 292 | :param idx: Index of data source |
| 293 | :returns: nothing but yields [(description, data, idx)], same as input |
| 294 | """ |
| 295 | yield description, data, idx |
| 296 | |
| 297 | |
| 298 | #: characters to `rstrip()` from end of complete lines |
| 299 | LINE_SPLITTERS = '\n\r' |
| 300 | |
| 301 | |
| 302 | class LineReader(IterativeReader): |
| 303 | """ |
| 304 | An :py:class:`IterativeReader` that returns new data line-wise. |
| 305 | |
| 306 | This means buffering partial line data. |
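
    For example, if one read yields ``'a\\nb'`` and the next read yields
    ``'c\\n'``, iteration produces the lines ``'a'`` and then ``'bc'``.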
| 307 | """ |
| 308 | |
| 309 | def __init__(self, *args, **kwargs): |
| 310 | """Create an :py:class:`IterativeReader and buffers for sources.""" |
| 311 | super(LineReader, self).__init__(*args, **kwargs) |
| 312 | self.line_buffers = ['' for _ in range(self.n_sources())] |
| 313 | |
| 314 | def prepare_result(self, description, new_data, idx): |
| 315 | """ |
| 316 | Take raw new data and split it into lines. |
| 317 | |
| 318 | If line is not complete, then buffer it. |
| 319 | |
| 320 | Args: see super class method :py:meth:`IterativeReader.prepare_result` |
| 321 | |
        :returns: nothing but yields 3-tuples `(description, line, idx)` where
                  `description` and `idx` are same as args, and `line` is
                  without trailing newline characters
        :rtype: [(str, str, int)]
| 326 | """ |
| 327 | all_data = self.line_buffers[idx] + new_data |
| 328 | self.line_buffers[idx] = '' |
| 329 | should_be_no_new_lines = False |
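        # splitlines(True) keeps line terminators, so a complete line ends in
        # one of LINE_SPLITTERS; at most the last chunk is a partial line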
| 330 | for line in all_data.splitlines(True): |
| 331 | if line[-1] in LINE_SPLITTERS: |
| 332 | yield description, line.rstrip(LINE_SPLITTERS), idx |
| 333 | elif should_be_no_new_lines: |
| 334 | # self-check |
| 335 | raise ValueError('Programming error: something went wrong with ' |
| 336 | 'line splitting/buffering.') |
| 337 | else: |
| 338 | self.line_buffers[idx] = line |
| 339 | should_be_no_new_lines = True # (this should be the last) |
| 340 | |
| 341 | |
| 342 | class LogParser(LineReader): |
| 343 | """ |
| 344 | Takes lines from :py:class:`LineReader` and parses their contents. |
| 345 | |
    Requires a pattern for log lines; auto-detection is not implemented yet.
| 347 | |
    Iteration returns the :py:func:`re.match` result or None if matching
    failed. The latest unparsed line is available as
    `self.last_unparsed_line`.
| 350 | Usage recommendation:: |
| 351 | |
| 352 | with open(log_file_name, 'rt') as file_handle: |
            parser = log_read.LogParser(file_handle, pattern=my_pattern)
| 354 | for _, data, _ in parser: |
| 355 | if data is None: |
| 356 | print(f'Failed to parse line {parser.last_unparsed_line}') |
| 357 | continue |
| 358 | line_parts = data.groupdict() |
| 359 | ...do stuff with line_parts... |
| 360 | """ |
| 361 | |
| 362 | def __init__(self, log_file, pattern=None): |
| 363 | """ |
| 364 | Create a LogParser. |
| 365 | |
| 366 | :param log_file: source of log lines to parse |
| 367 | :type log_file: see arg `sources` of constructor of :py:class:`IterativeReader` |
        :param pattern: regexp to parse log lines; None (default) to return
                        lines as they are
| 370 | :type pattern: str or None (default) |
| 371 | """ |
| 372 | super(LogParser, self).__init__(log_file) |
| 373 | |
| 374 | self.pattern = pattern |
| 375 | self.last_unparsed_line = '' |
| 376 | |
| 377 | def prepare_result(self, *args): |
| 378 | """ |
| 379 | Try parsing lines. |
| 380 | |
| 381 | Args: see super class method :py:meth:`IterativeReader.prepare_result` |
| 382 | |
        :returns: 3-tuples `(description, line, idx)` where `description` and
                  `idx` are same as input args and `line` is a
                  :py:class:`re.Match` if the line matched
                  :py:data:`self.pattern`, the raw line if
                  :py:data:`self.pattern` is None, or None if the line did
                  not match (it is then available as
                  `self.last_unparsed_line`)
        :rtype: [(str, :py:class:`re.Match` OR str OR None, int)]
| 388 | """ |
        # let super class split data into lines
        for description, raw_line, idx in \
                super(LogParser, self).prepare_result(*args):
            if self.pattern is None:
                # no pattern given --> return lines as they are
                yield description, raw_line, idx
                continue
            matches = re.match(self.pattern, raw_line)
            if matches:
                yield description, matches, idx
            else:
                self.last_unparsed_line = raw_line
                yield description, None, idx
| 398 | |
| 399 | @classmethod |
| 400 | @contextmanager |
| 401 | def create_for(cls, filename, *args, **kwargs): |
| 402 | """ |
        Open a single file and yield a LogParser; close the file afterwards.
| 404 | |
        This allows opening a file and creating a LogParser for it in one
        line::
| 406 | |
| 407 | with LogParser.create_for('/var/log/messages', SYS_LOG_PATTERN) as parser: |
| 408 | for _, matches, _ in parser: |
| 409 | try: |
| 410 | print(matches.groupdict()) |
                    except AttributeError:  # matches is None if line unparsed
| 412 | print(f'UNPARSED: {parser.last_unparsed_line}') |
| 413 | |
        :param str filename: something that :py:func:`open` accepts
| 415 | :param args: Forwarded to constructor |
| 416 | :param kwargs: Forwarded to constructor |
| 417 | """ |
| 418 | with open(filename) as file_handle: |
| 419 | yield cls(file_handle, *args, **kwargs) |
| 420 | |
| 421 | |
| 422 | ################################################################################ |
| 423 | # PATTERNS FOR FREQUENT LOG FILES |
| 424 | ################################################################################ |
| 425 | |
# pattern for squid proxy logs; group names are best guesses
| 427 | PROXY_LOG_PATTERN = \ |
| 428 | r'\s*(?P<timestamp>\d+\.\d+\.\d+\s+\d+:\d+:\d+|\d+\.\d+)\s+(?P<size1>\d+)\s+' \ |
| 429 | + r'(?P<ip>\d+\.\d+\.\d+\.\d+)\s+(?P<status_text>[A-Z_]+)/(?P<status_code>\d+)\s+' \ |
| 430 | + r'(?P<size2>\d+)\s+(?P<command>\S+)\s+(?P<url>\S+)\s+(?P<user>\S+)\s+' \ |
| 431 | + r'(?P<action>[A-Z_]+)/(?P<origin>\S+)\s+(?P<mimetype>\S+)\s+(?P<unknown>.*)\s*' |
| 432 | |
# pattern for linux system logs (usually "messages" or "syslog", also "maillog")
| 434 | SYS_LOG_PATTERN = \ |
| 435 | r'\s*(?P<timestamp>\w{3} +\d{1,2} \d{2}:\d{2}:\d{2}) (?P<hostname>\S+) ' \ |
| 436 | + r'(?P<procname>[^\[\]:]+)(?:\[(?P<pid>\d+)\])?: (?P<message>.*)' |