Adjust SYS_LOG_PATTERN to parse single digit day number
[pyi2ncommon] / src / log_read.py
CommitLineData
3237d2a6
CH
1# The software in this package is distributed under the GNU General
2# Public License version 2 (with a special exception described below).
3#
4# A copy of GNU General Public License (GPL) is included in this distribution,
5# in the file COPYING.GPL.
6#
7# As a special exception, if other files instantiate templates or use macros
8# or inline functions from this file, or you compile this file and link it
9# with other works to produce a work based on this file, this file
10# does not by itself cause the resulting work to be covered
11# by the GNU General Public License.
12#
13# However the source code for this file must still be made available
14# in accordance with section (3) of the GNU General Public License.
15#
16# This exception does not invalidate any other reasons why a work based
17# on this file might be covered by the GNU General Public License.
f365f614
CH
18#
19# Copyright (c) 2016-2018 Intra2net AG <info@intra2net.com>
3237d2a6 20
e2e13005 21"""
e2e13005
CH
22Iterative reading of log files, similar to shell command `tail -f`.
23
24Copyright: Intra2net AG
25
7c362208 26Basic Functionality (class :py:class:`IterativeReader`):
e7d49180
CH
Runs stat in a loop to find out whether file size has changed, then reads the
new data and forwards it.
29
e2e13005 30.. todo:: Want to also use lsof to find out whether file/pipe/socket was
545f3458 31 closed, so can automatically return from read loop.
e7d49180
CH
32
33:py:class:`LineReader` takes output of :py:class:`IterativeReader` and returns
34it line-wise as is normal for log files
35
36:py:class:`LogParser` takes those lines and tries to parse them into fields
37like date, time, module name, urgency and message.
38
2713b352 39.. todo:: auto-detect log line layout
e7d49180 40"""
e7d49180 41import os
215a12fd 42import os.path
445afb23 43import re
e7d49180 44from warnings import warn
77a07d09 45import logging
a51235e2 46from contextlib import contextmanager
1242b1cf 47from .iter_helpers import zip_longest
1242b1cf 48from .type_helpers import is_str_or_byte, is_file_obj
e7d49180
CH
49
50
class LogReadWarning(UserWarning):
    """Warnings issued by classes in this module."""
54
55
e2e13005
CH
def true_func(_):
    """Replacement for :py:func:`check_is_used`; always returns `True`."""
    return True


def false_func(_):
    """Replacement for :py:func:`check_is_used`; always returns `False`."""
    return False
64
65
e2e13005
CH
def check_is_used(file_handle):
    """
    Check whether file is being written to.

    To be implemented, e.g. using lsof.

    If beneficial could also easily supply python file object as arg.

    :param int file_handle: OS-level file descriptor
    :raises NotImplementedError: always -- this is a placeholder; callers
                                 currently use :py:func:`true_func` /
                                 :py:func:`false_func` instead
    """
    raise NotImplementedError(file_handle)
e7d49180
CH
77
78
#: counter for unknown sources in :py:func:`create_description`;
#: incremented each time a source has neither a name nor a file descriptor
_create_description_unknown_counter = 0
81
acc472a4 82
def create_description(file_obj, file_handle):
    """
    Create some description for given file-like object / file descriptor.

    Tries, in order: the object's `name` attribute, the numeric file
    descriptor, and finally a generated `unknownN` placeholder.

    :param file_obj: file-like object
    :param int file_handle: os-level file descriptor
    :returns: Short description for file-like object
    :rtype: string
    """
    global _create_description_unknown_counter

    # prefer the file object's own name, if it has a non-empty one
    try:
        name = file_obj.name
    except AttributeError:
        name = None
    if name:
        return name

    if file_handle is not None:
        return 'file{0}'.format(file_handle)

    # last resort: number the anonymous sources
    _create_description_unknown_counter += 1
    return 'unknown{0}'.format(_create_description_unknown_counter)
106
107
#: error message for IterativeReader constructor: file *names* are rejected
#: because this module never closes files -- callers must own the lifetime
_STR_ERR = 'not accepting file name "{0}" since cannot guarantee closing ' \
           'files --> use with open(file_name)!'
111
112
class IterativeReader(object):
    """
    Read continuously from a given file.

    Use `os.stat(file_obj.fileno()).st_size` as measure whether file has
    changed or not; Always reads as much data as possible.

    Does not care about closing files, so does not accept file names.

    This is the base for class :py:class:`LineReader` that just has to
    implement a different :py:meth:`prepare_result` method.
    """

    def __init__(self, sources, descs=None, keep_watching=False):
        """
        Create a reader; do some basic checks on args.

        :param sources: iterable over sources. Sources can be opened file
                        objects or read-opened os-level file descriptors.
                        Calling code has to ensure they are closed properly, so
                        best use this within a "with open(file_name) as
                        file_handle:"-context. If sources is a single file
                        obj/descriptor, both source and desc will be converted
                        to lists of length 1
        :param descs: can be anything of same length as sources. If sources is
                      a single source, then descs is also converted to a list
                      of length 1. If not given (i.e. None), will use
                      :py:func:`create_description` to guess descriptions
        :param bool keep_watching: keep watching file that is not changing in
                                   size. Need to manually tell whether file
                                   is being written to or not since auto-detect
                                   is not implemented yet.
        :raises: OSError when testing fstat on source
        """
        if not sources:
            raise ValueError('need at least some source!')
        elif is_str_or_byte(sources):
            raise ValueError(_STR_ERR.format(sources))
        elif is_file_obj(sources) or isinstance(sources, int):
            # single source given: wrap source and description in lists
            source_input = [sources, ]
            desc_input = [descs, ]
        else:
            source_input = sources  # assume some iterable
            desc_input = descs

        # now divide sources into os-level file descriptors for os.fstat,
        # and file objects for read()
        self.file_objs = []
        self.file_handles = []  # file descriptOR, not descriptION
        for source in source_input:
            if is_file_obj(source):
                self.file_objs.append(source)
                self.file_handles.append(source.fileno())
            elif isinstance(source, int):
                # raw descriptor: create a file object around it for read()
                self.file_objs.append(os.fdopen(source))
                self.file_handles.append(source)
            elif is_str_or_byte(source):
                raise ValueError(_STR_ERR.format(source))
            else:
                raise ValueError('source {0} is neither file obj nor file '
                                 'descriptor!')

            # try to fstat the new file descriptor just for testing
            # (fail early here rather than later inside __iter__)
            os.fstat(self.file_handles[-1])

        # guess descriptions if not given
        if not desc_input:
            self.descriptions = [create_description(obj, file_handle)
                                 for obj, file_handle
                                 in zip(self.file_objs, self.file_handles)]
        else:
            try:
                if len(desc_input) != len(self.file_objs):
                    raise ValueError('need same number of sources and '
                                     'descriptions!')
            except TypeError:
                pass  # desc_input is generator or so

            self.descriptions = []
            # zip_longest so that too-short desc_input yields None entries
            # (filled via create_description) and surplus descs are detected
            for obj, file_handle, description in \
                    zip_longest(self.file_objs, self.file_handles, desc_input):
                if obj is None:
                    raise ValueError('more descriptions than sources!')
                elif description is None:
                    self.descriptions.append(create_description(obj,
                                                                file_handle))
                else:
                    self.descriptions.append(description)

        # per-source state: last observed size and "stop watching" flag
        self.last_sizes = [0 for _ in self.file_objs]
        self.ignore = [False for _ in self.file_objs]

        if keep_watching:
            self.is_used_func = true_func
        else:
            self.is_used_func = false_func
        # use some day: self.is_used_func = check_is_used

        for obj, file_handle, description in \
                zip(self.file_objs, self.file_handles, self.descriptions):
            logging.debug('log_read initialized with file descriptor {0}, '
                          'file obj {1}, description "{2}"'
                          .format(file_handle, obj, description))

    def n_sources(self):
        """Return number of sources given to constructor."""
        return len(self.file_objs)

    def n_active_sources(self):
        """Return number of sources we are actually watching."""
        return len(self.ignore) - sum(self.ignore)

    def __iter__(self):
        """
        Continue reading from sources, yield results.

        yields result of :py:meth:`prepare_result`, which depends on what
        subclass you called this function from.

        Loop ends once every source is flagged in :py:data:`self.ignore`.
        """
        while True:
            if all(self.ignore):
                break

            for idx, (obj, file_handle, description, last_size, do_ignore) in \
                    enumerate(zip(self.file_objs, self.file_handles,
                                  self.descriptions, self.last_sizes,
                                  self.ignore)):
                if do_ignore:
                    continue

                # get new file size
                new_size = os.fstat(file_handle).st_size

                # compare to old size
                if new_size == last_size:
                    # unchanged: drop source unless someone still writes to it
                    if not self.is_used_func(file_handle):
                        self.ignore[idx] = True
                else:
                    if new_size < last_size:  # happened at start of some tests
                        warn('{0} / {1} has become smaller ({2} --> {3})! '
                             .format(obj, description, last_size, new_size)
                             + 'Maybe you are reading from a half-initialized '
                             + 'file?',
                             category=LogReadWarning)
                    try:
                        new_data = obj.read()
                    except OSError as ose:  # includes IOErrors
                        warn('io error reading from {0} / {1}: {2})'
                             .format(obj, description, ose),
                             category=LogReadWarning)
                        new_data = str(ose)
                    except UnicodeDecodeError as ude:
                        warn('unicode error reading from {0} / {1}: {2}'
                             .format(obj, description, ude),
                             category=LogReadWarning)
                        new_data = str(ude)

                    # post-processing
                    for result in self.prepare_result(description, new_data, idx):
                        yield result

                    # prepare next iteration
                    self.last_sizes[idx] = new_size

    def prepare_result(self, description, data, idx):
        """
        From raw new data create some yield-able results.

        Intended for overwriting in subclasses.

        This function is called from `__iter__` for each new data that becomes
        available. It has to provide results which are forwarded to caller.

        This base implementation just yields its input, so new data is yielded
        from `__iter__` as-is.

        :param str description: Description of source of lines, one of
                                :py:data:`self.descriptions`
        :param str data: Text data read from source
        :param idx: Index of data source
        :returns: nothing but yields [(description, data, idx)], same as input
        """
        yield description, data, idx
e7d49180
CH
296
297
#: characters to `rstrip()` from end of complete lines in
#: :py:meth:`LineReader.prepare_result`
LINE_SPLITTERS = '\n\r'
300
acc472a4 301
class LineReader(IterativeReader):
    """
    An :py:class:`IterativeReader` that returns new data line-wise.

    This means buffering partial line data.
    """

    def __init__(self, *args, **kwargs):
        """Create an :py:class:`IterativeReader` and buffers for sources."""
        super(LineReader, self).__init__(*args, **kwargs)
        # one partial-line buffer per source
        self.line_buffers = ['' for _ in range(self.n_sources())]

    def prepare_result(self, description, new_data, idx):
        """
        Take raw new data and split it into lines.

        If the last line is incomplete, keep it buffered until more data
        arrives for this source.

        Args: see super class method :py:meth:`IterativeReader.prepare_result`

        :returns: list of 3-tuples `(description, line, idx)` where
                  `description` and `idx` are same as args, and `line` is
                  without trailing newline characters
        :rtype: [(str, str, int)]
        """
        # prepend whatever was buffered from the previous round
        pending = self.line_buffers[idx] + new_data
        self.line_buffers[idx] = ''
        have_buffered_tail = False
        for piece in pending.splitlines(True):
            if piece[-1] in LINE_SPLITTERS:
                # complete line: strip line end, forward to caller
                yield description, piece.rstrip(LINE_SPLITTERS), idx
            elif have_buffered_tail:
                # self-check: splitlines yields at most one incomplete piece,
                # and only as the very last element
                raise ValueError('Programming error: something went wrong with '
                                 'line splitting/buffering.')
            else:
                # incomplete line: keep it for the next round
                self.line_buffers[idx] = piece
                have_buffered_tail = True
340
e7d49180 341
class LogParser(LineReader):
    """
    Takes lines from :py:class:`LineReader` and parses their contents.

    Requires a pattern for log lines, auto-detection is not implemented yet.
    If no pattern is given, lines are forwarded unparsed (as plain str).

    Iteration returns :py:class:`re.match` result or -- if matching failed --
    None. The latest unparsed line is available as `self.last_unparsed_line`.
    Usage recommendation::

        with open(log_file_name, 'rt') as file_handle:
            parser = log_read.LogParser(file_handle, pattern=my_pattern):
            for _, data, _ in parser:
                if data is None:
                    print(f'Failed to parse line {parser.last_unparsed_line}')
                    continue
                line_parts = data.groupdict()
                ...do stuff with line_parts...
    """

    def __init__(self, log_file, pattern=None):
        """
        Create a LogParser.

        :param str log_file: name of log file to parse (required!)
        :param pattern: regexp to split log lines; None (default) to return
                        line as they are
        :type pattern: str or None (default)
        """
        super(LogParser, self).__init__(log_file)

        self.pattern = pattern
        self.last_unparsed_line = ''

    def prepare_result(self, *args):
        """
        Try parsing lines.

        Args: see super class method :py:meth:`IterativeReader.prepare_result`

        :returns: 3-tuples `(description, line, idx)` where `description` and
                  `idx` are same as input args and `line` is a
                  :py:class:`re.Match` if line matched :py:data:`self.pattern`,
                  the raw str line if no pattern was given, or None if the
                  line did not match the pattern.
        :rtype: [(str, :py:class:`re.Match` OR str OR None, int)]
        """
        # let super class split data into lines
        for description, raw_line, idx in \
                super(LogParser, self).prepare_result(*args):
            if self.pattern is None:
                # no pattern: return lines as they are, as promised by the
                # constructor doc. (Calling re.match(None, ...) would raise
                # a TypeError.)
                yield description, raw_line, idx
                continue
            matches = re.match(self.pattern, raw_line)
            if matches:
                yield description, matches, idx
            else:
                self.last_unparsed_line = raw_line
                yield description, None, idx

    @classmethod
    @contextmanager
    def create_for(cls, filename, *args, **kwargs):
        """
        Open single file, yield LogParser. Ensures file is closed afterwards.

        This allows opening file and creation of LogParser in one line::

            with LogParser.create_for('/var/log/messages', SYS_LOG_PATTERN) as parser:
                for _, matches, _ in parser:
                    try:
                        print(matches.groupdict())
                    except Exception:
                        print(f'UNPARSED: {parser.last_unparsed_line}')

        :param str filename: something that :py:meth:`open` accepts
        :param args: Forwarded to constructor
        :param kwargs: Forwarded to constructor
        """
        with open(filename) as file_handle:
            yield cls(file_handle, *args, **kwargs)
0dd2fb19
CH
419
420
################################################################################
# PATTERNS FOR FREQUENT LOG FILES
################################################################################

# pattern of squid proxy logs. group names are best guesses
PROXY_LOG_PATTERN = (
    r'\s*(?P<timestamp>\d+\.\d+\.\d+\s+\d+:\d+:\d+|\d+\.\d+)\s+(?P<size1>\d+)\s+'
    r'(?P<ip>\d+\.\d+\.\d+\.\d+)\s+(?P<status_text>[A-Z_]+)/(?P<status_code>\d+)\s+'
    r'(?P<size2>\d+)\s+(?P<command>\S+)\s+(?P<url>\S+)\s+(?P<user>\S+)\s+'
    r'(?P<action>[A-Z_]+)/(?P<origin>\S+)\s+(?P<mimetype>\S+)\s+(?P<unknown>.*)\s*'
)

# pattern for linux system logs (usually "messages" or "syslog", also "maillog");
# day-of-month may be a single digit, hence \d{1,2}
SYS_LOG_PATTERN = (
    r'\s*(?P<timestamp>\w{3} +\d{1,2} \d{2}:\d{2}:\d{2}) (?P<hostname>\S+) '
    r'(?P<procname>[^\[\]:]+)(?:\[(?P<pid>\d+)\])?: (?P<message>.*)'
)