Return matches or None from LogParser iteration
[pyi2ncommon] / src / log_read.py
CommitLineData
3237d2a6
CH
1# The software in this package is distributed under the GNU General
2# Public License version 2 (with a special exception described below).
3#
4# A copy of GNU General Public License (GPL) is included in this distribution,
5# in the file COPYING.GPL.
6#
7# As a special exception, if other files instantiate templates or use macros
8# or inline functions from this file, or you compile this file and link it
9# with other works to produce a work based on this file, this file
10# does not by itself cause the resulting work to be covered
11# by the GNU General Public License.
12#
13# However the source code for this file must still be made available
14# in accordance with section (3) of the GNU General Public License.
15#
16# This exception does not invalidate any other reasons why a work based
17# on this file might be covered by the GNU General Public License.
f365f614
CH
18#
19# Copyright (c) 2016-2018 Intra2net AG <info@intra2net.com>
3237d2a6 20
e2e13005
CH
21"""
22
23SUMMARY
24------------------------------------------------------
25Iterative reading of log files, similar to shell command `tail -f`.
26
27Copyright: Intra2net AG
28
29
30CONTENTS
31------------------------------------------------------
e7d49180 32
7c362208 33Basic Functionality (class :py:class:`IterativeReader`):
e7d49180
CH
34Runs stat in a loop to find out whether file size has changed. Then reads the
35new data and forwards that
36
e2e13005 37.. todo:: Want to also use lsof to find out whether file/pipe/socket was
545f3458 38 closed, so can automatically return from read loop.
e7d49180
CH
39
40:py:class:`LineReader` takes output of :py:class:`IterativeReader` and returns
41it line-wise as is normal for log files
42
43:py:class:`LogParser` takes those lines and tries to parse them into fields
44like date, time, module name, urgency and message.
45
2713b352 46.. todo:: auto-detect log line layout
e2e13005
CH
47
48
49INTERFACE
50------------------------------------------------------
51
e7d49180
CH
52"""
53
54import os
215a12fd 55import os.path
445afb23 56import re
e7d49180 57from warnings import warn
77a07d09 58import logging
1242b1cf 59from .iter_helpers import zip_longest
1242b1cf 60from .type_helpers import is_str_or_byte, is_file_obj
e7d49180
CH
61
62
63class LogReadWarning(UserWarning):
e2e13005 64 """Warnings issued by classes in this module."""
e7d49180
CH
65 pass
66
67
e2e13005
CH
def true_func(_):
    """Drop-in for :py:func:`check_is_used` that always answers `True`."""
    return True
71
72
545f3458
CH
def false_func(_):
    """Drop-in for :py:func:`check_is_used` that always answers `False`."""
    return False
76
77
e2e13005
CH
def check_is_used(file_handle):
    """
    Determine whether the given file is still being written to.

    Not implemented yet; a future version might shell out to ``lsof``.
    If beneficial, a python file object could also be accepted as arg.

    :param int file_handle: OS-level file descriptor
    :raises NotImplementedError: always, until this is implemented
    """
    raise NotImplementedError(file_handle)
e7d49180
CH
89
90
#: module-level counter used by :py:func:`create_description` to number
#: sources for which neither a name nor a file descriptor is available
_create_description_unknown_counter = 0
93
acc472a4 94
def create_description(file_obj, file_handle):
    """
    Create some description for given file-like object / file descriptor.

    Preference order: the object's own non-empty `name` attribute, then
    the numeric file descriptor, then a global "unknown" counter.

    :param file_obj: file-like object
    :param int file_handle: os-level file descriptor
    :returns: Short description for file-like object
    :rtype: string
    """
    global _create_description_unknown_counter

    # prefer the object's own name, if it has a non-empty one
    name = getattr(file_obj, 'name', None)
    if name:
        return name

    if file_handle is not None:
        return 'file{0}'.format(file_handle)

    # last resort: number the anonymous sources
    _create_description_unknown_counter += 1
    return 'unknown{0}'.format(_create_description_unknown_counter)
118
119
#: error message used by the IterativeReader constructor when a file *name*
#: (string) is passed instead of an open file object / descriptor
_STR_ERR = 'not accepting file name "{0}" since cannot guarantee closing ' \
           'files --> use with open(file_name)!'
123
124
class IterativeReader(object):
    """
    Read continuously from a given file.

    Uses `os.fstat(file_handle).st_size` as measure whether a file has
    changed or not; always reads as much data as possible.

    Does not care about closing files, so does not accept file names.

    This is the base for class :py:class:`LineReader` that just has to
    implement a different :py:meth:`prepare_result` method.
    """

    def __init__(self, sources, descs=None, keep_watching=False):
        """
        Create a reader; do some basic checks on args.

        :param sources: iterable over sources. Sources can be opened file
                        objects or read-opened os-level file descriptors.
                        Calling code has to ensure they are closed properly, so
                        best use this within a "with open(file_name) as
                        file_handle:"-context. If sources is a single file
                        obj/descriptor, both source and desc will be converted
                        to lists of length 1
        :param descs: can be anything of same length as sources. If sources is
                      a single source, then descs is also converted to a list
                      of length 1. If not given (i.e. None), will use
                      :py:func:`create_description` to guess descriptions
        :param bool keep_watching: keep watching file that is not changing in
                                   size. Need to manually tell whether file
                                   is being written to or not since auto-detect
                                   is not implemented yet.
        :raises: OSError when testing fstat on source
        """
        if not sources:
            raise ValueError('need at least some source!')
        elif is_str_or_byte(sources):
            raise ValueError(_STR_ERR.format(sources))
        elif is_file_obj(sources) or isinstance(sources, int):
            source_input = [sources, ]
            desc_input = [descs, ]
        else:
            source_input = sources   # assume some iterable
            desc_input = descs

        # now divide sources into os-level file descriptors for os.fstat,
        # and file objects for read()
        self.file_objs = []
        self.file_handles = []   # file descriptOR, not descriptION
        for source in source_input:
            if is_file_obj(source):
                self.file_objs.append(source)
                self.file_handles.append(source.fileno())
            elif isinstance(source, int):
                # os-level descriptor: wrap it so we can read() from it
                self.file_objs.append(os.fdopen(source))
                self.file_handles.append(source)
            elif is_str_or_byte(source):
                raise ValueError(_STR_ERR.format(source))
            else:
                # bug fix: original left the {0} placeholder unfilled
                raise ValueError('source {0} is neither file obj nor file '
                                 'descriptor!'.format(source))

            # try to fstat the new file descriptor just for testing
            os.fstat(self.file_handles[-1])

        # guess descriptions if not given
        if not desc_input:
            self.descriptions = [create_description(obj, file_handle)
                                 for obj, file_handle
                                 in zip(self.file_objs, self.file_handles)]
        else:
            try:
                if len(desc_input) != len(self.file_objs):
                    raise ValueError('need same number of sources and '
                                     'descriptions!')
            except TypeError:
                pass   # desc_input is generator or so; cannot len() it

            self.descriptions = []
            for obj, file_handle, description in \
                    zip_longest(self.file_objs, self.file_handles,
                                desc_input):
                if obj is None:
                    raise ValueError('more descriptions than sources!')
                elif description is None:
                    self.descriptions.append(create_description(obj,
                                                                file_handle))
                else:
                    self.descriptions.append(description)

        self.last_sizes = [0 for _ in self.file_objs]
        self.ignore = [False for _ in self.file_objs]

        if keep_watching:
            self.is_used_func = true_func
        else:
            self.is_used_func = false_func
        # use some day: self.is_used_func = check_is_used

        for obj, file_handle, description in \
                zip(self.file_objs, self.file_handles, self.descriptions):
            logging.debug('log_read initialized with file descriptor {0}, '
                          'file obj {1}, description "{2}"'
                          .format(file_handle, obj, description))

    def n_sources(self):
        """Return number of sources given to constructor."""
        return len(self.file_objs)

    def n_active_sources(self):
        """Return number of sources we are actually watching."""
        return len(self.ignore) - sum(self.ignore)

    def __iter__(self):
        """
        Continue reading from sources, yield results.

        Yields results of :py:meth:`prepare_result`, which depends on what
        subclass you called this function from. Stops once every source is
        ignored, i.e. is unchanged in size and reported as no longer in use
        by :py:data:`self.is_used_func`.
        """
        while True:
            if all(self.ignore):
                break

            for idx, (obj, file_handle, description, last_size, do_ignore) \
                    in enumerate(zip(self.file_objs, self.file_handles,
                                     self.descriptions, self.last_sizes,
                                     self.ignore)):
                if do_ignore:
                    continue

                # get new file size
                new_size = os.fstat(file_handle).st_size

                # compare to old size
                if new_size == last_size:
                    if not self.is_used_func(file_handle):
                        self.ignore[idx] = True
                else:
                    if new_size < last_size:  # happened at start of some tests
                        warn('{0} / {1} has become smaller ({2} --> {3})! '
                             .format(obj, description, last_size, new_size)
                             + 'Maybe you are reading from a half-initialized '
                             + 'file?',
                             category=LogReadWarning)
                    try:
                        new_data = obj.read()
                    except OSError as ose:   # includes IOErrors
                        # bug fix: removed stray ')' at end of message
                        warn('io error reading from {0} / {1}: {2}'
                             .format(obj, description, ose),
                             category=LogReadWarning)
                        new_data = str(ose)
                    except UnicodeDecodeError as ude:
                        warn('unicode error reading from {0} / {1}: {2}'
                             .format(obj, description, ude),
                             category=LogReadWarning)
                        new_data = str(ude)

                    # post-processing
                    for result in self.prepare_result(description, new_data,
                                                      idx):
                        yield result

                    # prepare next iteration
                    self.last_sizes[idx] = new_size

    def prepare_result(self, description, data, idx):
        """
        From raw new data create some yield-able results.

        Intended for overwriting in subclasses.

        This function is called from __iter__ for each new data that becomes
        available. It has to provide results which are forwarded to caller.

        This base implementation just yields its input, so new data is yielded
        from `__iter__` as-is.

        :param str description: Description of source of lines, one of
                                :py:data:`self.descriptions`
        :param str data: Text data read from source
        :param idx: Index of data source
        :returns: nothing but yields [(description, data, idx)], same as input
        """
        yield description, data, idx
e7d49180
CH
308
309
#: characters to `rstrip()` from end of complete lines; a line ending in
#: one of these is considered complete by :py:class:`LineReader`
LINE_SPLITTERS = '\n\r'
312
acc472a4 313
class LineReader(IterativeReader):
    """
    An :py:class:`IterativeReader` that returns new data line-wise.

    Incomplete trailing lines are buffered per source and prepended to the
    next chunk of data from that source.
    """

    def __init__(self, *args, **kwargs):
        """Create an :py:class:`IterativeReader` and one buffer per source."""
        super(LineReader, self).__init__(*args, **kwargs)
        self.line_buffers = ['' for _ in range(self.n_sources())]

    def prepare_result(self, description, new_data, idx):
        """
        Take raw new data and split it into lines.

        If the last line is not complete, buffer it for the next call.

        Args: see super class method :py:meth:`IterativeReader.prepare_result`
        :returns: list of 3-tuples `(description, line, idx)` where
                  `description` and `idx` are same as args, and `line` is
                  without trailing newline characters
        :rtype: [(str, str, int)]
        """
        pending = self.line_buffers[idx] + new_data
        self.line_buffers[idx] = ''
        saw_partial_line = False
        for piece in pending.splitlines(True):
            if piece[-1] in LINE_SPLITTERS:
                # complete line: strip line end(s) and pass it on
                yield description, piece.rstrip(LINE_SPLITTERS), idx
                continue
            if saw_partial_line:
                # self-check: only the very last piece may lack a line end
                raise ValueError('Programming error: something went wrong with '
                                 'line splitting/buffering.')
            # incomplete line: keep it for the next round of data
            self.line_buffers[idx] = piece
            saw_partial_line = True   # (this should be the last)
351
e7d49180 352
class LogParser(LineReader):
    """
    Takes lines from :py:class:`LineReader` and parses their contents.

    Requires a pattern for log lines, auto-detection is not implemented yet.

    Iteration returns :py:class:`re.match` result or -- if matching failed --
    None. The latest unparsed line is available as `self.last_unparsed_line`.
    Usage recommendation:

    with open(log_file_name, 'rt') as file_handle:
        parser = log_read.LogParser(file_handle, pattern=my_pattern):
        for _, data, _ in parser:
            if data is None:
                print(f'Failed to parse line {parser.last_unparsed_line}')
                continue
            line_parts = data.groupdict()
            ...do stuff with line_parts...
    """

    def __init__(self, log_file, pattern=None):
        """
        Create a LogParser.

        :param log_file: open file object of log file to parse (required!);
                         note that file *names* are rejected by the base class
        :param pattern: regexp to split log lines; None (default) to return
                        lines as they are
        :type pattern: str or None (default)
        """
        super(LogParser, self).__init__(log_file)

        self.pattern = pattern
        self.last_unparsed_line = ''

    def prepare_result(self, *args):
        """
        Try parsing lines.

        Args: see super class method :py:meth:`IterativeReader.prepare_result`
        :returns: 3-tuples `(description, line, idx)` where `description` and
                  `idx` are same as input args and `line` is either a
                  :py:class:`re.Match` if line matched :py:data:`self.pattern`,
                  None if it did not match, or the unmodified str line when
                  no pattern is set.
        :rtype: [(str, :py:class:`re.Match` OR str OR None, int)]
        """
        # let super class split data into lines
        for description, raw_line, idx in \
                super(LogParser, self).prepare_result(*args):
            if self.pattern is None:
                # bug fix: re.match(None, ...) raised TypeError; constructor
                # promises to return lines as they are when no pattern given
                yield description, raw_line, idx
                continue
            matches = re.match(self.pattern, raw_line)
            if matches:
                yield description, matches, idx
            else:
                self.last_unparsed_line = raw_line
                yield description, None, idx