forked from vectordotdev/vrl
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathlog_util.rs
259 lines (247 loc) · 16.1 KB
/
log_util.rs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
use std::collections::BTreeMap;
use crate::compiler::TimeZone;
use crate::value::Value;
use chrono::prelude::{DateTime, Utc};
use once_cell::sync::Lazy;
use regex::{Captures, Regex};
// Information about the common log format taken from the
// - W3C specification: https://www.w3.org/Daemon/User/Config/Logging.html#common-logfile-format
// - Apache HTTP Server docs: https://httpd.apache.org/docs/1.3/logs.html#common
/// Patterns for the Apache common log format, compiled on first access.
///
/// The vector holds a single verbose-mode (`(?x)`) regex. Its named groups are `host`,
/// `identity`, `user`, `timestamp`, `message`, `method`, `path`, `protocol`, `status`
/// and `size`. Most fields are written as `(-|(?P<name>…))`, so a bare `-` can match
/// without populating the named group.
///
/// # Panics
/// The initializer calls `expect`, so the first access panics if the pattern fails to
/// compile.
pub(crate) static REGEX_APACHE_COMMON_LOG: Lazy<Vec<Regex>> = Lazy::new(|| {
vec![
Regex::new(
r#"(?x) # Ignore whitespace and comments in the regex expression.
^\s* # Start with any number of whitespaces.
(-|(?P<host>.*?))\s+ # Match `-` or any character (non-greedily) and at least one whitespace.
(-|(?P<identity>.*?))\s+ # Match `-` or any character (non-greedily) and at least one whitespace.
(-|(?P<user>.*?))\s+ # Match `-` or any character (non-greedily) and at least one whitespace.
(-|\[(-|(?P<timestamp>[^\[]*))\])\s+ # Match `-` or `[` followed by `-` or any character except `]`, `]` and at least one whitespace.
(-|"(-|(\s* # Match `-` or `"` followed by `-` or and any number of whitespaces...
(?P<message>( # Match a request with...
(?P<method>\w+)\s+ # Match at least one word character and at least one whitespace.
(?P<path>[[\\"][^"]]*?)\s+ # Match any character except `"`, but `\"` (non-greedily) and at least one whitespace.
(?P<protocol>[[\\"][^"]]*?)\s* # Match any character except `"`, but `\"` (non-greedily) and any number of whitespaces.
|[[\\"][^"]]*?))\s*))" # ...Or match any character except `"`, but `\"`, and any amount of whitespaces.
)\s+ # Match at least one whitespace.
(-|(?P<status>\d+))\s+ # Match `-` or at least one digit and at least one whitespace.
(-|(?P<size>\d+)) # Match `-` or at least one digit.
\s*$ # Match any number of whitespaces (to be discarded).
"#)
.expect("failed compiling regex for common log")
]
});
// - Apache HTTP Server docs: https://httpd.apache.org/docs/1.3/logs.html#combined
/// Patterns for the Apache combined log format, compiled on first access.
///
/// The vector holds a single verbose-mode (`(?x)`) regex. Its named groups are `host`,
/// `identity`, `user`, `timestamp`, `message`, `method`, `path`, `protocol`, `status`,
/// `size`, `referrer` and `agent`. Most fields are written as `(-|(?P<name>…))`, so a
/// bare `-` can match without populating the named group.
///
/// NOTE(review): the last pattern line begins with `#`, which makes the trailing
/// `\s*$` a verbose-mode comment, so this regex is not anchored at the end of the
/// input. Confirm that accepting trailing text after the agent field is intentional.
///
/// # Panics
/// The initializer calls `expect`, so the first access panics if the pattern fails to
/// compile.
pub(crate) static REGEX_APACHE_COMBINED_LOG: Lazy<Vec<Regex>> = Lazy::new(|| {
vec![
Regex::new(
r#"(?x) # Ignore whitespace and comments in the regex expression.
^\s* # Start with any number of whitespaces.
(-|(?P<host>.*?))\s+ # Match `-` or any character (non-greedily) and at least one whitespace.
(-|(?P<identity>.*?))\s+ # Match `-` or any character (non-greedily) and at least one whitespace.
(-|(?P<user>.*?))\s+ # Match `-` or any character (non-greedily) and at least one whitespace.
(-|\[(-|(?P<timestamp>[^\[]*))\])\s+ # Match `-` or `[` followed by `-` or any character except `]`, `]` and at least one whitespace.
(-|"(-|(\s* # Match `-` or `"` followed by `-` or and any number of whitespaces...
(?P<message>( # Match a request with...
(?P<method>\w+)\s+ # Match at least one word character and at least one whitespace.
(?P<path>[[\\"][^"]]*?)\s+ # Match any character except `"`, but `\"` (non-greedily) and at least one whitespace.
(?P<protocol>[[\\"][^"]]*?)\s* # Match any character except `"`, but `\"` (non-greedily) and any number of whitespaces.
|[[\\"][^"]]*?))\s*))" # ...Or match any character except `"`, but `\"`, and any amount of whitespaces.
)\s+ # Match at least one whitespace.
(-|(?P<status>\d+))\s+ # Match `-` or at least one digit and at least one whitespace.
(-|(?P<size>\d+))\s+ # Match `-` or at least one digit.
(-|"(-|(\s* # Match `-` or `"` followed by `-` or and any number of whitespaces...
(?P<referrer>[[\\"][^"]]*?) # Match any character except `"`, but `\"`
"))) # Match the closing quote
\s+ # Match whitespace
(-|"(-|(\s* # Match `-` or `"` followed by `-` or and any number of whitespaces...
(?P<agent>[[\\"][^"]]*?) # Match any character except `"`, but `\"`
"))) # Match the closing quote
#\s*$ # Match any number of whitespaces (to be discarded).
"#)
.expect("failed compiling regex for combined log")
]
});
// It is possible to customise the format output by apache.
/// Patterns for Apache error log lines, compiled on first access.
///
/// The vector holds two verbose-mode (`(?x)`) regexes, in this order:
///
/// 1. The "simple" format, with named groups `timestamp`, `module`, `severity`, `pid`,
///    `thread`, `client`, `port` and `message`.
/// 2. The threaded MPM format, with named groups `timestamp`, `module`, `severity`,
///    `pid`, `thread`, `message1`, `client`, `port` and `message2`.
///
/// NOTE(review): in the second regex the group `(, referer .*)?` contains unescaped
/// spaces, which verbose mode ignores (the comment on that line shows the escaped
/// form `,\ referer\ `). It also follows the greedy `(?P<message2>.*)`. Confirm
/// whether the referer suffix is ever meant to be split off from `message2`.
///
/// # Panics
/// The initializer calls `expect`, so the first access panics if either pattern fails
/// to compile.
pub(crate) static REGEX_APACHE_ERROR_LOG: Lazy<Vec<Regex>> = Lazy::new(|| {
vec![
// Simple format
// https://github.com/mingrammer/flog/blob/9bc83b14408ca446e934c32e4a88a81a46e78d83/log.go#L16
Regex::new(
r"(?x) # Ignore whitespace and comments in the regex expression.
^\s* # Start with any number of whitespaces.
(-|\[(-|(?P<timestamp>[^\[]*))\])\s+ # Match `-` or `[` followed by `-` or any character except `]`, `]` and at least one whitespace.
(-|\[(-|(?P<module>[^:]*): # Match `-` or `[` followed by `-` or any character except `:`.
(?P<severity>[^\[]*))\])\s+ # Match ary character except `]`, `]` and at least one whitespace.
(-|\[\s*pid\s*(-|(?P<pid>[^:]*) # Match `-` or `[` followed by `pid`, `-` or any character except `:`.
(:\s*tid\s*(?P<thread>[^\[]*))?)\])\s # Match `tid` followed by any character except `]`, `]` and at least one whitespace.
(-|\[\s*client\s*(-|(?P<client>.*:?): # Match `-` or `[` followed by `client`, `-` or any character until the first or last `:` for the port.
(?P<port>[^\[]*))\])\s # Match `-` or `[` followed by `-` or any character except `]`, `]` and at least one whitespace.
(-|(?P<message>.*)) # Match `-` or any character.
\s*$ # Match any number of whitespaces (to be discarded).
")
.expect("failed compiling regex for error log"),
// threaded MPM format
// https://httpd.apache.org/docs/current/mod/core.html#errorlogformat
Regex::new(
r"(?x) # Ignore whitespace and comments in the regex expression.
^\s* # Start with any number of whitespaces.
\[(?P<timestamp>[^\]]+)\]\s+ # [%{u}t]
\[(-|(?P<module>[^:]+)):(?P<severity>[^\]]+)\]\s+ # [%-m:%l]
\[pid\s+(?P<pid>\d+)(:tid\s+(?P<thread>\d+))?\]\s+ # [pid %P:tid %T]
(?P<message1>[^\[]*?:\s+([^\[]*?:\s+)?)? # %7F: %E:
(\[client\s+(?P<client>.+?):(?P<port>\d+)\]\s+)? # [client\ %a]
(?P<message2>.*) # %M
(, referer .*)? # ,\ referer\ %{Referer}
\s*$ # Match any number of whitespaces (to be discarded).
")
.expect("failed compiling regex for error log")
]
});
// - Nginx HTTP Server docs: http://nginx.org/en/docs/http/ngx_http_log_module.html
/// Pattern for Nginx combined access log lines, compiled on first access.
///
/// A single verbose-mode (`(?x)`) regex with the named groups `client`, `user`,
/// `timestamp`, `request`, `status`, `size`, `referer`, `agent` and `compression`.
/// The trailing quoted `compression` field is optional, and a `-` in the `client`,
/// `user` or `compression` positions can match without populating the named group.
///
/// # Panics
/// The initializer calls `expect`, so the first access panics if the pattern fails to
/// compile.
pub(crate) static REGEX_NGINX_COMBINED_LOG: Lazy<Regex> = Lazy::new(|| {
Regex::new(
r#"(?x) # Ignore whitespace and comments in the regex expression.
^\s* # Start with any number of whitespaces.
(-|(?P<client>\S+))\s+ # Match `-` or any non space character
\-\s+ # Always a dash
(-|(?P<user>\S+))\s+ # Match `-` or any non space character
\[(?P<timestamp>.+)\]\s+ # Match date between brackets
"(?P<request>[^"]*)"\s+ # Match any non double-quote character
(?P<status>\d+)\s+ # Match numbers
(?P<size>\d+)\s+ # Match numbers
"(?P<referer>[^"]*)"\s+ # Match any non double-quote character
"(?P<agent>[^"]*)" # Match any non double-quote character
(\s+"(-|(?P<compression>[^"]+))")? # Match `-` or any non double-quote character
\s*$ # Match any number of whitespaces (to be discarded).
"#)
.expect("failed compiling regex for Nginx combined log")
});
// - Ingress Nginx docs: https://kubernetes.github.io/ingress-nginx/user-guide/nginx-configuration/log-format/
/// Pattern for Ingress Nginx "upstreaminfo" access log lines, compiled on first access.
///
/// A single verbose-mode (`(?x)`) regex with the named groups `remote_addr`,
/// `remote_user`, `timestamp`, `request`, `status`, `body_bytes_size`, `http_referer`,
/// `http_user_agent`, `request_length`, `request_time`, `proxy_upstream_name`,
/// `proxy_alternative_upstream_name`, `upstream_addr`, `upstream_response_length`,
/// `upstream_response_time`, `upstream_status` and `req_id`. Fields written as
/// `(-|(?P<name>…))` can match a bare `-` without populating the named group, and the
/// brackets for `proxy_alternative_upstream_name` may be empty.
///
/// # Panics
/// The initializer calls `expect`, so the first access panics if the pattern fails to
/// compile.
pub(crate) static REGEX_INGRESS_NGINX_UPSTREAMINFO_LOG: Lazy<Regex> = Lazy::new(|| {
Regex::new(
r#"(?x) # Ignore whitespace and comments in the regex expression.
^\s* # Start with any number of whitespaces
(-|(?P<remote_addr>\S+))\s+ # Match `-` or any non space character
\-\s+ # Always a dash
(-|(?P<remote_user>\S+))\s+ # Match `-` or any non space character
\[(?P<timestamp>[^\]]+)\]\s+ # Match date between brackets
"(?P<request>[^"]*)"\s+ # Match any non double-quote character
(?P<status>\d+)\s+ # Match numbers
(?P<body_bytes_size>\d+)\s+ # Match numbers
"(-|(?P<http_referer>[^"]+))"\s+ # Match `-` or any non double-quote character
"(-|(?P<http_user_agent>[^"]+))"\s+ # Match `-` or any non double-quote character
(?P<request_length>\d+)\s+ # Match numbers
(?P<request_time>\d+\.\d+)\s+ # Match numbers with dot
\[(?P<proxy_upstream_name>[^\]]+)\]\s+ # Match all characters within square brackets
\[(?P<proxy_alternative_upstream_name>[^\]]+)?\]\s+ # Match all characters within square brackets, optional
(?P<upstream_addr>\S+)\s+ # Match any non space character
(-|(?P<upstream_response_length>\d+))\s+ # Match `-` or numbers
(-|(?P<upstream_response_time>\d+\.\d+))\s+ # Match `-` or numbers with dot
(-|(?P<upstream_status>\d+))\s+ # Match `-` or numbers
(?P<req_id>\S+) # Match any non space character
\s*$ # Match any number of whitespaces (to be discarded).
"#)
.expect("failed compiling regex for Ingress Nginx upstreaminfo log")
});
/// Pattern for Nginx error log lines, compiled on first access.
///
/// A single verbose-mode (`(?x)`) regex with the named groups `timestamp`, `severity`,
/// `pid`, `tid`, `cid`, `message`, `excess`, `zone`, `client`, `server`, `request`,
/// `upstream`, `host` and `referer`. Everything after `message` is an optional
/// `, key: value` suffix, matched in the fixed order in which it appears in the
/// pattern. The last suffix accepts both the `referer:` and `referrer:` spellings.
///
/// # Panics
/// The initializer calls `expect`, so the first access panics if the pattern fails to
/// compile.
pub(crate) static REGEX_NGINX_ERROR_LOG: Lazy<Regex> = Lazy::new(|| {
Regex::new(
r#"(?x) # Ignore whitespace and comments in the regex expression.
^\s* # Start with any number of whitespaces.
(?P<timestamp>.+)\s+ # Match any character until [
\[(?P<severity>\w+)\]\s+ # Match any word character
(?P<pid>\d+)\# # Match any number
(?P<tid>\d+): # Match any number
(\s+\*(?P<cid>\d+))? # Match any number
\s+(?P<message>[^,]*) # Match any character
(,\s+excess:\s+(?P<excess>[^\s]+)\sby\szone\s"(?P<zone>[^,]+)")? # Match any character after ', excess: ' until ' by zone ' and the rest of characters
(,\s+client:\s+(?P<client>[^,]+))? # Match any character after ', client: '
(,\s+server:\s+(?P<server>[^,]*))? # Match any character after ', server: '
(,\s+request:\s+"(?P<request>[^"]*)")? # Match any character after ', request: '
(,\s+upstream:\s+"(?P<upstream>[^"]*)")? # Match any character after ', upstream: '
(,\s+host:\s+"(?P<host>[^"]*)")? # Match any character then ':' then any character after ', host: '
(,\s+refer?rer:\s+"(?P<referer>[^"]*)")? # Match any character after ', referrer: '
\s*$ # Match any number of whitespaces (to be discarded).
"#)
.expect("failed compiling regex for Nginx error log")
});
/// Parses `time` with the strftime-style `format` and returns it as a UTC timestamp.
///
/// The string is first handed to `timezone.datetime_from_str`. If that fails, it is
/// parsed again with `DateTime::parse_from_str` and the result is converted to UTC.
///
/// # Errors
/// Returns a message naming the timestamp, the format and the error of the second
/// attempt when both attempts fail.
fn parse_time(
    time: &str,
    format: &str,
    timezone: &TimeZone,
) -> std::result::Result<DateTime<Utc>, String> {
    match timezone.datetime_from_str(time, format) {
        Ok(parsed) => Ok(parsed),
        // The first error is discarded; only the fallback's error is reported.
        Err(_) => DateTime::parse_from_str(time, format)
            .map(Into::into)
            .map_err(|err| format!("failed parsing timestamp {time} using format {format}: {err}")),
    }
}
/// Converts the text of one named capture into a `Value`, choosing the type by name.
///
/// * `timestamp` is parsed with `parse_time` using `timestamp_format` and `timezone`.
/// * The integer-like names (`status`, `size`, `pid`, `tid`, `cid`, `port`,
///   `body_bytes_size`, `request_length`, `upstream_response_length`,
///   `upstream_status`) become `Value::Integer`.
/// * `excess`, `request_time` and `upstream_response_time` become `Value::Float`.
/// * Every other name is kept verbatim as `Value::Bytes`.
///
/// # Errors
/// Returns the `parse_time` error for a bad timestamp, or `failed parsing <name>` when
/// a numeric field cannot be parsed.
fn capture_value(
    name: &str,
    value: &str,
    timestamp_format: &str,
    timezone: &TimeZone,
) -> std::result::Result<Value, String> {
    let converted = match name {
        "timestamp" => parse_time(value, timestamp_format, timezone).map(Value::Timestamp)?,
        "status"
        | "size"
        | "pid"
        | "tid"
        | "cid"
        | "port"
        | "body_bytes_size"
        | "request_length"
        | "upstream_response_length"
        | "upstream_status" => value
            .parse()
            .map(Value::Integer)
            .map_err(|_| format!("failed parsing {name}"))?,
        "excess" | "request_time" | "upstream_response_time" => value
            .parse()
            .map(Value::Float)
            .map_err(|_| format!("failed parsing {name}"))?,
        _ => Value::Bytes(String::from(value).into()),
    };
    Ok(converted)
}
/// Builds an object `Value` from the named groups of `regex` that matched in `captures`.
///
/// Unnamed groups and named groups that did not participate in the match are skipped.
/// Each remaining capture is converted with `capture_value`, keyed by its group name.
///
/// # Errors
/// Stops at, and returns, the first conversion error reported by `capture_value`.
pub(crate) fn log_fields(
    regex: &Regex,
    captures: &Captures,
    timestamp_format: &str,
    timezone: &TimeZone,
) -> std::result::Result<Value, String> {
    let mut fields: BTreeMap<String, Value> = BTreeMap::new();
    // `capture_names` yields `None` for unnamed groups; `flatten` drops those.
    for group in regex.capture_names().flatten() {
        if let Some(matched) = captures.name(group) {
            let field = capture_value(group, matched.as_str(), timestamp_format, timezone)?;
            fields.insert(group.to_string(), field);
        }
    }
    Ok(fields.into())
}
/// Tries each regex in `regexes` in order and extracts the fields of the first match.
///
/// The first regex that matches `message` decides the outcome: its captures are passed
/// to `log_fields` and that result is returned as-is. If field conversion fails, the
/// error is returned and later regexes are not tried.
///
/// # Errors
/// Returns `failed parsing <log_type> log line` when no regex matches, or the error
/// from `log_fields` for the first regex that does.
pub(crate) fn parse_message(
    regexes: &Vec<Regex>,
    message: &str,
    timestamp_format: &str,
    timezone: &TimeZone,
    log_type: &str,
) -> std::result::Result<Value, String> {
    regexes
        .iter()
        .find_map(|regex| {
            regex
                .captures(message)
                .map(|captures| log_fields(regex, &captures, timestamp_format, timezone))
        })
        .unwrap_or_else(|| Err(format!("failed parsing {log_type} log line")))
}