To measure how fast your storage is in a Linux environment, you can use the dd command (in my case, it's gdd from GNU coreutils on macOS). The result is a useful upper bound: no log processing on this machine will go faster than the disk can feed it. The commands below write a 1 GiB test file (1024 blocks of 1 MiB) with synchronized writes (oflag=dsync) and then read it back; keep in mind that the read test may be served partly from the page cache, since the file has just been written.
gdd if=/dev/zero bs=1024k of=testfile count=1024 oflag=dsync && \
gdd if=testfile bs=1024k of=/dev/null count=1024 oflag=dsync && \
rm testfile
1024+0 records in
1024+0 records out
1073741824 bytes (1.1 GB, 1.0 GiB) copied, 0.987115 s, 1.1 GB/s // Writing
1024+0 records in
1024+0 records out
1073741824 bytes (1.1 GB, 1.0 GiB) copied, 0.147887 s, 7.3 GB/s // Reading
The logs to be crunched are nginx access logs. By default, nginx writes them in its predefined combined format:
log_format combined '$remote_addr - $remote_user [$time_local] '
                    '"$request" $status $body_bytes_sent '
                    '"$http_referer" "$http_user_agent"';
...
127.0.0.1 - - [19/Nov/2021:14:32:11 +0000] "GET / HTTP/1.1" 200 1664 "-" "curl/7.77.0"
127.0.0.1 - - [19/Nov/2021:14:32:11 +0000] "GET /teapot HTTP/1.1" 418 0 "-" "curl/7.77.0"
127.0.0.1 - - [19/Nov/2021:14:32:11 +0000] "GET /foo HTTP/1.1" 404 153 "-" "curl/7.77.0"
The same fields can be logged as JSON instead, which is much friendlier to machine parsing:
log_format combined-json escape=json
    '{'
    '"remote_addr":"$remote_addr",'
    '"remote_user":"$remote_user",'
    '"time_local":"$time_local",'
    '"request":"$request",'
    '"status":"$status",'
    '"body_bytes_sent":"$body_bytes_sent",'
    '"http_referrer":"$http_referer",'
    '"http_user_agent":"$http_user_agent"'
    '}';
...
{"remote_addr":"127.0.0.1","remote_user":"","time_local":"19/Nov/2021:14:31:28 +0000","request":"GET / HTTP/1.1","status": "200","body_bytes_sent":"1664","http_referrer":"","http_user_agent":"curl/7.77.0"}
{"remote_addr":"127.0.0.1","remote_user":"","time_local":"19/Nov/2021:14:31:28 +0000","request":"GET /teapot HTTP/1.1","status": "418","body_bytes_sent":"0","http_referrer":"","http_user_agent":"curl/7.77.0"}
{"remote_addr":"127.0.0.1","remote_user":"","time_local":"19/Nov/2021:14:31:28 +0000","request":"GET /foo HTTP/1.1","status": "404","body_bytes_sent":"153","http_referrer":"","http_user_agent":"curl/7.77.0"}
When you need to extract and calculate some data from logs, Linux command-line tools are the first thing that comes to mind. These tools (or utilities) were not created specifically for processing logs, but they are very flexible and powerful when combined. And, conveniently, they are already there. Here are some examples of how tools like grep, awk, and others can be used.
time grep "\" 405 " access.log | wc -l
11305
________________________________________________________
Executed in   58.56 secs    fish           external
   usr time   57.02 secs    0.27 millis   57.02 secs
   sys time    1.22 secs    1.32 millis    1.22 secs
grep "405" access.log | wc -l # Wrong result!
8824
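The bare pattern matches the substring "405" anywhere in the line (in IP addresses, byte counts, request paths) and is easily thrown off. A field-aware filter is more reliable; a sketch with awk, relying on the status being the ninth whitespace-separated field of the combined format:
# Count lines whose ninth field (the status code) is exactly 405
awk '$9 == "405"' access.log | wc -l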
time awk '{print $9}' access.log | sort | uniq -c | sort -nr
21510839 404
 889766 200
 115696 400
  14718 304
  11305 405
   8777 206
   3948 416
   3146 403
   2908 411
   1097 HTTP/1.0\"           # Looks like some log lines contain a space in the $remote_user field!
     26 /vicidial/admin.php
     26 /scripts/admin.php
     26 /cgi-bin/admin.php
     26 /admin.php
      7 499
      4 412
________________________________________________________
Executed in   24.48 secs    fish           external
   usr time   30.88 secs    0.39 millis   30.88 secs
   sys time    1.99 secs    1.68 millis    1.99 secs
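The stray values come from lines where $remote_user itself contains a space, which shifts every later field. One way to keep the tally honest is to count only values that actually look like status codes; a sketch:
# Keep only ninth fields that are three-digit numbers
awk '$9 ~ /^[0-9][0-9][0-9]$/ {print $9}' access.log | sort | uniq -c | sort -nr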
To query these logs with ClickHouse, the combined format first needs to become something structured. A perl one-liner converts it to CSV; lines that don't match the pattern, including the garbled ones above, are silently dropped:
perl -ne 'print if s/^(.*?) - .*? \[([^:]*?):(.*?) .*?\] "(.*?) (.*?) (HTTP\/.*?)" (.*?) (.*?) "(.*?)" "(.*?)"$/$1,"$2 $3",$4,"$5",$6,$7,$8,"$9","$10"/g' < access.log > access.csv
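A quick sanity check before loading the file anywhere; the difference between the two line counts is the number of lines the regex dropped:
wc -l access.log access.csv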
Command-line tools get the job done, but every new question costs another full scan measured in minutes. A column-oriented database like ClickHouse can answer the same questions from the same data in fractions of a second, and it ships as a single binary:
# macOS x86_64
curl -O 'https://builds.clickhouse.com/master/macos/clickhouse' && chmod a+x ./clickhouse
# Create directory for ClickHouse data files/directories
mkdir data
# ClickHouse will use the current directory to create files
cd data
# Start the server
./../clickhouse server
Processing configuration file 'config.xml'.
There is no file 'config.xml', will use embedded config.
Logging trace to console
2021.12.31 19:49:27.682457 [ 3587768 ] {} <Information> : Starting ClickHouse 21.12.1.8928 with revision 54457, no build id, PID 14099
# In a second terminal, connect with the client
./clickhouse client --multiline
ClickHouse client version 21.12.1.8928 (official build).
Connecting to localhost:9000 as user default.
Connected to ClickHouse server version 21.12.1 revision 54451.
:) CREATE DATABASE IF NOT EXISTS logs;
CREATE DATABASE IF NOT EXISTS logs
Query id: 3c376c00-1969-41df-af0c-099174e2e32b
Ok.
0 rows in set. Elapsed: 0.002 sec.
:) CREATE TABLE IF NOT EXISTS logs.access
:-] (
:-] remote_addr String,
:-] time_local DateTime,
:-] request_method String,
:-] request_uri String,
:-] http_version String,
:-] status Int32,
:-] body_bytes_sent Int64,
:-] http_referrer String,
:-] http_user_agent String
:-] )
:-] ENGINE = MergeTree()
:-] ORDER BY tuple();
CREATE TABLE IF NOT EXISTS logs.access
(
`remote_addr` String,
`time_local` DateTime,
`request_method` String,
`request_uri` String,
`http_version` String,
`status` Int32,
`body_bytes_sent` Int64,
`http_referrer` String,
`http_user_agent` String
)
ENGINE = MergeTree
ORDER BY tuple()
Query id: 501fa6b3-a16e-4c97-bb40-2d7d58b997ba
Ok.
0 rows in set. Elapsed: 0.005 sec.
:) ^C
:) Bye.
# Load the data into the logs.access table from the CSV file
./clickhouse client --date_time_input_format='best_effort' --query="INSERT INTO logs.access FORMAT CSV" < access.csv
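To verify the load, a count straight from the shell should match the line count of access.csv:
./clickhouse client --query="SELECT count() FROM logs.access"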
Back in the interactive client, the question that took grep almost a minute is answered in milliseconds:
:) SELECT count() FROM logs.access WHERE status=405;
SELECT count()
FROM logs.access
WHERE status = 405
Query id: 76b40738-354c-4ad0-a815-6bf9e211fb54
┌─count()─┐
│   11305 │
└─────────┘
1 rows in set. Elapsed: 0.037 sec. Processed 22.56 million rows, 90.25 MB (607.73 million rows/s., 2.43 GB/s.)
:) SELECT count(), status FROM logs.access GROUP BY status ORDER BY count() DESC;
SELECT
count(),
status
FROM logs.access
GROUP BY status
ORDER BY count() DESC
Query id: f9d01465-54ca-4e73-a961-be2f695ded91
┌──count()─┬─status─┐
│ 21512196 │    404 │
│   889766 │    200 │
│   115800 │    400 │
│    14718 │    304 │
│    11305 │    405 │
│     8777 │    206 │
│     3948 │    416 │
│     3146 │    403 │
│     2908 │    411 │
│        7 │    499 │
│        4 │    412 │
└──────────┴────────┘
11 rows in set. Elapsed: 0.045 sec. Processed 22.56 million rows, 90.25 MB (502.02 million rows/s., 2.01 GB/s.)
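From here, follow-up questions are cheap. As one more sketch, finding the URIs that attract the most 404s:
./clickhouse client --query="
    SELECT request_uri, count() AS hits
    FROM logs.access
    WHERE status = 404
    GROUP BY request_uri
    ORDER BY hits DESC
    LIMIT 10"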