Preface
- TIG is the stack of Telegraf, InfluxDB, and Grafana.
- This architecture is leaner than the traditional Prometheus stack. Although the open-source edition of InfluxDB has no clustered deployment, the solution is simple and effective for small and medium monitoring workloads.
- This article uses docker-compose to demonstrate how to stand the stack up quickly and what the result looks like.
Deployment
docker-compose.yaml
version: '3'
networks:
  monitor:
    driver: bridge

# service definitions
services:
  # Grafana alert forwarding; login: prometheusalert / prometheusalert
  prometheusalert:
    image: feiyu563/prometheus-alert
    container_name: prometheusalert
    hostname: prometheusalert
    restart: always
    ports:
      - 8087:8080
    networks:
      - monitor
    volumes:
      - ./docker/prometheusalert/conf:/app/conf
      - ./docker/prometheusalert/db:/app/db
    environment:
      - PA_LOGIN_USER=prometheusalert
      - PA_LOGIN_PASSWORD=prometheusalert
      - PA_TITLE=PrometheusAlert
      - PA_OPEN_FEISHU=1

  # dashboard UI; default login: admin / admin
  grafana:
    image: grafana/grafana
    container_name: grafana
    hostname: grafana
    restart: always
    volumes:
      - ./docker/grafana/data/grafana:/var/lib/grafana
    ports:
      - "3000:3000"
    networks:
      - monitor

  # InfluxDB v2 (ships with its own admin UI); login: root / root
  influxdb:
    image: influxdb
    container_name: influxdb
    environment:
      INFLUX_DB: test                # may have no effect
      INFLUXDB_USER: root            # may have no effect
      INFLUXDB_USER_PASSWORD: root   # may have no effect
    ports:
      - "8086:8086"
    restart: always
    volumes:
      - ./docker/influxdb/:/var/lib/influxdb
    networks:
      - monitor

  # InfluxDB v1 (kept for reference)
  #influxdb1x:
  #  image: influxdb:1.8
  #  container_name: influxdb1.8
  #  environment:
  #    INFLUXDB_DB: test
  #    INFLUXDB_ADMIN_ENABLED: true
  #    INFLUXDB_ADMIN_USER: root
  #    INFLUXDB_ADMIN_PASSWORD: root
  #  ports:
  #    - "8098:8086"
  #  restart: always
  #  volumes:
  #    - ./docker/influxdb1x/influxdb1x.conf:/etc/influxdb/influxdb.conf
  #    - ./docker/influxdb1x/:/var/lib/influxdb
  #  networks:
  #    - monitor
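With the compose file saved as docker-compose.yaml, the whole stack can be brought up and checked in one step:

# start InfluxDB, Grafana and PrometheusAlert in the background
docker-compose up -d

# confirm that all three containers are running
docker-compose ps

InfluxDB is then reachable on port 8086, Grafana on 3000, and PrometheusAlert on 8087.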
Telegraf installation (official docs)
# telegraf is the collection agent; it runs on the host that is the source of the
# monitoring data. Detailed install guides are on the official site; a yum-based
# Linux server is used as the example below.
# Add the package repository
cat <<EOF | sudo tee /etc/yum.repos.d/influxdb.repo
[influxdb]
name = InfluxData Repository - Stable
baseurl = https://repos.influxdata.com/stable/\$basearch/main
enabled = 1
gpgcheck = 1
gpgkey = https://repos.influxdata.com/influxdata-archive_compat.key
EOF

# Install
sudo yum install telegraf

# Verify
telegraf --help
Usage
- Log in to InfluxDB at http://localhost:8086.
- On first login you will be prompted to create an account.
- An org (organization) is a logical partition (tenant).
- A bucket is roughly the equivalent of a database.
- Configure the Telegraf collection (ETL) pipeline (see the official Telegraf input plugin catalogue).
- Create a token for accessing the InfluxDB data.
- Write the Telegraf configuration file.
- Start the Telegraf collection (ETL) on the agent host.
a. Create the token (a CLI sketch follows this list).
b. The config file can be generated in the InfluxDB UI or maintained by hand; platform-generated configs are served over an HTTP endpoint so agents can download them remotely.
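For reference, the org, bucket and token can also be created from the influx CLI inside the container instead of the UI. A minimal sketch, assuming the container name influxdb from the compose file above and an org/bucket both named test (the username/password here are illustrative; adjust to your own values):

# one-off initial setup (equivalent to the first-login wizard)
docker exec influxdb influx setup \
  --username root --password rootroot \
  --org test --bucket test --force

# create a token that can read and write all buckets in the org;
# the token string printed here goes into telegraf.conf below
docker exec influxdb influx auth create \
  --org test --read-buckets --write-buckets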
Using an nginx access log as the example, the configuration is as follows.
telegraf.conf
# Configuration for telegraf agent
# all [agent] settings below are Telegraf defaults
[agent]
  ## Default data collection interval for all inputs
  interval = "10s"
  ## Rounds collection interval to 'interval'
  ## ie, if interval="10s" then always collect on :00, :10, :20, etc.
  round_interval = true

  ## Telegraf will send metrics to outputs in batches of at most
  ## metric_batch_size metrics.
  ## This controls the size of writes that Telegraf sends to output plugins.
  metric_batch_size = 1000

  ## Maximum number of unwritten metrics per output. Increasing this value
  ## allows for longer periods of output downtime without dropping metrics at the
  ## cost of higher maximum memory usage.
  metric_buffer_limit = 10000

  ## Collection jitter is used to jitter the collection by a random amount.
  ## Each plugin will sleep for a random time within jitter before collecting.
  ## This can be used to avoid many plugins querying things like sysfs at the
  ## same time, which can have a measurable effect on the system.
  collection_jitter = "0s"

  ## Default flushing interval for all outputs. Maximum flush_interval will be
  ## flush_interval + flush_jitter
  flush_interval = "10s"
  ## Jitter the flush interval by a random amount. This is primarily to avoid
  ## large write spikes for users running a large number of telegraf instances.
  ## ie, a jitter of 5s and interval 10s means flushes will happen every 10-15s
  flush_jitter = "0s"

  ## By default or when set to "0s", precision will be set to the same
  ## timestamp order as the collection interval, with the maximum being 1s.
  ##   ie, when interval = "10s", precision will be "1s"
  ##       when interval = "250ms", precision will be "1ms"
  ## Precision will NOT be used for service inputs. It is up to each individual
  ## service input to set the timestamp at the appropriate precision.
  ## Valid time units are "ns", "us" (or "µs"), "ms", "s".
  precision = ""

  ## Log at debug level.
  # debug = false
  ## Log only error level messages.
  # quiet = false

  ## Log target controls the destination for logs and can be one of "file",
  ## "stderr" or, on Windows, "eventlog". When set to "file", the output file
  ## is determined by the "logfile" setting.
  # logtarget = "file"

  ## Name of the file to be logged to when using the "file" logtarget. If set to
  ## the empty string then logs are written to stderr.
  # logfile = ""

  ## The logfile will be rotated after the time interval specified. When set
  ## to 0 no time based rotation is performed. Logs are rotated only when
  ## written to, if there is no log activity rotation may be delayed.
  # logfile_rotation_interval = "0d"

  ## The logfile will be rotated when it becomes larger than the specified
  ## size. When set to 0 no size based rotation is performed.
  # logfile_rotation_max_size = "0MB"

  ## Maximum number of rotated archives to keep, any older logs are deleted.
  ## If set to -1, no archives are removed.
  # logfile_rotation_max_archives = 5

  ## Pick a timezone to use when logging or type 'local' for local time.
  ## Example: America/Chicago
  # log_with_timezone = ""

  ## Override default hostname, if empty use os.Hostname()
  hostname = ""
  ## If set to true, do not set the "host" tag in the telegraf agent.
  omit_hostname = false

# influxdb_v2 output plugin: the section below must be edited
# database address: urls; org: organization; bucket: bucket; auth token: token
[[outputs.influxdb_v2]]
  ## The URLs of the InfluxDB cluster nodes.
  ##
  ## Multiple URLs can be specified for a single cluster, only ONE of the
  ## urls will be written to each interval.
  ##   ex: urls = ["https://us-west-2-1.aws.cloud2.influxdata.com"]
  urls = ["http://localhost:8086"]

  ## Token for authentication.
  token = "the token created in the previous step"

  ## Organization is the name of the organization you wish to write to; must exist.
  organization = "test"   # the org created earlier

  ## Destination bucket to write into.
  bucket = "test"         # the bucket created earlier

  ## The value of this tag will be used to determine the bucket. If this
  ## tag is not set the 'bucket' option is used as the default.
  # bucket_tag = ""

  ## If true, the bucket tag will not be added to the metric.
  # exclude_bucket_tag = false

  ## Timeout for HTTP messages.
  # timeout = "5s"

  ## Additional HTTP headers
  # http_headers = {"X-Special-Header" = "Special-Value"}

  ## HTTP Proxy override, if unset values the standard proxy environment
  ## variables are consulted to determine which proxy, if any, should be used.
  # http_proxy = "http://corporate.proxy:3128"

  ## HTTP User-Agent
  # user_agent = "telegraf"

  ## Content-Encoding for write request body, can be set to "gzip" to
  ## compress body or "identity" to apply no encoding.
  # content_encoding = "gzip"

  ## Enable or disable uint support for writing uints influxdb 2.0.
  # influx_uint_support = false

  ## Optional TLS Config for use on HTTP connections.
  # tls_ca = "/etc/telegraf/ca.pem"
  # tls_cert = "/etc/telegraf/cert.pem"
  # tls_key = "/etc/telegraf/key.pem"
  ## Use TLS but skip chain & host verification
  # insecure_skip_verify = false

# Parse the new lines appended to a file
# tail input plugin, here tailing the nginx access log; settings to fill in:
#   files          - location of the log file(s) to tail
#   grok_patterns  - grok expression that extracts the monitored fields from each
#                    nginx log line (grok syntax itself is not covered here)
#   name_override  - measurement name under which the nginx data is stored
[[inputs.tail]]
  ## File names or a pattern to tail.
  ## These accept standard unix glob matching rules, but with the addition of
  ## ** as a "super asterisk". ie:
  ##   "/var/log/**.log"     -> recursively find all .log files in /var/log
  ##   "/var/log/*/*.log"    -> find all .log files with a parent dir in /var/log
  ##   "/var/log/apache.log" -> just tail the apache log file
  ##   "/var/log/log[!1-2]*  -> tail files without 1-2
  ##   "/var/log/log[^1-2]*  -> identical behavior as above
  ## See https://github.com/gobwas/glob for more examples
  ##
  files = ["/logs/nginx/access_main.log"]

  ## Read file from beginning.
  # from_beginning = false

  ## Whether file is a named pipe
  # pipe = false

  ## Method used to watch for file updates. Can be either "inotify" or "poll".
  # watch_method = "inotify"

  ## Maximum lines of the file to process that have not yet been written by the
  ## output. For best throughput set based on the number of metrics on each
  ## line and the size of the output's metric_batch_size.
  # max_undelivered_lines = 1000

  ## Character encoding to use when interpreting the file contents. Invalid
  ## characters are replaced using the unicode replacement character. When set
  ## to the empty string the data is not decoded to text.
  ##   ex: character_encoding = "utf-8"
  ##       character_encoding = "utf-16le"
  ##       character_encoding = "utf-16be"
  ##       character_encoding = ""
  # character_encoding = ""

  ## Data format to consume.
  ## Each data format has its own unique set of configuration options, read
  ## more about them here:
  ## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md
  grok_patterns = ["%{NGINX_ACCESS_LOG}"]
  name_override = "nginx_access_log"
  # grok_custom_pattern_files = []
  # grok_custom_patterns = '''
  # NGINX_ACCESS_LOG %{IP:remote_addr} - (-|%{WORD:remote_user}) [%{HTTPDATE:time_local}] %{BASE10NUM:request_time:float} (-|%{BASE10NUM:upstream_response_time:float}) %{IPORHOST:host} %{QS:request} %{NUMBER:status:int} %{NUMBER:body_bytes_sent:int} %{QS:referrer} %{QS:agent} %{IPORHOST:xforwardedfor}
  # '''
  grok_custom_patterns = '''
NGINX_ACCESS_LOG %{IP:remote_addr} - (-|%{WORD:remote_user:drop}) \[%{HTTPDATE:ts:ts}\] %{BASE10NUM:request_time:float} %{BASE10NUM:upstream_response_time:float} %{IPORHOST:host:tag} "(?:%{WORD:verb:drop} %{NOTSPACE:request:tag}(?: HTTP/%{NUMBER:http_version:drop})?|%{DATA:rawrequest})" %{NUMBER:status:tag} (?:%{NUMBER:resp_bytes}|-) %{QS:referrer:drop} %{QS:agent:drop} %{QS:xforwardedfor:drop}
'''
  grok_timezone = "Local"
  data_format = "grok"

  ## Set the tag that will contain the path of the tailed file. If you don't want this tag, set it to an empty string.
  # path_tag = "path"

  ## multiline parser/codec
  ## https://www.elastic.co/guide/en/logstash/2.4/plugins-filters-multiline.html
  # [inputs.tail.multiline]
    ## The pattern should be a regexp which matches what you believe to be an
    ## indicator that the field is part of an event consisting of multiple lines of log data.
    # pattern = "^\s"

    ## This field must be either "previous" or "next".
    ## If a line matches the pattern, "previous" indicates that it belongs to the previous line,
    ## whereas "next" indicates that the line belongs to the next one.
    # match_which_line = "previous"

    ## The invert_match field can be true or false (defaults to false).
    ## If true, a message not matching the pattern will constitute a match of the multiline
    ## filter and the what will be applied. (vice-versa is also true)
    # invert_match = false

    ## After the specified timeout, this plugin sends a multiline event even if no new pattern
    ## is found to start a new event. The default timeout is 5s.
    # timeout = 5s
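Before starting the agent it is worth confirming that the token and org configured in [[outputs.influxdb_v2]] are accepted by InfluxDB. A sketch using the standard InfluxDB v2 HTTP API; <your-token> is a placeholder for the token created earlier and test matches the example org name:

# list buckets visible to the token; a 200 response with JSON means the
# token and org are valid, 401/404 means they are not
curl -s -H "Authorization: Token <your-token>" \
  "http://localhost:8086/api/v2/buckets?org=test"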
Grok expression example
# nginx log_format
'$remote_addr - $remote_user [$time_local] $request_time $upstream_response_time $host "$request" '
'$status $body_bytes_sent "$http_referer" '
'"$http_user_agent" "$http_x_forwarded_for"'

# grok pattern
NGINX_ACCESS_LOG %{IP:remote_addr} - (-|%{WORD:remote_user:drop}) \[%{HTTPDATE:ts:ts}\] %{BASE10NUM:request_time:float} %{BASE10NUM:upstream_response_time:float} %{IPORHOST:host:tag} "(?:%{WORD:verb:drop} %{NOTSPACE:request:tag}(?: HTTP/%{NUMBER:http_version:drop})?|%{DATA:rawrequest})" %{NUMBER:status:tag} (?:%{NUMBER:resp_bytes}|-) %{QS:referrer:drop} %{QS:agent:drop} %{QS:xforwardedfor:drop}

# sample nginx log line
1.1.1.2 - - [30/Jan/2023:02:27:24 +0000] 0.075 0.075 xxx.xxx.xxx "POST /api/xxx/xxx/xxx HTTP/1.1" 200 69 "https://xxx.xxx.xxx/" "Mozilla/5.0 (iPhone; CPU iPhone OS 16_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148" "1.1.1.1"

# grok parses the line into the capture groups below; the named captures are then assembled into an InfluxDB line-protocol record
{"NGINX_ACCESS_LOG": [["1.1.1.2 - - [30/Jan/2023:02:27:24 +0000] 0.075 0.075 prod.webcomicsapp.com "POST /api/xxx/xxx/xxx HTTP/1.1" 200 69 "https://xxx.xxx.xxx/" "Mozilla/5.0 (iPhone; CPU iPhone OS 16_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148" "1.46.138.190""]],"remote_addr": [["1.1.1.2"]],"IPV6": [[null,null]],"IPV4": [["1.1.1.2",null]],"remote_user": [[null]],"ts": [["30/Jan/2023:02:27:24 +0000"]],"MONTHDAY": [["30"]],"MONTH": [["Jan"]],"YEAR": [["2023"]],"TIME": [["02:27:24"]],"HOUR": [["02"]],"MINUTE": [["27"]],"SECOND": [["24"]],"INT": [["+0000"]],"request_time": [["0.075"]],"upstream_response_time": [["0.075"]],"host": [["xxx.xxx.xxx"]],"HOSTNAME": [["xxx.xxx.xxx"]],"IP": [[null]],"verb": [["POST"]],"request": [["/api/xxx/xxx/xxx"]],"http_version": [["1.1"]],"BASE10NUM": [["1.1","200","69"]],"rawrequest": [[null]],"status": [["200"]],"resp_bytes": [["69"]],"referrer": [[""https://xxx.xxx.xxx/""]],"QUOTEDSTRING": [[""https://xxx.xxx.xxx/"",""Mozilla/5.0 (iPhone; CPU iPhone OS 16_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148"",""1.1.1.1""]],"agent": [[""Mozilla/5.0 (iPhone; CPU iPhone OS 16_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148""]],"xforwardedfor": [[""1.1.1.1""]]
}
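For illustration, the record Telegraf would write for this line looks roughly like the line protocol below. This is a sketch under the grok parser's documented conventions: captures marked :tag (host, request, status) become tags, :float captures become float fields, untyped captures such as resp_bytes default to string fields, :drop captures are discarded, the ts capture becomes the timestamp, and the tail plugin adds a path tag by default.

nginx_access_log,host=xxx.xxx.xxx,path=/logs/nginx/access_main.log,request=/api/xxx/xxx/xxx,status=200 request_time=0.075,upstream_response_time=0.075,resp_bytes="69" 1675045644000000000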
Starting Telegraf
# test run in the foreground with debug output (quit with Ctrl+C)
telegraf --config telegraf.conf --debug
# run as a background process
nohup telegraf --config telegraf.conf >/dev/null 2>&1 &
# stop the background process
ps aux | grep telegraf
kill -9 <telegraf pid>
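Since the yum package ships a systemd unit, a more robust alternative to nohup on a systemd host is to run Telegraf as a service. A sketch, assuming the config is copied to the package's default location /etc/telegraf/telegraf.conf:

# use the packaged systemd unit instead of nohup
sudo cp telegraf.conf /etc/telegraf/telegraf.conf
sudo systemctl enable --now telegraf

# follow the agent's logs
journalctl -u telegraf -f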
Configure Grafana
a. Log in to Grafana at http://localhost:3000/login with admin / admin; a password change is required on first login.
b. Add the InfluxDB data source (a provisioning sketch follows).
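The data source can be added in the UI (Configuration -> Data sources -> InfluxDB, query language Flux), or declaratively via Grafana's datasource provisioning. A minimal sketch, assuming the compose file additionally mounts ./docker/grafana/provisioning:/etc/grafana/provisioning and using the test org/bucket and token from earlier (names are illustrative):

# docker/grafana/provisioning/datasources/influxdb.yaml
apiVersion: 1
datasources:
  - name: InfluxDB
    type: influxdb
    access: proxy
    # the containers share the "monitor" network, so the service name resolves
    url: http://influxdb:8086
    jsonData:
      version: Flux
      organization: test
      defaultBucket: test
    secureJsonData:
      token: <your-token>

A panel can then query the nginx measurement with a Flux expression such as: from(bucket: "test") |> range(start: v.timeRangeStart, stop: v.timeRangeStop) |> filter(fn: (r) => r._measurement == "nginx_access_log").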
With that, the entire TIG flow is up and running.