Logstash 7.0.0的简单使用操作

Author Avatar
山小杰 10月 01, 2019
  • 在其它设备中阅读本文章

本文是Logstash 7.0.0的同步任务的详细配置说明。
主要是自定义mapping以及多个数据源和目标的配置,我就直接放同步任务配置文件的内容了。
需要注意的是:

  • 严禁在filter中过滤掉type字段(该字段由各jdbc input显式设置),否则output无法按type路由事件,导致无法正常同步。
  • output中用于条件判断的type值需要与对应input中jdbc的type值完全一致。

同步任务配置文件示例

input {
stdin {}
jdbc {
# Event tag used by the output section to route this source to its index.
type => "audit_law"
# JDBC connection URL.
jdbc_connection_string => "jdbc:mysql://192.168.0.196:3306/auditbase?characterEncoding=UTF-8&autoReconnect=true&useSSL=false"
# Database credentials.
# NOTE(review): plaintext credentials in a committed config — consider
# environment variables or the Logstash keystore.
jdbc_user => "aaaa"
jdbc_password => "123456"
# Path to the MySQL JDBC driver jar.
jdbc_driver_library => "lib/jars/mysql-connector-java-5.1.41-bin.jar"
# Driver class name for MySQL.
jdbc_driver_class => "com.mysql.jdbc.Driver"
# Force UTF-8 to avoid garbled Chinese text.
codec => plain {charset => "UTF-8"}
# Number of reconnect attempts on connection failure.
connection_retry_attempts => "3"
# Validate the connection before use (default: false).
jdbc_validate_connection => "true"
# Connection validation timeout in seconds (default: 3600).
jdbc_validation_timeout => "3600"
# Enable paged queries (default: false).
jdbc_paging_enabled => "true"
# Rows per page (default: 100000; lower it for wide rows or frequent updates).
jdbc_page_size => "5000"
# `statement` holds the sync query; for complex SQL prefer an external file
# via statement_filepath. `:sql_last_value` is a built-in variable holding
# the tracking_column value of the last synced row — here, PUBLISH_TIME.
# statement_filepath => "mysql/jdbc.sql"
statement => "SELECT a.ID,'01' AS TYPE,a.NAME,a.DOCUMENT_NO,a.INDUSTRY_NAME,a.REGION_NAME,b.CONTENT,a.PUBLISH_TIME FROM audit_law a LEFT JOIN tab_file_content b ON a.CONTENT_ID = b.ID WHERE a.STATE_CODE='03' AND PUBLISH_TIME > :sql_last_value ORDER BY PUBLISH_TIME ASC"
# Keep column names as-is instead of lowercasing (default: true).
lowercase_column_names => false
# SQL log level; one of: fatal, error, warn, info, debug (default: info).
sql_log_level => warn
# Persist the last tracking_column value to last_run_metadata_path.
record_last_run => true
# Track a result column instead of the last run timestamp.
use_column_value => true
# Column used for incremental sync; must exist in the result set.
# NOTE(review): with tracking_column_type => timestamp, consider setting
# jdbc_default_timezone if the DB does not store UTC — confirm.
tracking_column => "PUBLISH_TIME"
# Tracking column type: numeric or timestamp (default: numeric).
tracking_column_type => timestamp
# Where the last tracked value is stored between runs.
last_run_metadata_path => "mysql/audit_law_last_id.txt"
# Must stay false for incremental sync; true would reset the stored value.
clean_run => false
# Cron-style schedule (min hour day month weekday); here: every minute.
schedule => "* * * * *"
}
jdbc {
# Event tag used by the output section to route this source to its index.
type => "audit_basis"
# JDBC connection URL.
jdbc_connection_string => "jdbc:mysql://192.168.0.196:3306/auditbase?characterEncoding=UTF-8&autoReconnect=true&useSSL=false"
# Database credentials (see security note on the first jdbc input).
jdbc_user => "aaaa"
jdbc_password => "123456"
# Path to the MySQL JDBC driver jar.
jdbc_driver_library => "lib/jars/mysql-connector-java-5.1.41-bin.jar"
# Driver class name for MySQL.
jdbc_driver_class => "com.mysql.jdbc.Driver"
# Force UTF-8 to avoid garbled Chinese text.
codec => plain {charset => "UTF-8"}
# Number of reconnect attempts on connection failure.
connection_retry_attempts => "3"
# Validate the connection before use (default: false).
jdbc_validate_connection => "true"
# Connection validation timeout in seconds (default: 3600).
jdbc_validation_timeout => "3600"
# Enable paged queries (default: false).
jdbc_paging_enabled => "true"
# Rows per page (default: 100000; lower it for wide rows or frequent updates).
jdbc_page_size => "5000"
# Sync query; `:sql_last_value` holds the last synced PUBLISH_TIME.
# NOTE(review): the WHERE on b.STATE_CODE turns the LEFT JOIN into an
# effective INNER JOIN — confirm that is intended.
# statement_filepath => "mysql/jdbc.sql"
statement => "SELECT * FROM (SELECT a.ID,'02' as TYPE,a.QUESTION_NAME AS NAME,a.AUDIT_ITEM_NAME,b.AUDIT_LAW_NAME,GROUP_CONCAT(b.LAW_ITEM_TEXT SEPARATOR '\n') AS LAW_TEXT,MAX(b.PUBLISH_TIME) AS PUBLISH_TIME FROM audit_basis a LEFT JOIN audit_basis_detail b ON a.ID = b.AUDIT_BASIS_ID WHERE b.STATE_CODE = '02' GROUP BY a.ID) c WHERE PUBLISH_TIME > :sql_last_value ORDER BY PUBLISH_TIME ASC"
# Keep column names as-is instead of lowercasing (default: true).
lowercase_column_names => false
# SQL log level; one of: fatal, error, warn, info, debug (default: info).
sql_log_level => warn
# Persist the last tracking_column value to last_run_metadata_path.
record_last_run => true
# Track a result column instead of the last run timestamp.
use_column_value => true
# Column used for incremental sync; must exist in the result set.
tracking_column => "PUBLISH_TIME"
# Tracking column type: numeric or timestamp (default: numeric).
tracking_column_type => timestamp
# Where the last tracked value is stored between runs.
last_run_metadata_path => "mysql/audit_basis_last_id.txt"
# Must stay false for incremental sync; true would reset the stored value.
clean_run => false
# Cron-style schedule (min hour day month weekday); here: every minute.
schedule => "* * * * *"
}
}
filter {
# Parse events whose `message` field contains JSON (only stdin events have
# one; jdbc events pass through unchanged), then drop the raw field.
json {
source => "message"
remove_field => ["message"]
}
# Drop the automatically added "@timestamp" and "@version" fields.
# Do NOT remove the "type" field — the output section routes on it, and
# removing it would break the sync.
mutate {
remove_field => ["@timestamp", "@version"]
}
}
output {
# Route each event to its index; [type] must match the `type` set in the
# corresponding jdbc input.
if [type] == "audit_law" {
elasticsearch {
# ES cluster address(es).
hosts => ["192.168.0.62:9200"]
# Index name (must be lowercase).
index => "audit_law"
# Use the database primary key as document id so re-synced rows update
# in place instead of duplicating.
document_id => "%{ID}"
# Each output must use a distinct template_name: without it, both
# outputs upload their template under the default name "logstash" and
# overwrite each other, so the custom mappings are lost.
template_name => "audit_law"
template_overwrite => true
template => "mysql/audit_law_mapping.json"
}
} else if [type] == "audit_basis" {
elasticsearch {
# ES cluster address(es).
hosts => ["192.168.0.62:9200"]
# Index name (must be lowercase).
index => "audit_basis"
# Use the database primary key as document id.
document_id => "%{ID}"
# Distinct template name — see note on the audit_law output above.
template_name => "audit_basis"
template_overwrite => true
template => "mysql/audit_basis_mapping.json"
}
}
# Echo every event to the console for debugging.
stdout {
codec => json_lines
}
}

自定义mapping文件示例

{
  "index_patterns": ["audit_law"],
  "settings": {
    "number_of_shards": 1,
    "number_of_replicas": 0,
    "analysis": {
      "analyzer": {
        "ik_html": {
          "type": "custom",
          "char_filter": ["html_strip"],
          "tokenizer": "ik_max_word"
        }
      }
    }
  },
  "mappings": {
    "properties": {
      "ID": {
        "type": "keyword"
      },
      "TYPE": {
        "type": "keyword"
      },
      "NAME": {
        "type": "text",
        "analyzer": "ik_max_word",
        "search_analyzer": "ik_smart"
      },
      "DOCUMENT_NO": {
        "type": "keyword"
      },
      "INDUSTRY_NAME": {
        "type": "text",
        "analyzer": "ik_max_word",
        "search_analyzer": "ik_smart"
      },
      "REGION_NAME": {
        "type": "text",
        "analyzer": "ik_max_word",
        "search_analyzer": "ik_smart"
      },
      "CONTENT": {
        "type": "text",
        "analyzer": "ik_html",
        "search_analyzer": "ik_smart"
      },
      "PUBLISH_TIME": {
        "type": "date"
      }
    }
  }
}