本文是Logstash 7.0.0的同步任务的详细配置说明。
主要是自定义mapping以及多个数据源和目标的配置,我就直接放同步任务配置文件的内容了。
需要注意的是:
- 严禁在filter中过滤掉type字段(该字段由input中jdbc的type参数设置,并非Logstash自动生成),否则output无法按type路由事件,会导致无法正常同步。
- output中的type值需要与input中jdbc的type值相对应。
同步任务配置文件示例
input {
  # stdin keeps a console attached to the pipeline; remove for a pure scheduled sync.
  stdin {}

  jdbc {
    # Tags events from this source; must match the [type] test in the output section.
    type => "audit_law"
    # Database connection URL
    jdbc_connection_string => "jdbc:mysql://192.168.0.196:3306/auditbase?characterEncoding=UTF-8&autoReconnect=true&useSSL=false"
    # Database credentials
    jdbc_user => "aaaa"
    jdbc_password => "123456"
    # Path to the MySQL JDBC driver jar
    jdbc_driver_library => "lib/jars/mysql-connector-java-5.1.41-bin.jar"
    # Driver class name for MySQL
    jdbc_driver_class => "com.mysql.jdbc.Driver"
    # Avoid mojibake for Chinese text
    codec => plain {charset => "UTF-8"}
    # Number of reconnect attempts
    connection_retry_attempts => "3"
    # Validate the connection before use (default: false)
    jdbc_validate_connection => "true"
    # Connection validation timeout in seconds (default: 3600)
    jdbc_validation_timeout => "3600"
    # Enable paged queries (default: false)
    jdbc_paging_enabled => "true"
    # Rows per page (default: 100000; lower it when rows are wide or updates are frequent)
    jdbc_page_size => "5000"
    # :sql_last_value is the built-in placeholder holding the last saved
    # tracking_column value (PUBLISH_TIME here). For complex SQL prefer
    # statement_filepath over an inline statement:
    # statement_filepath => "mysql/jdbc.sql"
    statement => "SELECT a.ID,'01' AS TYPE,a.NAME,a.DOCUMENT_NO,a.INDUSTRY_NAME,a.REGION_NAME,b.CONTENT,a.PUBLISH_TIME FROM audit_law a LEFT JOIN tab_file_content b ON a.CONTENT_ID = b.ID WHERE a.STATE_CODE='03' AND PUBLISH_TIME > :sql_last_value ORDER BY PUBLISH_TIME ASC"
    # Keep column names as-is (default true lowercases them; keep false when
    # serialization elsewhere depends on the exact casing)
    lowercase_column_names => false
    # SQL log level: fatal, error, warn, info, debug (default: info)
    sql_log_level => warn
    # Persist the last tracking_column value to last_run_metadata_path
    record_last_run => true
    # Track a query column instead of the run timestamp
    use_column_value => true
    # Column driving incremental sync; must exist in the result set
    tracking_column => "PUBLISH_TIME"
    # tracking_column type: numeric (default) or timestamp
    tracking_column_type => timestamp
    # Where the last tracked value is stored between runs
    last_run_metadata_path => "mysql/audit_law_last_id.txt"
    # Must stay false for incremental sync (true wipes last_run_metadata_path)
    clean_run => false
    # Cron-style schedule: minute hour day-of-month month day-of-week
    # (default: run every minute)
    schedule => "* * * * *"
  }

  jdbc {
    # Second source: same database, different query/index.
    type => "audit_basis"
    jdbc_connection_string => "jdbc:mysql://192.168.0.196:3306/auditbase?characterEncoding=UTF-8&autoReconnect=true&useSSL=false"
    jdbc_user => "aaaa"
    jdbc_password => "123456"
    jdbc_driver_library => "lib/jars/mysql-connector-java-5.1.41-bin.jar"
    jdbc_driver_class => "com.mysql.jdbc.Driver"
    codec => plain {charset => "UTF-8"}
    connection_retry_attempts => "3"
    jdbc_validate_connection => "true"
    jdbc_validation_timeout => "3600"
    jdbc_paging_enabled => "true"
    jdbc_page_size => "5000"
    # Aggregates detail rows per audit_basis record, then filters incrementally
    # on the newest detail PUBLISH_TIME via :sql_last_value.
    statement => "SELECT * FROM (SELECT a.ID,'02' as TYPE,a.QUESTION_NAME AS NAME,a.AUDIT_ITEM_NAME,b.AUDIT_LAW_NAME,GROUP_CONCAT(b.LAW_ITEM_TEXT SEPARATOR '\n') AS LAW_TEXT,MAX(b.PUBLISH_TIME) AS PUBLISH_TIME FROM audit_basis a LEFT JOIN audit_basis_detail b ON a.ID = b.AUDIT_BASIS_ID WHERE b.STATE_CODE = '02' GROUP BY a.ID) c WHERE PUBLISH_TIME > :sql_last_value ORDER BY PUBLISH_TIME ASC"
    lowercase_column_names => false
    sql_log_level => warn
    record_last_run => true
    use_column_value => true
    tracking_column => "PUBLISH_TIME"
    tracking_column_type => timestamp
    # Separate state file per input so the two sources track independently
    last_run_metadata_path => "mysql/audit_basis_last_id.txt"
    clean_run => false
    # Cron-style schedule: minute hour day-of-month month day-of-week
    schedule => "* * * * *"
  }
}

filter {
  # Only stdin events carry a "message" field; the guard keeps jdbc events from
  # being tagged _jsonparsefailure by a json filter they can never satisfy.
  if [message] {
    json {
      source => "message"
      remove_field => ["message"]
    }
  }
  # Drop the auto-added @timestamp/@version fields.
  # Never remove "type" here — the output routing below depends on it.
  mutate {
    remove_field => ["@timestamp", "@version"]
  }
}

output {
  # [type] must match the type set in the corresponding jdbc input.
  if [type] == "audit_law" {
    elasticsearch {
      # Elasticsearch cluster address
      hosts => ["192.168.0.62:9200"]
      # Index name must be lowercase
      index => "audit_law"
      # Use the database key as document id so re-syncs are idempotent
      document_id => "%{ID}"
      # Distinct template_name per output: without it both outputs upload under
      # the default name "logstash" and the last writer silently overwrites the
      # other's mapping.
      template_name => "audit_law"
      template_overwrite => true
      template => "mysql/audit_law_mapping.json"
    }
  }
  if [type] == "audit_basis" {
    elasticsearch {
      hosts => ["192.168.0.62:9200"]
      index => "audit_basis"
      document_id => "%{ID}"
      template_name => "audit_basis"
      template_overwrite => true
      template => "mysql/audit_basis_mapping.json"
    }
  }
  # Echo synced events to the console for debugging
  stdout { codec => json_lines }
}
|
自定义mapping文件示例
{
  "index_patterns": ["audit_law"],
  "settings": {
    "number_of_shards": 1,
    "number_of_replicas": 0,
    "analysis": {
      "analyzer": {
        "ik_html": {
          "type": "custom",
          "char_filter": ["html_strip"],
          "tokenizer": "ik_max_word"
        }
      }
    }
  },
  "mappings": {
    "properties": {
      "ID": { "type": "keyword" },
      "TYPE": { "type": "keyword" },
      "NAME": { "type": "text", "analyzer": "ik_max_word", "search_analyzer": "ik_smart" },
      "DOCUMENT_NO": { "type": "keyword" },
      "INDUSTRY_NAME": { "type": "text", "analyzer": "ik_max_word", "search_analyzer": "ik_smart" },
      "REGION_NAME": { "type": "text", "analyzer": "ik_max_word", "search_analyzer": "ik_smart" },
      "CONTENT": { "type": "text", "analyzer": "ik_html", "search_analyzer": "ik_smart" },
      "PUBLISH_TIME": { "type": "date" }
    }
  }
}
|