建模
注意 tags 的类型是 text,并且是按空格分词的。text 字段默认不能聚合,但是这里给它设置了 `fielddata: true` 属性,因此可以进行聚合。警告:fielddata 和 doc values 不同,fielddata 是加载在 JVM 堆内存中的,所以需要注意 OOM。tips:经过 analyzer 分词的 text 字段是没有 doc values 的,所以默认不能聚合。
# Create the "shop" index with one primary shard and one replica.
# - name: indexed with ik_max_word (maximal segmentation), searched with ik_smart.
# - tags: text field tokenized on whitespace; fielddata is enabled so the field
#   can be aggregated in memory (fielddata lives on the JVM heap, watch for OOM).
# The notes live here instead of after the values so the request body stays valid JSON.
PUT /shop
{
  "settings": {
    "number_of_replicas": 1,
    "number_of_shards": 1
  },
  "mappings": {
    "properties": {
      "id": {
        "type": "integer"
      },
      "name": {
        "type": "text",
        "analyzer": "ik_max_word",
        "search_analyzer": "ik_smart"
      },
      "tags": {
        "type": "text",
        "analyzer": "whitespace",
        "fielddata": true
      },
      "location": {
        "type": "geo_point"
      },
      "remark_score": {
        "type": "double"
      },
      "price_per_man": {
        "type": "integer"
      },
      "category_id": {
        "type": "integer"
      },
      "category_name": {
        "type": "keyword"
      },
      "seller_id": {
        "type": "integer"
      },
      "seller_remark_score": {
        "type": "double"
      },
      "seller_disabled_flag": {
        "type": "integer"
      }
    }
  }
}
搜索模型
# Search model 1: the hotel name is the recall strategy (match on "name"),
# and hits are ordered by ascending geo distance from the given point.
# A "distance" script field is computed with haversin() from the same point.
GET shop/_search
{
  "_source": "*",
  "query": {
    "match": { "name": "凯悦" }
  },
  "script_fields": {
    "distance": {
      "script": {
        "lang": "expression",
        "source": "haversin(lat,lon,doc['location'].lat,doc['location'].lon)",
        "params": { "lat": 31.49345, "lon": 121.65671 }
      }
    }
  },
  "sort": [
    {
      "_geo_distance": {
        "location": { "lat": 31.49345, "lon": 121.65671 },
        "order": "asc",
        "unit": "km",
        "distance_type": "arc"
      }
    }
  ]
}
自定义评分
# Use function_score with a Gaussian decay function to solve score-based ranking.
# The text must match 凯悦, but its score carries little weight (boost 0.1).
# For an LBS use case, distance is the main factor (gauss on location, weight 9);
# remark_score and seller_remark_score are added with weights 0.2 and 0.1.
GET shop/_search
{
  "_source": "*",
  "query": {
    "function_score": {
      "query": {
        "bool": {
          "must": [
            { "match": { "name": { "query": "凯悦", "boost": 0.1 } } },
            { "term": { "seller_disabled_flag": { "value": "0" } } }
          ]
        }
      },
      "functions": [
        {
          "weight": 9,
          "gauss": {
            "location": {
              "origin": "31.49345,121.65671",
              "scale": "100km",
              "offset": "0km",
              "decay": 0.5
            }
          }
        },
        {
          "weight": 0.2,
          "field_value_factor": { "field": "remark_score" }
        },
        {
          "weight": 0.1,
          "field_value_factor": { "field": "seller_remark_score" }
        }
      ],
      "score_mode": "sum",
      "boost_mode": "sum"
    }
  },
  "script_fields": {
    "distance": {
      "script": {
        "lang": "expression",
        "source": "haversin(lat,lon,doc['location'].lat,doc['location'].lon)",
        "params": { "lat": 31.49345, "lon": 121.65671 }
      }
    }
  },
  "sort": [
    { "_score": { "order": "desc" } }
  ]
}
价格搜索模型
按价格排序:关键字召回不参与打分,只要包含关键字即可,最终按价格进行排序。做法是设置 `"boost_mode": "replace"`,用函数得分(价格)替换查询得分。
# Sort by price: the keyword match only filters hits; boost_mode "replace"
# makes the score equal to price_per_man, and hits are sorted by _score ascending.
GET shop/_search
{
  "_source": "*",
  "query": {
    "function_score": {
      "query": {
        "bool": {
          "must": [
            { "match": { "name": { "query": "凯悦", "boost": 0.1 } } },
            { "term": { "seller_disabled_flag": { "value": "0" } } }
          ]
        }
      },
      "functions": [
        {
          "weight": 1,
          "field_value_factor": { "field": "price_per_man" }
        }
      ],
      "score_mode": "sum",
      "boost_mode": "replace"
    }
  },
  "script_fields": {
    "distance": {
      "script": {
        "lang": "expression",
        "source": "haversin(lat,lon,doc['location'].lat,doc['location'].lon)",
        "params": { "lat": 31.49345, "lon": 121.65671 }
      }
    }
  },
  "sort": [
    { "_score": { "order": "asc" } }
  ]
}
聚合
fielddata字段的聚合
# Price-ordered search plus a terms aggregation ("aggByTag") over the "tags"
# field, returning the top 10 tag buckets for the matched documents.
GET shop/_search
{
  "_source": "*",
  "query": {
    "function_score": {
      "query": {
        "bool": {
          "must": [
            { "match": { "name": { "query": "凯悦", "boost": 0.1 } } },
            { "term": { "seller_disabled_flag": { "value": "0" } } }
          ]
        }
      },
      "functions": [
        {
          "weight": 1,
          "field_value_factor": { "field": "price_per_man" }
        }
      ],
      "score_mode": "sum",
      "boost_mode": "replace"
    }
  },
  "script_fields": {
    "distance": {
      "script": {
        "lang": "expression",
        "source": "haversin(lat,lon,doc['location'].lat,doc['location'].lon)",
        "params": { "lat": 31.49345, "lon": 121.65671 }
      }
    }
  },
  "sort": [
    { "_score": { "order": "asc" } }
  ],
  "aggs": {
    "aggByTag": {
      "terms": { "field": "tags", "size": 10 }
    }
  }
}
词库扩展
例如我们可以用nginx来暴露出文件地址来实现远程词库扩展。
相关性扩展
例如我们搜索同义词住宿的时候,我们需要搜到酒店类别的数据
# Relevance expansion: a shop is recalled when its name matches 住宿 OR its
# category_id is 2 (presumably the hotel category; verify against the category table).
# Disabled sellers are excluded, the score is replaced by price_per_man, and
# the top 10 tags are aggregated.
GET shop/_search
{
  "_source": "*",
  "query": {
    "function_score": {
      "query": {
        "bool": {
          "must": [
            {
              "bool": {
                "should": [
                  { "match": { "name": { "query": "住宿", "boost": 0.1 } } },
                  { "term": { "category_id": { "value": "2", "boost": 0 } } }
                ]
              }
            },
            { "term": { "seller_disabled_flag": { "value": "0" } } }
          ]
        }
      },
      "functions": [
        {
          "weight": 1,
          "field_value_factor": { "field": "price_per_man" }
        }
      ],
      "score_mode": "sum",
      "boost_mode": "replace"
    }
  },
  "script_fields": {
    "distance": {
      "script": {
        "lang": "expression",
        "source": "haversin(lat,lon,doc['location'].lat,doc['location'].lon)",
        "params": { "lat": 31.49345, "lon": 121.65671 }
      }
    }
  },
  "sort": [
    { "_score": { "order": "asc" } }
  ],
  "aggs": {
    "aggByTag": {
      "terms": { "field": "tags", "size": 10 }
    }
  }
}
同义词门店索引结构
# Create the "shop" index with synonym-aware IK analyzers.
# - my_synonym_filter loads synonyms from analysis-ik/synonyms.txt.
# - ik_syno     = ik_smart tokenizer    + synonym filter (coarse-grained)
# - ik_syno_max = ik_max_word tokenizer + synonym filter (fine-grained)
# "name" is indexed with the fine-grained analyzer and searched with the
# coarse-grained one, matching the ik_max_word / ik_smart pairing used by the
# plain (non-synonym) shop mapping.
# NOTE(review): index creation does not overwrite; if a "shop" index already
# exists it has to be deleted before this request can succeed.
PUT /shop
{
  "settings": {
    "number_of_replicas": 1,
    "number_of_shards": 3,
    "analysis": {
      "filter": {
        "my_synonym_filter": {
          "type": "synonym",
          "synonyms_path": "analysis-ik/synonyms.txt"
        }
      },
      "analyzer": {
        "ik_syno": {
          "type": "custom",
          "tokenizer": "ik_smart",
          "filter": [
            "my_synonym_filter"
          ]
        },
        "ik_syno_max": {
          "type": "custom",
          "tokenizer": "ik_max_word",
          "filter": [
            "my_synonym_filter"
          ]
        }
      }
    }
  },
  "mappings": {
    "properties": {
      "id": {
        "type": "integer"
      },
      "name": {
        "type": "text",
        "analyzer": "ik_syno_max",
        "search_analyzer": "ik_syno"
      },
      "tags": {
        "type": "text",
        "analyzer": "whitespace",
        "fielddata": true
      },
      "location": {
        "type": "geo_point"
      },
      "remark_score": {
        "type": "double"
      },
      "price_per_man": {
        "type": "integer"
      },
      "category_id": {
        "type": "integer"
      },
      "category_name": {
        "type": "keyword"
      },
      "seller_id": {
        "type": "integer"
      },
      "seller_remark_score": {
        "type": "double"
      },
      "seller_disabled_flag": {
        "type": "integer"
      }
    }
  }
}
logstash全量复制
配置文件
# Full-load pipeline: read shop rows from MySQL over JDBC and index them into
# the Elasticsearch "shop" index, echoing each event to stdout once.
input {
  stdin {
  }
  jdbc {
    # Database address and schema; sets the character encoding, disables SSL
    # and enables automatic reconnect
    jdbc_connection_string => "jdbc:mysql://192.168.48.118:3306/dianpingdb?characterEncoding=UTF-8&useSSL=false&autoReconnect=true"
    # User name and password
    jdbc_user => "root"
    jdbc_password => "123456"
    # Location of the JDBC driver jar
    jdbc_driver_library => "/usr/local/logstash-7.1.1/logstash-core/lib/jars/mysql-connector-java-5.1.20.jar"
    # MySQL driver class
    jdbc_driver_class => "com.mysql.jdbc.Driver"
    jdbc_default_timezone => "Asia/Shanghai"
    # Fetch the result set in pages of 1000 rows
    jdbc_paging_enabled => "true"
    jdbc_page_size => "1000"
    # The query is read from this file
    statement_filepath => "/usr/local/logstash-7.1.1/config/sql/shop.sql"
    #statement => "select"
    #schedule => "* * * * *"
    clean_run => true
  }
}
output {
  elasticsearch {
    hosts => "192.168.48.94:9200"
    # Target index name
    index => "shop"
    # Use the row's id column as the document id
    document_id => "%{id}"
  }
  # A single stdout output; declaring it twice prints every event twice
  stdout {
    codec => json_lines
  }
}
sql文件
SELECT
  a.id,
  a.name,
  a.tags,
  CONCAT(a.latitude, ',', a.longitude) AS location,
  a.remark_score,
  a.price_per_man,
  a.category_id,
  b.name AS category_name,
  a.seller_id,
  c.remark_score AS seller_remark_score,
  c.disabled_flag AS seller_disabled_flag
FROM shop a
INNER JOIN category b ON a.category_id = b.id
INNER JOIN seller c ON c.id = a.seller_id
启动logstash
# Start Logstash in the background with the pipeline defined in config/shop.conf
./bin/logstash -f config/shop.conf &
canal 增量复制
修改MySQL的my.cnf配置文件,增加以下内容
# Server ID of this MySQL instance (the canal slaveId must differ from it)
server-id = 1
# Use row-based binary logging
binlog_format = ROW
# Enable the binary log with "mysql_bin" as the log file base name
log_bin = mysql_bin
重启MySQL,查看binlog是否开启
-- Check whether binary logging is enabled
SHOW VARIABLES LIKE 'log_bin'
MySQL创建canal用户
```sql
create user canal identified by 'canal';
grant select, show view, replication slave, replication client on *.* to 'canal'@'%';
flush privileges;
```
mysql8需要修改
```sql
select host,user,plugin from mysql.user;
ALTER USER 'canal'@'%' IDENTIFIED WITH mysql_native_password BY 'canal';
```
4. 安装canal-server,并修改配置文件
参考地址:[安装 canal-server](https://link.juejin.cn/?target=https%3A%2F%2Fblog.csdn.net%2Fzhenghongcs%2Farticle%2Fdetails%2F109476376)<br />需要下载 canal 的各个组件 canal-server、canal-adapter、canal-admin,下载地址:[github.com/alibaba/can…](https://link.juejin.cn/?target=https%3A%2F%2Fgithub.com%2Falibaba%2Fcanal%2Freleases)<br />首先安装canal-server,将下载好的压缩包 canal.deployer-1.1.5-SNAPSHOT.tar.gz 上传到Linux服务器,然后解压到指定目录/mydata/canal-server,可使用如下命令解压:
```bash
# Unpack the canal-server (deployer) package into /mydata/canal-server.
# The target directory is created first and handed to tar with -C, so the cd
# below does not depend on the archive containing a top-level directory.
mkdir -p /mydata/canal-server
tar -zxvf canal.deployer-1.1.5-SNAPSHOT.tar.gz -C /mydata/canal-server
# Then edit the configuration of the "example" instance
cd /mydata/canal-server/conf/example/
vi instance.properties
修改配置文件
# Slave ID canal uses towards MySQL: changed from 0 to 10; any value works as
# long as it differs from the MySQL server-id
canal.instance.mysql.slaveId=10
# MySQL address plus the account and password canal connects with
canal.instance.master.address=192.168.1.6:3306
canal.instance.dbUsername=canal
canal.instance.dbPassword=canal
# Filter rule for the watched tables: the regex below covers every table in
# the dianpingdb database (the same schema the Logstash JDBC URL points at)
canal.instance.filter.regex=dianpingdb\\..*
- 启动canal-server
可以参考这篇文章:https://blog.csdn.net/zhenghongcs/article/details/109476376
# 启动服务
sh bin/startup.sh
# Stop the service
sh bin/stop.sh
# Restart the service
sh bin/restart.sh