建模

注意:tags 的类型是 text,并且使用 whitespace 分词器按空格分词。text 字段默认不能聚合,但这里为它设置了 "fielddata": true 属性,因此可以进行聚合。 警告:fielddata 和 doc values 不同,fielddata 是加载在 JVM 堆内存中的,所以需要注意 OOM。 Tips:经过 analyzer 分词的 text 字段没有 doc values,所以默认不能聚合。

  1. PUT /shop
  2. {
  3. "settings": {
  4. "number_of_replicas": 1,
  5. "number_of_shards": 1
  6. },
  7. "mappings": {
  8. "properties": {
  9. "id": {
  10. "type": "integer"
  11. },
  12. "name": {
  13. "type": "text",
  14. "analyzer": "ik_max_word", //分词使用最大化分词
  15. "search_analyzer": "ik_smart" //搜索时使用智能(粗粒度)分词
  16. },
  17. "tags": {
  18. "type": "text",
  19. "analyzer": "whitespace", #注意是以空格分词
  20. "fielddata": true #可以在内存进行聚合
  21. },
  22. "location": {
  23. "type": "geo_point"
  24. },
  25. "remark_score": {
  26. "type": "double"
  27. },
  28. "price_per_man": {
  29. "type": "integer"
  30. },
  31. "category_id": {
  32. "type": "integer"
  33. },
  34. "category_name": {
  35. "type": "keyword"
  36. },
  37. "seller_id": {
  38. "type": "integer"
  39. },
  40. "seller_remark_score": {
  41. "type": "double"
  42. },
  43. "seller_disabled_flag": {
  44. "type": "integer"
  45. }
  46. }
  47. }
  48. }

搜索模型

  1. # 搜索模型1,酒店名称作为召回策略,按照距离排序
  2. GET shop/_search
  3. {
  4. "query": {
  5. "match": {
  6. "name": "凯悦"
  7. }
  8. },
  9. "_source": "*",
  10. "script_fields": {
  11. "distance": {
  12. "script": {
  13. "source": "haversin(lat,lon,doc['location'].lat,doc['location'].lon)",
  14. "lang": "expression",
  15. "params": {
  16. "lat": 31.49345,
  17. "lon": 121.65671
  18. }
  19. }
  20. }
  21. },
  22. "sort": [
  23. {
  24. "_geo_distance": {
  25. "location": {
  26. "lat": 31.49345,
  27. "lon": 121.65671
  28. },
  29. "order": "asc",
  30. "unit": "km",
  31. "distance_type": "arc"
  32. }
  33. }
  34. ]
  35. }

自定义评分

  1. # 使用function_score解决算分排序问题,高斯衰减函数
  2. # 必须满足文本凯悦,但是打分权重不高
  3. # 按照LBS的特性,距离是主要因素
  4. GET shop/_search
  5. {
  6. "_source": "*",
  7. "script_fields": {
  8. "distance": {
  9. "script": {
  10. "source": "haversin(lat,lon,doc['location'].lat,doc['location'].lon)",
  11. "lang": "expression",
  12. "params": {
  13. "lat": 31.49345,
  14. "lon": 121.65671
  15. }
  16. }
  17. }
  18. },
  19. "query": {
  20. "function_score": {
  21. "query": {
  22. "bool": {
  23. "must": [
  24. {
  25. "match": {
  26. "name":{
  27. "query": "凯悦",
  28. "boost": 0.1
  29. }
  30. }
  31. },
  32. {
  33. "term": {
  34. "seller_disabled_flag": {
  35. "value": "0"
  36. }
  37. }
  38. }
  39. ]
  40. }
  41. },
  42. "functions": [
  43. {
  44. "gauss": {
  45. "location": {
  46. "origin": "31.49345,121.65671",
  47. "scale": "100km",
  48. "offset": "0km",
  49. "decay": 0.5
  50. }
  51. },
  52. "weight": 9
  53. },
  54. {
  55. "field_value_factor": {
  56. "field": "remark_score"
  57. },
  58. "weight": 0.2
  59. },
  60. {
  61. "field_value_factor": {
  62. "field": "seller_remark_score"
  63. },
  64. "weight": 0.1
  65. }
  66. ],
  67. "score_mode": "sum",
  68. "boost_mode": "sum"
  69. }
  70. } ,
  71. "sort": [
  72. {
  73. "_score": {
  74. "order": "desc"
  75. }
  76. }
  77. ]
  78. }

价格搜索模型

按价格排序时,关键字只负责召回、不参与打分,只要包含关键字即可;通过 "boost_mode": "replace" 用 function_score 的函数得分(这里是 price_per_man)替换查询得分,再按 _score 升序排序。

  1. #按价格排序
  2. GET shop/_search
  3. {
  4. "_source": "*",
  5. "script_fields": {
  6. "distance": {
  7. "script": {
  8. "source": "haversin(lat,lon,doc['location'].lat,doc['location'].lon)",
  9. "lang": "expression",
  10. "params": {
  11. "lat": 31.49345,
  12. "lon": 121.65671
  13. }
  14. }
  15. }
  16. },
  17. "query": {
  18. "function_score": {
  19. "query": {
  20. "bool": {
  21. "must": [
  22. {
  23. "match": {
  24. "name": {
  25. "query": "凯悦",
  26. "boost": 0.1
  27. }
  28. }
  29. },
  30. {
  31. "term": {
  32. "seller_disabled_flag": {
  33. "value": "0"
  34. }
  35. }
  36. }
  37. ]
  38. }
  39. },
  40. "functions": [
  41. {
  42. "field_value_factor": {
  43. "field": "price_per_man"
  44. },
  45. "weight": 1
  46. }
  47. ],
  48. "score_mode": "sum",
  49. "boost_mode": "replace"
  50. }
  51. },
  52. "sort": [
  53. {
  54. "_score": {
  55. "order": "asc"
  56. }
  57. }
  58. ]
  59. }

聚合

fielddata字段的聚合

  1. GET shop/_search
  2. {
  3. "_source": "*",
  4. "script_fields": {
  5. "distance": {
  6. "script": {
  7. "source": "haversin(lat,lon,doc['location'].lat,doc['location'].lon)",
  8. "lang": "expression",
  9. "params": {
  10. "lat": 31.49345,
  11. "lon": 121.65671
  12. }
  13. }
  14. }
  15. },
  16. "query": {
  17. "function_score": {
  18. "query": {
  19. "bool": {
  20. "must": [
  21. {
  22. "match": {
  23. "name": {
  24. "query": "凯悦",
  25. "boost": 0.1
  26. }
  27. }
  28. },
  29. {
  30. "term": {
  31. "seller_disabled_flag": {
  32. "value": "0"
  33. }
  34. }
  35. }
  36. ]
  37. }
  38. },
  39. "functions": [
  40. {
  41. "field_value_factor": {
  42. "field": "price_per_man"
  43. },
  44. "weight": 1
  45. }
  46. ],
  47. "score_mode": "sum",
  48. "boost_mode": "replace"
  49. }
  50. },
  51. "sort": [
  52. {
  53. "_score": {
  54. "order": "asc"
  55. }
  56. }
  57. ],
  58. "aggs": {
  59. "aggByTag": {
  60. "terms": {
  61. "field": "tags",
  62. "size": 10
  63. }
  64. }
  65. }
  66. }

词库扩展

IK 分词器支持远程词库扩展:在 IKAnalyzer.cfg.xml 的 remote_ext_dict 配置项中填写词库地址,例如 http://yoursite.com/getCustomDict
例如我们可以用nginx来暴露出词库文件的地址来实现远程词库扩展。

相关性扩展

例如我们搜索同义词住宿的时候,我们需要搜到酒店类别的数据

  1. GET shop/_search
  2. {
  3. "_source": "*",
  4. "script_fields": {
  5. "distance": {
  6. "script": {
  7. "source": "haversin(lat,lon,doc['location'].lat,doc['location'].lon)",
  8. "lang": "expression",
  9. "params": {
  10. "lat": 31.49345,
  11. "lon": 121.65671
  12. }
  13. }
  14. }
  15. },
  16. "query": {
  17. "function_score": {
  18. "query": {
  19. "bool": {
  20. "must": [
  21. {
  22. "bool": {
  23. "should": [
  24. {
  25. "match": {
  26. "name": {
  27. "query": "住宿",
  28. "boost": 0.1
  29. }
  30. }
  31. },
  32. {
  33. "term": {
  34. "category_id": {
  35. "value": "2",
  36. "boost": 0
  37. }
  38. }
  39. }
  40. ]
  41. }
  42. },
  43. {
  44. "term": {
  45. "seller_disabled_flag": {
  46. "value": "0"
  47. }
  48. }
  49. }
  50. ]
  51. }
  52. },
  53. "functions": [
  54. {
  55. "field_value_factor": {
  56. "field": "price_per_man"
  57. },
  58. "weight": 1
  59. }
  60. ],
  61. "score_mode": "sum",
  62. "boost_mode": "replace"
  63. }
  64. },
  65. "sort": [
  66. {
  67. "_score": {
  68. "order": "asc"
  69. }
  70. }
  71. ],
  72. "aggs": {
  73. "aggByTag": {
  74. "terms": {
  75. "field": "tags",
  76. "size": 10
  77. }
  78. }
  79. }
  80. }

同义词门店索引结构

  1. PUT /shop
  2. {
  3. "settings": {
  4. "number_of_replicas": 1,
  5. "number_of_shards": 3,
  6. "analysis": {
  7. "filter": {
  8. "my_synonym_filter": {
  9. "type": "synonym",
  10. "synonyms_path": "analysis-ik/synonyms.txt"
  11. }
  12. },
  13. "analyzer": {
  14. "ik_syno": {
  15. "type": "custom",
  16. "tokenizer": "ik_smart",
  17. "filter": [
  18. "my_synonym_filter"
  19. ]
  20. },
  21. "ik_syno_max": {
  22. "type": "custom",
  23. "tokenizer": "ik_max_word",
  24. "filter": [
  25. "my_synonym_filter"
  26. ]
  27. }
  28. }
  29. }
  30. },
  31. "mappings": {
  32. "properties": {
  33. "id": {
  34. "type": "integer"
  35. },
  36. "name": {
  37. "type": "text",
  38. "analyzer": "ik_syno",
  39. "search_analyzer": "ik_syno_max"
  40. },
  41. "tags": {
  42. "type": "text",
  43. "analyzer": "whitespace",
  44. "fielddata": true
  45. },
  46. "location": {
  47. "type": "geo_point"
  48. },
  49. "remark_score": {
  50. "type": "double"
  51. },
  52. "price_per_man": {
  53. "type": "integer"
  54. },
  55. "category_id": {
  56. "type": "integer"
  57. },
  58. "category_name": {
  59. "type": "keyword"
  60. },
  61. "seller_id": {
  62. "type": "integer"
  63. },
  64. "seller_remark_score": {
  65. "type": "double"
  66. },
  67. "seller_disabled_flag": {
  68. "type": "integer"
  69. }
  70. }
  71. }
  72. }

logstash全量复制

配置文件

  1. input {
  2. stdin{
  3. }
  4. jdbc {
  5. # 连接的数据库地址和数据库,指定编码格式,禁用SSL协议,设定自动重连
  6. jdbc_connection_string => "jdbc:mysql://192.168.48.118:3306/dianpingdb?characterEncoding=UTF-8&useSSL=false&autoReconnect=true"
  7. # 用户名密码
  8. jdbc_user => "root"
  9. jdbc_password => "123456"
  10. # jar包的位置
  11. jdbc_driver_library => "/usr/local/logstash-7.1.1/logstash-core/lib/jars/mysql-connector-java-5.1.20.jar"
  12. # mysqlDriver
  13. jdbc_driver_class => "com.mysql.jdbc.Driver"
  14. jdbc_default_timezone => "Asia/Shanghai"
  15. jdbc_paging_enabled => "true"
  16. jdbc_page_size => "1000"
  17. statement_filepath => "/usr/local/logstash-7.1.1/config/sql/shop.sql"
  18. #statement => "select"
  19. #schedule => "* * * * *"
  20. clean_run => true
  21. }
  22. }
  23. output {
  24. stdout {
  25. codec => json_lines
  26. }
  27. elasticsearch {
  28. hosts => "192.168.48.94:9200"
  29. # index
  30. index => "shop"
  31. # 关联id
  32. document_id => "%{id}"
  33. }
  34. stdout {
  35. codec => json_lines
  36. }
  37. }

sql文件

  1. select a.id,a.name,a.tags,concat(a.latitude,',',a.longitude) as location,a.remark_score,a.price_per_man,a.category_id,b.name as category_name,a.seller_id,c.remark_score as seller_remark_score,c.disabled_flag as seller_disabled_flag from shop a inner join category b on a.category_id = b.id inner join seller c on c.id = a.seller_id

启动logstash

  1. ./bin/logstash -f config/shop.conf &

canal 增量复制

  1. 修改MySQL的my.cnf配置文件,增加以下内容

    1. server-id = 1
    2. binlog_format = ROW
    3. log_bin = mysql_bin
  2. 重启MySQL,查看binlog是否开启

    1. show VARIABLES like 'log_bin'
  3. MySQL创建canal用户

```sql
create user canal identified by 'canal';
grant select, show view, replication slave, replication client on *.* to 'canal'@'%';
flush privileges;
```

MySQL 8 需要把 canal 用户的认证插件修改为 mysql_native_password:

```sql
select host,user,plugin from mysql.user;
ALTER USER 'canal'@'%' IDENTIFIED WITH mysql_native_password BY 'canal';
```

  4. 安装canal-server,并修改配置文件

参考地址:[安装 canal-server](https://blog.csdn.net/zhenghongcs/article/details/109476376)

需要下载 canal 的各个组件 canal-server、canal-adapter、canal-admin,下载地址:[github.com/alibaba/canal/releases](https://github.com/alibaba/canal/releases)

首先安装canal-server,将下载好的压缩包 canal.deployer-1.1.5-SNAPSHOT.tar.gz 上传到Linux服务器,然后解压到指定目录/mydata/canal-server,可使用如下命令解压:

```bash
tar -zxvf canal.deployer-1.1.5-SNAPSHOT.tar.gz
# 然后
cd canal-server/conf/example/
vi instance.properties
```

修改配置文件

  1. # 把0改成10,只要不和mysql的id相同就行
  2. canal.instance.mysql.slaveId=10
  3. # 修改成mysql对应的账号密码
  4. canal.instance.master.address=192.168.1.6:3306
  5. canal.instance.dbUsername=canal
  6. canal.instance.dbPassword=canal
  7. # 监听库的规则,如下就是监听dianpingdb库的所有表
  8. canal.instance.filter.regex=dianpingdb\\..*
  5. 启动canal-server
    1. # 启动服务
    2. sh bin/startup.sh
    3. # 停止服务
    4. sh bin/stop.sh
    5. # 重启服务
    6. sh bin/restart.sh
    可以参考这篇文章:https://blog.csdn.net/zhenghongcs/article/details/109476376