如何实现
利用ES的completion suggester(补全提示器)进行关键词补全。completion suggester基于FST(有限状态转换器,一种类似前缀树的结构)实现,数据全部装载在内存中,速度极快。
现有的数据
自动补全实现
pv/uv 每个用户有几张有效数据 (系数 2)
pv/search_cnt 每次搜索有几个有效数据 (系数1)
search_cnt/uv 每个用户搜几次 (系数0.05)
no_results_pv/search_cnt 每次搜索有几个无效图 (系数-0.5 负反馈)
· 对七天内数据(PV UV search_cnt)进行加权融合 算出权重
· 存入ES
· 清洗敏感词,补全拼音
建立索引模板
PUT /_template/keyword_suggest
{
  "order": 1,
  "index_patterns": ["*_keyword_suggest"],
  "settings": {
    "analysis": {
      "analyzer": {
        "prefix_pinyin_analyzer": {
          "tokenizer": "standard",
          "filter": ["lowercase", "prefix_pinyin"]
        },
        "full_pinyin_analyzer": {
          "tokenizer": "standard",
          "filter": ["lowercase", "full_pinyin"]
        }
      },
      "filter": {
        "_pattern": {
          "type": "pattern_capture",
          "preserve_original": true,
          "patterns": ["([0-9])", "([a-z])"]
        },
        "prefix_pinyin": {
          "type": "pinyin",
          "keep_first_letter": true,
          "keep_full_pinyin": false,
          "none_chinese_pinyin_tokenize": false,
          "keep_original": false
        },
        "full_pinyin": {
          "type": "pinyin",
          "keep_first_letter": false,
          "keep_full_pinyin": true,
          "keep_original": false,
          "keep_none_chinese_in_first_letter": false
        }
      }
    }
  },
  "mappings": {
    "properties": {
      "id": { "type": "keyword" },
      "suggestText": {
        "type": "completion",
        "analyzer": "standard",
        "preserve_separators": false,
        "preserve_position_increments": true,
        "max_input_length": 50
      },
      "prefix_pinyin": {
        "type": "completion",
        "analyzer": "prefix_pinyin_analyzer",
        "search_analyzer": "standard",
        "preserve_separators": false
      },
      "full_pinyin": {
        "type": "completion",
        "analyzer": "full_pinyin_analyzer",
        "search_analyzer": "full_pinyin_analyzer",
        "preserve_separators": false
      }
    }
  },
  "aliases": {
    "keyword_suggest": {}
  }
}
通过pinyin分词器,只要全拼正确,即使有错别字也可以识别。
spark计算
Spark代码依赖的辅助函数太多,这里不贴完整代码。
思路是:查出所有数据后,用下面的表达式计算权重:
// Per-keyword ratios that feed the suggestion weight.
val pvPerUser = DataFrame("pv") / DataFrame("uv")
val pvPerSearch = DataFrame("pv") / DataFrame("search_cnt")
val searchesPerUser = DataFrame("search_cnt") / DataFrame("uv")
val noResultPerSearch = DataFrame("no_results_pv") / DataFrame("search_cnt")

// Weighted blend of the ratios (coefficients 2, 1, 0.05 and -0.5 for the
// no-result penalty), with the whole sum scaled by 3.5.
val weightExpr =
  (pvPerUser * 2 + pvPerSearch + searchesPerUser * 0.05 - noResultPerSearch * 0.5) * 3.5

DataFrame = DataFrame.withColumn("weight", weightExpr)
清洗关键词及拼音补全
通过Laravel定时任务执行,使用overtrue/pinyin(https://github.com/overtrue/pinyin)进行拼音补全
/**
 * Clean yesterday's keyword-suggest index and back-fill its pinyin fields.
 *
 * Scrolls the index 1000 documents at a time. A document is deleted when its
 * suggest text matches the bad-word check or contains anything other than
 * CJK ideographs (U+4E00..U+9FA5), ASCII letters and digits (this includes a
 * missing/empty text). Every remaining document receives a partial update
 * that sets `full_pinyin` and `prefix_pinyin`, both carrying the same weight
 * as `suggestText`.
 *
 * @return void
 */
public function handle()
{
    // Dictionary loader that keeps the pinyin dictionary in memory.
    $pinyin = new Pinyin('Overtrue\Pinyin\MemoryFileDictLoader');
    $scroll_id = null;

    // Replace the 'day' placeholder in the index name with yesterday's date (Ymd).
    // NOTE(review): this overwrites the static property, so a second call in the
    // same process would find no placeholder left to replace — confirm the
    // command only ever runs once per process.
    KeywordSuggest::$search_index = str_replace('day', Carbon::yesterday()->format('Ymd'), KeywordSuggest::$search_index);

    foreach (KeywordSuggest::scrollIndex([], 1000, null, null, $scroll_id) as $value)
    {
        list($suggests, $scroll_id) = $value;
        $params = ['body' => []];

        foreach ($suggests as $item)
        {
            $suggest = $item['_source'];
            $id = array_get($suggest, 'id');
            $suggest_text = array_get($suggest, 'suggestText.input.0');
            $weight = array_get($suggest, 'suggestText.weight');

            // Drop the document when it hits the bad-word check or is not purely
            // CJK/alphanumeric. The string cast keeps a missing text (null) on the
            // "no match => delete" path without passing null to preg_match().
            $is_bad_word = $suggest_text && Video::getBadWord($suggest_text);
            if ($is_bad_word || !preg_match("/^[\x{4e00}-\x{9fa5}A-Za-z0-9]+$/u", (string) $suggest_text))
            {
                KeywordSuggest::deleteIndex($id, false);
                continue;
            }

            // Full pinyin is the converted syllables joined without a separator;
            // the prefix variant is the abbreviation returned by abbr().
            $full_pinyin = implode('', $pinyin->convert($suggest_text));
            $prefix_pinyin = $pinyin->abbr($suggest_text);

            // A bulk update is two consecutive entries: the action line, then the partial doc.
            $params['body'][] = [
                'update' => [
                    '_index' => KeywordSuggest::$search_index,
                    '_type' => '_doc',
                    '_id' => $id
                ]
            ];
            $params['body'][] = [
                'doc' => [
                    'full_pinyin' => [
                        'input' => [$full_pinyin],
                        'weight' => $weight
                    ],
                    'prefix_pinyin' => [
                        'input' => [$prefix_pinyin],
                        'weight' => $weight
                    ],
                ]
            ];
        }

        // Every document in this page may have been deleted; an empty bulk
        // request is rejected by Elasticsearch, so only send when there is work.
        if (!empty($params['body']))
        {
            $response = KeywordSuggest::elasticSearchclient()->bulk($params);

            // Bulk calls do not throw on per-item failures; surface them in the log.
            if (is_array($response) && !empty($response['errors']))
            {
                $this->error('bulk update reported failed items');
            }
        }

        $this->info($scroll_id);
    }

    $this->info('success');
}
使用
DSL如下
// One completion suggester per completion field, each named after the field
// it queries and all sharing the same prefix; duplicate suggestions are skipped.
$suggest_fields = ['prefix_pinyin', 'full_pinyin', 'suggestText'];

$suggesters = [];
foreach ($suggest_fields as $suggest_field) {
    $suggesters[$suggest_field] = [
        'prefix' => $keyword,
        "completion" => [
            'field' => $suggest_field,
            'skip_duplicates' => true
        ]
    ];
}
$body['suggest'] = $suggesters;

// Only return the suggestText field from each hit's _source.
$body['_source']['includes'] = 'suggestText';