准备切割字符串的函数
drop function if exists fn_split_text_to_array(str text);
create or replace function fn_split_text_to_array(str text)
returns text [] as $fn$
-- 将字符串按两个两个字符拆分
declare
i int = 1; -- int需要给默认值
ret text []; -- 数组不用给默认值,
strlen int = length(str);
substr text;
begin
case strlen
-- 长度小于2不做处理
WHEN 0, 1
then
-- 长度为直接拼接
WHEN 2
then
ret := ret || str;
-- 长度大于2每两个字符拆解
else
loop
exit when i > strlen; -- while循环,退出条件
--raise notice 'i: % length(str): % substr(str, i, 2): %', i, strlen,substr(str, i, 2);
substr = substr(str, i, 2);
if (length(substr) > 1)
then
ret := ret || substr;
end if;
i := i + 1; -- 循环退出标记
end loop;
end case;
return ret;
end;
$fn$
language plpgsql;
-- select fn_split_text_to_array('hello, my name''s henry morgan, 你出戏了');
-- select fn_split_text_to_array(content), content from t1 where id = 6443611;
drop function if exists str_to_tsvector(str text);
create or replace function str_to_tsvector(str text) returns tsvector
language plpgsql
as
$$
declare
ret text [] = '{}' :: text [];
item text;
begin
-- 过滤中英文特殊符号, 替换成空格, 英文符号中去除了单引号, 因为英文单词常用, 去除了@符号, 因为邮箱常用, 去除了小数点,因为数字常用
str = regexp_replace(str,
'[ ·{}()\[\]\r\n^|&*/#~,:、;?!\-"‖~§。?!,;:“”「」『』()〔〕【】—…《》〈〉﹏]', ' ', 'g');
-- 先按空格切分
for item in select unnest(regexp_split_to_array(str, ' '))
loop
-- 判空
if (length(trim(item)) < 2)
then
else
if (item ~ '(^[\d]{1,}]$)|(^[\d%]{1,}]$)|(^[\d.]*$)|(^[A-Za-z]{1,}$)|(^[A-Za-z'']{1,}$)|(^[A-Za-z0-9_@.]{1,}$)')
then
--raise notice 'item: %', item;
--raise notice 'fn_split_text_to_array(item): %', fn_split_text_to_array(item);
-- -- 纯数字, 纯英文, i'm, 邮箱, 直接拼接
ret = ret || item;
-- 另外处理小数点和@符号
ret = ret || regexp_split_to_array(item,'[.@]');
else
-- 提取出两位以上不包括两位的连续字母和数字
-- 汉字按两两字符拆分
ret = ret || fn_split_text_to_array(item);
end if;
end if;
end loop;
--raise notice 'ret: % ', ret;
return array_to_tsvector(ret);
end
$$;
select array_to_tsvector(str_segment('去你妈 的把'));
drop function if exists str_segment;
create or replace function str_segment(text) returns text[] immutable as $$
declare
-- 提取连续数字英文单词, 连续特殊符号组合, 及三组合
reg1 text = '([a-zA-Z0-9+=%_$@.\\''·{}()\[\]^|&*/#~,:、;?!\-"‖~§。?!,;:“”「」『』()〔〕【】—…《》〈〉﹏“”‘’——]{2,})';
-- 提取连续数字英文符号
reg2 text = '([a-zA-Z0-9''_$#%^@\\]{2,})';
-- 提取转移符号
reg3 text = '([\\][a-zA-Z0-9+=%_$@.\\''·{}()\[\]^|&*/#~,:、;?!\-"‖~§。?!,;:“”「」『』()〔〕【】—…《》〈〉﹏]{1})';
-- 提取数字字母组合
reg4 text = '([(a-zA-Z0-9)]{2,})';
-- 提取类邮箱@qq.com, @gmail.com
reg5 text = '([a-zA-Z''_$#%^@\\.]{2,})';
-- 提取类邮箱@163.com
reg6 text = '([0-9''_$#%^@\\.]{2,})';
arr1 text[];
arr2 text[];
str text;
begin
$1 = regexp_replace($1,'[\r\n]','g');
select array_agg(vals[1]) into arr2 from regexp_matches($1,reg1,'g') as vals;
arr1 = arr1 || arr2;
select array_agg(vals[1]) into arr2 from regexp_matches($1,reg2,'g') as vals;
arr1 = arr1 || arr2;
select array_agg(vals[1]) into arr2 from regexp_matches($1,reg3,'g') as vals;
arr1 = arr1 || arr2;
select array_agg(vals[1]) into arr2 from regexp_matches($1,reg4,'g') as vals;
arr1 = arr1 || arr2;
select array_agg(vals[1]) into arr2 from regexp_matches($1,reg5,'g') as vals;
arr1 = arr1 || arr2;
select array_agg(vals[1]) into arr2 from regexp_matches($1,reg6,'g') as vals;
arr1 = arr1 || arr2;
-- 按1 2 4 号正则拆分, 基本上可以提取出符号 数字 字母之外的语言, 汉字 韩文 等
for str in select unnest(regexp_split_to_array($1,reg1))
loop
arr1 = arr1 || fn_split_text_to_array(str);
end loop;
for str in select unnest(regexp_split_to_array($1,reg2))
loop
arr1 = arr1 || fn_split_text_to_array(str);
end loop;
for str in select unnest(regexp_split_to_array($1,reg4))
loop
arr1 = arr1 || fn_split_text_to_array(str);
end loop;
return arr1;
end;
$$ language plpgsql;
准备测试数据
create table t_test_rum(
id serial8 primary key ,
txt text not null
);
insert into t_test_rum(txt)
select concat(t_lastname.value,t_firstname.value) from t_lastname inner join t_firstname on true;
添加rum索引
create index rumidx on t_test_rum using rum (array_to_tsvector(str_segment(txt)) rum_tsvector_ops);
测试
-- 很快出结果
select * from t_test_rum where id > 10 limit 10;;
-- 一共两条
select count(1),array_agg(id) from t_test_rum where array_to_tsvector(str_segment(txt)) @@ '白月';
-- 但你偏要查询3条
select * from t_test_rum where array_to_tsvector(str_segment(txt)) @@ '白月' order by id,1 desc limit 3;
-- 来看下执行计划
explain (analyze,verbose,timing,costs,buffers)
select * from t_test_rum where array_to_tsvector(str_segment(txt)) @@ '白月' order by id,1 desc limit 3;
/*
------------------------------------------------------------------------------------------------------------------------------------------------------
Limit (cost=0.42..191.36 rows=3 width=40) (actual time=35.241..35393.717 rows=2 loops=1)
Output: id, txt
Buffers: shared hit=2443
-> Index Scan using t_test_rum_pkey on public.t_test_rum (cost=0.42..64475.62 rows=1013 width=40) (actual time=35.236..35393.702 rows=2 loops=1)
Output: id, txt
Filter: (array_to_tsvector(str_segment(t_test_rum.txt)) @@ '''白月'''::tsquery)
Rows Removed by Filter: 275449
Buffers: shared hit=2443
Planning Time: 0.133 ms
Execution Time: 35393.743 ms
(10 rows)
*/
/*
--------------------------------------------------------------------------------------------------------------------------------
Aggregate (cost=1826.70..1826.71 rows=1 width=40) (actual time=0.052..0.054 rows=1 loops=1)
Output: count(1), array_agg(id)
Buffers: shared hit=5
-> Bitmap Heap Scan on public.t_test_rum (cost=23.85..1821.64 rows=1013 width=8) (actual time=0.032..0.037 rows=2 loops=1)
Output: id, txt
Recheck Cond: (array_to_tsvector(str_segment(t_test_rum.txt)) @@ '''白月'''::tsquery)
Heap Blocks: exact=2
Buffers: shared hit=5
-> Bitmap Index Scan on rumidx (cost=0.00..23.60 rows=1013 width=0) (actual time=0.024..0.024 rows=2 loops=1)
Index Cond: (array_to_tsvector(str_segment(t_test_rum.txt)) @@ '''白月'''::tsquery)
Buffers: shared hit=3
Planning Time: 0.147 ms
Execution Time: 0.113 ms
(13 rows)
*/
-- 优化写法
with cte as (select id from t_test_rum where array_to_tsvector(str_segment(txt)) @@ '白月')
select * from t_test_rum inner join cte on cte.id = t_test_rum.id limit 10;
-- 当然这也不是最终写法, 当真实数据量上去之后, 也许会不同