准备切割字符串的函数
drop function if exists fn_split_text_to_array(str text);create or replace function fn_split_text_to_array(str text) returns text [] as $fn$-- 将字符串按两个两个字符拆分declare i int = 1; -- int需要给默认值 ret text []; -- 数组不用给默认值, strlen int = length(str); substr text;begin case strlen-- 长度小于2不做处理 WHEN 0, 1 then-- 长度为直接拼接 WHEN 2 then ret := ret || str;-- 长度大于2每两个字符拆解 else loop exit when i > strlen; -- while循环,退出条件--raise notice 'i: % length(str): % substr(str, i, 2): %', i, strlen,substr(str, i, 2); substr = substr(str, i, 2); if (length(substr) > 1) then ret := ret || substr; end if; i := i + 1; -- 循环退出标记 end loop; end case; return ret;end;$fn$ language plpgsql;-- select fn_split_text_to_array('hello, my name''s henry morgan, 你出戏了');-- select fn_split_text_to_array(content), content from t1 where id = 6443611;drop function if exists str_to_tsvector(str text);create or replace function str_to_tsvector(str text) returns tsvector language plpgsqlas$$declare ret text [] = '{}' :: text []; item text;begin -- 过滤中英文特殊符号, 替换成空格, 英文符号中去除了单引号, 因为英文单词常用, 去除了@符号, 因为邮箱常用, 去除了小数点,因为数字常用 str = regexp_replace(str, '[ ·{}()\[\]\r\n^|&*/#~,:、;?!\-"‖~§。?!,;:“”「」『』()〔〕【】—…《》〈〉﹏]', ' ', 'g'); -- 先按空格切分 for item in select unnest(regexp_split_to_array(str, ' ')) loop -- 判空 if (length(trim(item)) < 2) then else if (item ~ '(^[\d]{1,}]$)|(^[\d%]{1,}]$)|(^[\d.]*$)|(^[A-Za-z]{1,}$)|(^[A-Za-z'']{1,}$)|(^[A-Za-z0-9_@.]{1,}$)') then --raise notice 'item: %', item; --raise notice 'fn_split_text_to_array(item): %', fn_split_text_to_array(item);-- -- 纯数字, 纯英文, i'm, 邮箱, 直接拼接 ret = ret || item;-- 另外处理小数点和@符号 ret = ret || regexp_split_to_array(item,'[.@]'); else -- 提取出两位以上不包括两位的连续字母和数字-- 汉字按两两字符拆分 ret = ret || fn_split_text_to_array(item); end if; end if; end loop; --raise notice 'ret: % ', ret; return array_to_tsvector(ret);end$$;select array_to_tsvector(str_segment('去你妈 的把'));drop function if exists str_segment;create or replace function str_segment(text) returns text[] immutable as $$declare -- 提取连续数字英文单词, 连续特殊符号组合, 及三组合 reg1 text = '([a-zA-Z0-9+=%_$@.\\''·{}()\[\]^|&*/#~,:、;?!\-"‖~§。?!,;:“”「」『』()〔〕【】—…《》〈〉﹏“”‘’——]{2,})'; -- 提取连续数字英文符号 reg2 text = '([a-zA-Z0-9''_$#%^@\\]{2,})'; -- 提取转移符号 reg3 text = '([\\][a-zA-Z0-9+=%_$@.\\''·{}()\[\]^|&*/#~,:、;?!\-"‖~§。?!,;:“”「」『』()〔〕【】—…《》〈〉﹏]{1})'; -- 提取数字字母组合 reg4 text = '([(a-zA-Z0-9)]{2,})'; -- 提取类邮箱@qq.com, @gmail.com reg5 text = '([a-zA-Z''_$#%^@\\.]{2,})'; -- 提取类邮箱@163.com reg6 text = '([0-9''_$#%^@\\.]{2,})'; arr1 text[]; arr2 text[]; str text;begin $1 = regexp_replace($1,'[\r\n]','g'); select array_agg(vals[1]) into arr2 from regexp_matches($1,reg1,'g') as vals; arr1 = arr1 || arr2; select array_agg(vals[1]) into arr2 from regexp_matches($1,reg2,'g') as vals; arr1 = arr1 || arr2; select array_agg(vals[1]) into arr2 from regexp_matches($1,reg3,'g') as vals; arr1 = arr1 || arr2; select array_agg(vals[1]) into arr2 from regexp_matches($1,reg4,'g') as vals; arr1 = arr1 || arr2; select array_agg(vals[1]) into arr2 from regexp_matches($1,reg5,'g') as vals; arr1 = arr1 || arr2; select array_agg(vals[1]) into arr2 from regexp_matches($1,reg6,'g') as vals; arr1 = arr1 || arr2; -- 按1 2 4 号正则拆分, 基本上可以提取出符号 数字 字母之外的语言, 汉字 韩文 等 for str in select unnest(regexp_split_to_array($1,reg1)) loop arr1 = arr1 || fn_split_text_to_array(str); end loop; for str in select unnest(regexp_split_to_array($1,reg2)) loop arr1 = arr1 || fn_split_text_to_array(str); end loop; for str in select unnest(regexp_split_to_array($1,reg4)) loop arr1 = arr1 || fn_split_text_to_array(str); end loop; return arr1;end;$$ language plpgsql;
准备测试数据
create table t_test_rum(
id serial8 primary key ,
txt text not null
);
insert into t_test_rum(txt)
select concat(t_lastname.value,t_firstname.value) from t_lastname inner join t_firstname on true;
添加rum索引
create index rumidx on t_test_rum using rum (array_to_tsvector(str_segment(txt)) rum_tsvector_ops);
测试
-- 很快出结果
select * from t_test_rum where id > 10 limit 10;;
-- 一共两条
select count(1),array_agg(id) from t_test_rum where array_to_tsvector(str_segment(txt)) @@ '白月';
-- 但你偏要查询3条
select * from t_test_rum where array_to_tsvector(str_segment(txt)) @@ '白月' order by id,1 desc limit 3;
-- 来看下执行计划
explain (analyze,verbose,timing,costs,buffers)
select * from t_test_rum where array_to_tsvector(str_segment(txt)) @@ '白月' order by id,1 desc limit 3;
/*
------------------------------------------------------------------------------------------------------------------------------------------------------
Limit (cost=0.42..191.36 rows=3 width=40) (actual time=35.241..35393.717 rows=2 loops=1)
Output: id, txt
Buffers: shared hit=2443
-> Index Scan using t_test_rum_pkey on public.t_test_rum (cost=0.42..64475.62 rows=1013 width=40) (actual time=35.236..35393.702 rows=2 loops=1)
Output: id, txt
Filter: (array_to_tsvector(str_segment(t_test_rum.txt)) @@ '''白月'''::tsquery)
Rows Removed by Filter: 275449
Buffers: shared hit=2443
Planning Time: 0.133 ms
Execution Time: 35393.743 ms
(10 rows)
*/
/*
--------------------------------------------------------------------------------------------------------------------------------
Aggregate (cost=1826.70..1826.71 rows=1 width=40) (actual time=0.052..0.054 rows=1 loops=1)
Output: count(1), array_agg(id)
Buffers: shared hit=5
-> Bitmap Heap Scan on public.t_test_rum (cost=23.85..1821.64 rows=1013 width=8) (actual time=0.032..0.037 rows=2 loops=1)
Output: id, txt
Recheck Cond: (array_to_tsvector(str_segment(t_test_rum.txt)) @@ '''白月'''::tsquery)
Heap Blocks: exact=2
Buffers: shared hit=5
-> Bitmap Index Scan on rumidx (cost=0.00..23.60 rows=1013 width=0) (actual time=0.024..0.024 rows=2 loops=1)
Index Cond: (array_to_tsvector(str_segment(t_test_rum.txt)) @@ '''白月'''::tsquery)
Buffers: shared hit=3
Planning Time: 0.147 ms
Execution Time: 0.113 ms
(13 rows)
*/
-- 优化写法
with cte as (select id from t_test_rum where array_to_tsvector(str_segment(txt)) @@ '白月')
select * from t_test_rum inner join cte on cte.id = t_test_rum.id limit 10;
-- 当然这也不是最终写法, 当真实数据量上去之后, 也许会不同