准备切割字符串的函数

  1. drop function if exists fn_split_text_to_array(str text);
  2. create or replace function fn_split_text_to_array(str text)
  3. returns text [] as $fn$
  4. -- 将字符串按两个两个字符拆分
  5. declare
  6. i int = 1; -- int需要给默认值
  7. ret text []; -- 数组不用给默认值,
  8. strlen int = length(str);
  9. substr text;
  10. begin
  11. case strlen
  12. -- 长度小于2不做处理
  13. WHEN 0, 1
  14. then
  15. -- 长度为直接拼接
  16. WHEN 2
  17. then
  18. ret := ret || str;
  19. -- 长度大于2每两个字符拆解
  20. else
  21. loop
  22. exit when i > strlen; -- while循环,退出条件
  23. --raise notice 'i: % length(str): % substr(str, i, 2): %', i, strlen,substr(str, i, 2);
  24. substr = substr(str, i, 2);
  25. if (length(substr) > 1)
  26. then
  27. ret := ret || substr;
  28. end if;
  29. i := i + 1; -- 循环退出标记
  30. end loop;
  31. end case;
  32. return ret;
  33. end;
  34. $fn$
  35. language plpgsql;
  36. -- select fn_split_text_to_array('hello, my name''s henry morgan, 你出戏了');
  37. -- select fn_split_text_to_array(content), content from t1 where id = 6443611;
  38. drop function if exists str_to_tsvector(str text);
  39. create or replace function str_to_tsvector(str text) returns tsvector
  40. language plpgsql
  41. as
  42. $$
  43. declare
  44. ret text [] = '{}' :: text [];
  45. item text;
  46. begin
  47. -- 过滤中英文特殊符号, 替换成空格, 英文符号中去除了单引号, 因为英文单词常用, 去除了@符号, 因为邮箱常用, 去除了小数点,因为数字常用
  48. str = regexp_replace(str,
  49. '[ ·{}()\[\]\r\n^|&*/#~,:、;?!\-"‖~§。?!,;:“”「」『』()〔〕【】—…《》〈〉﹏]', ' ', 'g');
  50. -- 先按空格切分
  51. for item in select unnest(regexp_split_to_array(str, ' '))
  52. loop
  53. -- 判空
  54. if (length(trim(item)) < 2)
  55. then
  56. else
  57. if (item ~ '(^[\d]{1,}]$)|(^[\d%]{1,}]$)|(^[\d.]*$)|(^[A-Za-z]{1,}$)|(^[A-Za-z'']{1,}$)|(^[A-Za-z0-9_@.]{1,}$)')
  58. then
  59. --raise notice 'item: %', item;
  60. --raise notice 'fn_split_text_to_array(item): %', fn_split_text_to_array(item);
  61. -- -- 纯数字, 纯英文, i'm, 邮箱, 直接拼接
  62. ret = ret || item;
  63. -- 另外处理小数点和@符号
  64. ret = ret || regexp_split_to_array(item,'[.@]');
  65. else
  66. -- 提取出两位以上不包括两位的连续字母和数字
  67. -- 汉字按两两字符拆分
  68. ret = ret || fn_split_text_to_array(item);
  69. end if;
  70. end if;
  71. end loop;
  72. --raise notice 'ret: % ', ret;
  73. return array_to_tsvector(ret);
  74. end
  75. $$;
  76. select array_to_tsvector(str_segment('去你妈 的把'));
  77. drop function if exists str_segment;
  78. create or replace function str_segment(text) returns text[] immutable as $$
  79. declare
  80. -- 提取连续数字英文单词, 连续特殊符号组合, 及三组合
  81. reg1 text = '([a-zA-Z0-9+=%_$@.\\''·{}()\[\]^|&*/#~,:、;?!\-"‖~§。?!,;:“”「」『』()〔〕【】—…《》〈〉﹏“”‘’——]{2,})';
  82. -- 提取连续数字英文符号
  83. reg2 text = '([a-zA-Z0-9''_$#%^@\\]{2,})';
  84. -- 提取转移符号
  85. reg3 text = '([\\][a-zA-Z0-9+=%_$@.\\''·{}()\[\]^|&*/#~,:、;?!\-"‖~§。?!,;:“”「」『』()〔〕【】—…《》〈〉﹏]{1})';
  86. -- 提取数字字母组合
  87. reg4 text = '([(a-zA-Z0-9)]{2,})';
  88. -- 提取类邮箱@qq.com, @gmail.com
  89. reg5 text = '([a-zA-Z''_$#%^@\\.]{2,})';
  90. -- 提取类邮箱@163.com
  91. reg6 text = '([0-9''_$#%^@\\.]{2,})';
  92. arr1 text[];
  93. arr2 text[];
  94. str text;
  95. begin
  96. $1 = regexp_replace($1,'[\r\n]','g');
  97. select array_agg(vals[1]) into arr2 from regexp_matches($1,reg1,'g') as vals;
  98. arr1 = arr1 || arr2;
  99. select array_agg(vals[1]) into arr2 from regexp_matches($1,reg2,'g') as vals;
  100. arr1 = arr1 || arr2;
  101. select array_agg(vals[1]) into arr2 from regexp_matches($1,reg3,'g') as vals;
  102. arr1 = arr1 || arr2;
  103. select array_agg(vals[1]) into arr2 from regexp_matches($1,reg4,'g') as vals;
  104. arr1 = arr1 || arr2;
  105. select array_agg(vals[1]) into arr2 from regexp_matches($1,reg5,'g') as vals;
  106. arr1 = arr1 || arr2;
  107. select array_agg(vals[1]) into arr2 from regexp_matches($1,reg6,'g') as vals;
  108. arr1 = arr1 || arr2;
  109. -- 1 2 4 号正则拆分, 基本上可以提取出符号 数字 字母之外的语言, 汉字 韩文
  110. for str in select unnest(regexp_split_to_array($1,reg1))
  111. loop
  112. arr1 = arr1 || fn_split_text_to_array(str);
  113. end loop;
  114. for str in select unnest(regexp_split_to_array($1,reg2))
  115. loop
  116. arr1 = arr1 || fn_split_text_to_array(str);
  117. end loop;
  118. for str in select unnest(regexp_split_to_array($1,reg4))
  119. loop
  120. arr1 = arr1 || fn_split_text_to_array(str);
  121. end loop;
  122. return arr1;
  123. end;
  124. $$ language plpgsql;

准备测试数据

create table t_test_rum(
    id serial8 primary key ,
    txt text not null
);

insert into t_test_rum(txt)
select concat(t_lastname.value,t_firstname.value) from t_lastname inner join t_firstname on true;

添加rum索引

create index rumidx on t_test_rum using rum (array_to_tsvector(str_segment(txt)) rum_tsvector_ops);

测试


-- 很快出结果
select * from t_test_rum where id > 10 limit 10;;

-- 一共两条
select count(1),array_agg(id) from  t_test_rum where array_to_tsvector(str_segment(txt)) @@ '白月';

-- 但你偏要查询3条
select * from t_test_rum where array_to_tsvector(str_segment(txt)) @@ '白月' order by id,1 desc limit 3;

-- 来看下执行计划
explain (analyze,verbose,timing,costs,buffers)
select * from t_test_rum where array_to_tsvector(str_segment(txt)) @@ '白月' order by id,1 desc limit 3;
/*
------------------------------------------------------------------------------------------------------------------------------------------------------
 Limit  (cost=0.42..191.36 rows=3 width=40) (actual time=35.241..35393.717 rows=2 loops=1)
   Output: id, txt
   Buffers: shared hit=2443
   ->  Index Scan using t_test_rum_pkey on public.t_test_rum  (cost=0.42..64475.62 rows=1013 width=40) (actual time=35.236..35393.702 rows=2 loops=1)
         Output: id, txt
         Filter: (array_to_tsvector(str_segment(t_test_rum.txt)) @@ '''白月'''::tsquery)
         Rows Removed by Filter: 275449
         Buffers: shared hit=2443
 Planning Time: 0.133 ms
 Execution Time: 35393.743 ms
(10 rows)
*/
/*
--------------------------------------------------------------------------------------------------------------------------------
 Aggregate  (cost=1826.70..1826.71 rows=1 width=40) (actual time=0.052..0.054 rows=1 loops=1)
   Output: count(1), array_agg(id)
   Buffers: shared hit=5
   ->  Bitmap Heap Scan on public.t_test_rum  (cost=23.85..1821.64 rows=1013 width=8) (actual time=0.032..0.037 rows=2 loops=1)
         Output: id, txt
         Recheck Cond: (array_to_tsvector(str_segment(t_test_rum.txt)) @@ '''白月'''::tsquery)
         Heap Blocks: exact=2
         Buffers: shared hit=5
         ->  Bitmap Index Scan on rumidx  (cost=0.00..23.60 rows=1013 width=0) (actual time=0.024..0.024 rows=2 loops=1)
               Index Cond: (array_to_tsvector(str_segment(t_test_rum.txt)) @@ '''白月'''::tsquery)
               Buffers: shared hit=3
 Planning Time: 0.147 ms
 Execution Time: 0.113 ms
(13 rows)

*/

-- 优化写法
with cte as (select id from  t_test_rum where array_to_tsvector(str_segment(txt)) @@ '白月')
select * from t_test_rum inner join cte on cte.id = t_test_rum.id limit 10;

-- 当然这也不是最终写法, 当真实数据量上去之后, 也许会不同