准备切割字符串的函数

drop function if exists fn_split_text_to_array(str text);
create or replace function fn_split_text_to_array(str text)
    returns text [] as $fn$
-- 将字符串按两个两个字符拆分
declare
    i      int = 1; -- int需要给默认值
    ret    text []; -- 数组不用给默认值,
    strlen int = length(str);
    substr text;
begin
    case strlen
-- 长度小于2不做处理
        WHEN 0, 1
            then
-- 长度为直接拼接
        WHEN 2
            then
                ret := ret || str;
-- 长度大于2每两个字符拆解
        else
            loop
                exit when i > strlen; -- while循环,退出条件
--raise notice 'i: %  length(str): %  substr(str, i, 2): %', i, strlen,substr(str, i, 2);
                substr = substr(str, i, 2);
                if (length(substr) > 1)
                then
                    ret := ret || substr;
                end if;
                i := i + 1; -- 循环退出标记
            end loop;
        end case;
    return ret;
end;
$fn$
    language plpgsql;
-- select fn_split_text_to_array('hello, my name''s henry morgan, 你出戏了');
-- select fn_split_text_to_array(content), content from t1 where id = 6443611;
drop function if exists str_to_tsvector(str text);
create or replace function str_to_tsvector(str text) returns tsvector
    language plpgsql
as
$$
declare
    ret  text [] = '{}' :: text [];
    item text;
begin
    -- 过滤中英文特殊符号, 替换成空格, 英文符号中去除了单引号, 因为英文单词常用, 去除了@符号, 因为邮箱常用, 去除了小数点,因为数字常用
    str = regexp_replace(str,
                         '[ ·{}()\[\]\r\n^|&*/#~,:、;?!\-"‖～§。？！，；：“”「」『』（）〔〕【】—…《》〈〉﹏]', ' ', 'g');
    -- 先按空格切分
    for item in select unnest(regexp_split_to_array(str, ' '))
        loop
            -- 判空
            if (length(trim(item)) < 2)
            then
            else
                if (item ~ '(^[\d]{1,}]$)|(^[\d%]{1,}]$)|(^[\d.]*$)|(^[A-Za-z]{1,}$)|(^[A-Za-z'']{1,}$)|(^[A-Za-z0-9_@.]{1,}$)')
                then
                    --raise notice 'item: %', item;
                    --raise notice 'fn_split_text_to_array(item): %', fn_split_text_to_array(item);
-- -- 纯数字, 纯英文, i'm, 邮箱, 直接拼接
                    ret = ret || item;
-- 另外处理小数点和@符号
                    ret = ret || regexp_split_to_array(item,'[.@]');
                else
                    -- 提取出两位以上不包括两位的连续字母和数字
-- 汉字按两两字符拆分
                    ret = ret || fn_split_text_to_array(item);
                end if;
            end if;
        end loop;
    --raise notice 'ret: % ', ret;
    return array_to_tsvector(ret);
end
$$;
select array_to_tsvector(str_segment('去你妈 的把'));
drop function  if exists str_segment;
create or replace function str_segment(text) returns text[] immutable as $$
declare
    -- 提取连续数字英文单词, 连续特殊符号组合, 及三组合
    reg1 text = '([a-zA-Z0-9+=%_$@.\\''·{}()\[\]^|&*/#~,:、;?!\-"‖～§。？！，；：“”「」『』（）〔〕【】—…《》〈〉﹏“”‘’——]{2,})';
    -- 提取连续数字英文符号
    reg2 text = '([a-zA-Z0-9''_$#%^@\\]{2,})';
    -- 提取转移符号
    reg3 text = '([\\][a-zA-Z0-9+=%_$@.\\''·{}()\[\]^|&*/#~,:、;?!\-"‖～§。？！，；：“”「」『』（）〔〕【】—…《》〈〉﹏]{1})';
    -- 提取数字字母组合
    reg4 text = '([(a-zA-Z0-9)]{2,})';
    -- 提取类邮箱@qq.com, @gmail.com
    reg5 text = '([a-zA-Z''_$#%^@\\.]{2,})';
    -- 提取类邮箱@163.com
    reg6 text = '([0-9''_$#%^@\\.]{2,})';
    arr1 text[];
    arr2 text[];
    str text;
begin
    $1 = regexp_replace($1,'[\r\n]','g');
    select array_agg(vals[1]) into arr2 from regexp_matches($1,reg1,'g') as vals;
    arr1 = arr1 || arr2;
    select array_agg(vals[1]) into arr2 from regexp_matches($1,reg2,'g') as vals;
    arr1 = arr1 || arr2;
    select array_agg(vals[1]) into arr2 from regexp_matches($1,reg3,'g') as vals;
    arr1 = arr1 || arr2;
    select array_agg(vals[1]) into arr2 from regexp_matches($1,reg4,'g') as vals;
    arr1 = arr1 || arr2;
    select array_agg(vals[1]) into arr2 from regexp_matches($1,reg5,'g') as vals;
    arr1 = arr1 || arr2;
    select array_agg(vals[1]) into arr2 from regexp_matches($1,reg6,'g') as vals;
    arr1 = arr1 || arr2;
    -- 按1 2 4 号正则拆分, 基本上可以提取出符号 数字 字母之外的语言, 汉字 韩文 等
    for str in select unnest(regexp_split_to_array($1,reg1))
        loop
            arr1 = arr1 || fn_split_text_to_array(str);
        end loop;
    for str in select unnest(regexp_split_to_array($1,reg2))
        loop
            arr1 = arr1 || fn_split_text_to_array(str);
        end loop;
    for str in select unnest(regexp_split_to_array($1,reg4))
        loop
            arr1 = arr1 || fn_split_text_to_array(str);
        end loop;
    return arr1;
end;
$$ language plpgsql;

准备测试数据

create table t_test_rum(
    id serial8 primary key ,
    txt text not null
);

insert into t_test_rum(txt)
select concat(t_lastname.value,t_firstname.value) from t_lastname inner join t_firstname on true;

添加rum索引

create index rumidx on t_test_rum using rum (array_to_tsvector(str_segment(txt)) rum_tsvector_ops);

测试


-- 很快出结果
select * from t_test_rum where id > 10 limit 10;;

-- 一共两条
select count(1),array_agg(id) from  t_test_rum where array_to_tsvector(str_segment(txt)) @@ '白月';

-- 但你偏要查询3条
select * from t_test_rum where array_to_tsvector(str_segment(txt)) @@ '白月' order by id,1 desc limit 3;

-- 来看下执行计划
explain (analyze,verbose,timing,costs,buffers)
select * from t_test_rum where array_to_tsvector(str_segment(txt)) @@ '白月' order by id,1 desc limit 3;
/*
------------------------------------------------------------------------------------------------------------------------------------------------------
 Limit  (cost=0.42..191.36 rows=3 width=40) (actual time=35.241..35393.717 rows=2 loops=1)
   Output: id, txt
   Buffers: shared hit=2443
   ->  Index Scan using t_test_rum_pkey on public.t_test_rum  (cost=0.42..64475.62 rows=1013 width=40) (actual time=35.236..35393.702 rows=2 loops=1)
         Output: id, txt
         Filter: (array_to_tsvector(str_segment(t_test_rum.txt)) @@ '''白月'''::tsquery)
         Rows Removed by Filter: 275449
         Buffers: shared hit=2443
 Planning Time: 0.133 ms
 Execution Time: 35393.743 ms
(10 rows)
*/
/*
--------------------------------------------------------------------------------------------------------------------------------
 Aggregate  (cost=1826.70..1826.71 rows=1 width=40) (actual time=0.052..0.054 rows=1 loops=1)
   Output: count(1), array_agg(id)
   Buffers: shared hit=5
   ->  Bitmap Heap Scan on public.t_test_rum  (cost=23.85..1821.64 rows=1013 width=8) (actual time=0.032..0.037 rows=2 loops=1)
         Output: id, txt
         Recheck Cond: (array_to_tsvector(str_segment(t_test_rum.txt)) @@ '''白月'''::tsquery)
         Heap Blocks: exact=2
         Buffers: shared hit=5
         ->  Bitmap Index Scan on rumidx  (cost=0.00..23.60 rows=1013 width=0) (actual time=0.024..0.024 rows=2 loops=1)
               Index Cond: (array_to_tsvector(str_segment(t_test_rum.txt)) @@ '''白月'''::tsquery)
               Buffers: shared hit=3
 Planning Time: 0.147 ms
 Execution Time: 0.113 ms
(13 rows)

*/

-- 优化写法
with cte as (select id from  t_test_rum where array_to_tsvector(str_segment(txt)) @@ '白月')
select * from t_test_rum inner join cte on cte.id = t_test_rum.id limit 10;

-- 当然这也不是最终写法, 当真实数据量上去之后, 也许会不同

postgresql笔记

Rum测试(此文废弃待重写

准备切割字符串的函数

准备测试数据

添加rum索引

测试

Rum测试(此文废弃 待重写

准备切割字符串的函数

准备测试数据

添加rum索引

测试

Rum测试(此文废弃待重写