使用场景

  • 使用clickhouse(分布式表–> 多个本地表分片),导入数据时,为了不影响集群性能,不走分布式表,而走本地表
  • 完整截取一个文件,并分发到不同的地方处理,数据不出现重复

    脚本详情

    原始版

    image.png ```shell

    !/bin/bash

    [ $# -lt 1 ] && echo “usage: sh $0 xx.cvs” && exit

file=$1 count=wc -l $file |awk '{print $1}'

hosts=( 192.168.56.1 192.168.56.2 192.168.56.3 ) hostnums=${#hosts[@]}

x,y 当前数据所在的stop, start行号

x=1 page_num=1 page_size=2

while [ $x -le $count ] do [ $x = 1 ] && y=$x || y=$(( x + 1 )) let x+=${page_size} #分页大小,数据批次导入到一个主机中

flag=$(( pagenum % hostnums )) if [ $flag = 1 ] ;then sed -n “$y,$x p” $file |awk -v host=${hosts[0]} ‘{print “clickhouse -host “ host_ “ <— “ $0}’

  1. #sed -n "$y,$x p" $file | clickhouse-client -h 192.168.56.1 --port 9000 -d default -m -u default --password 123456 --format_csv_delimiter=$'|' --query="insert into test_yh.product_id format CSV"

elif [ $flag = 2 ] ;then sed -n “$y,$x p” $file |awk -v host=${hosts[1]} ‘{print “clickhouse -host “ host “ <—“ $0}’

  1. #sed -n "$y,$x p" $file | clickhouse-client -h ${hosts[2]} --port 9000 -d default -m -u default --password 123456 --format_csv_delimiter=$'|' --query="insert into test_yh.product_id format CSV"

else sed -n “$y,$x p” $file |awk -v host=${hosts[2]} ‘{print “clickhouse -host “ host “ <—“ $0}’

  1. #sed -n "$y,$x p" $file | clickhouse-client -h ${hosts[3]} --port 9000 -d default -m -u default --password 123456 --format_csv_delimiter=$'|' --query="insert into test_yh.product_id format CSV"

fi

let page_num++ done

  1. <a name="W5p9c"></a>
  2. #### 改进版
  3. ![image.png](https://cdn.nlark.com/yuque/0/2022/png/373293/1654696262060-eed287eb-9f1f-4eef-b1c0-4afa383955a0.png#clientId=u3084efa7-5b07-4&crop=0&crop=0&crop=1&crop=1&from=paste&id=u660ec4de&margin=%5Bobject%20Object%5D&name=image.png&originHeight=290&originWidth=572&originalType=url&ratio=1&rotation=0&showTitle=false&size=26242&status=done&style=none&taskId=u47caf519-c8a3-4af2-affe-7c7835e4aef&title=)
  4. ```shell
  5. #!/bin/bash
  6. [ $# -lt 1 ] && echo "usage: sh $0 xx.cvs" && exit
  7. file=$1
  8. count=` wc -l $file |awk '{print $1}' `
  9. hosts=(
  10. 192.168.56.1
  11. 192.168.56.2
  12. 192.168.56.3
  13. )
  14. hostnums=${#hosts[@]}
  15. #批次分页:所在页码,页大小
  16. page_size=2
  17. #当前数据所在的stop, start行号 (不需要改动)
  18. page_num=1
  19. start_num=1
  20. stop_num=0
  21. #遍历所有文件的数据
  22. while [ $stop_num -le $count ]
  23. do
  24. let stop_num+=${page_size}
  25. #根据页码来分配数据,取模1==>host1, 取模2==>host2
  26. flag=$(( page_num % hostnums ))
  27. if [ $flag = 1 ] ;then
  28. sed -n "$start_num,$stop_num p" $file |awk -v host_=${hosts[0]} '{print "clickhouse -host " host_ " <-- " $0}'
  29. #sed -n "$start_num,$stop_num p" $file | clickhouse-client -h 192.168.56.1 --port 9000 -d default -m -u default --password 123456 --format_csv_delimiter=$'|' --query="insert into test_yh.product_id format CSV"
  30. elif [ $flag = 2 ] ;then
  31. sed -n "$start_num,$stop_num p" $file |awk -v host_=${hosts[1]} '{print "clickhouse -host " host_ " <--" $0}'
  32. #sed -n "$start_num,$stop_num p" $file | clickhouse-client -h ${hosts[2]} --port 9000 -d default -m -u default --password 123456 --format_csv_delimiter=$'|' --query="insert into test_yh.product_id format CSV"
  33. else
  34. sed -n "$start_num,$stop_num p" $file |awk -v host_=${hosts[2]} '{print "clickhouse -host " host_ " <--" $0}'
  35. #sed -n "$start_num,$stop_num p" $file | clickhouse-client -h ${hosts[3]} --port 9000 -d default -m -u default --password 123456 --format_csv_delimiter=$'|' --query="insert into test_yh.product_id format CSV"
  36. fi
  37. let start_num=${stop_num}+1
  38. let page_num++
  39. done

最终版

image.png

  1. #!/bin/bash
  2. [ $# -lt 1 ] && echo "usage: sh $0 xx.cvs" && exit
  3. file=$1
  4. count=` wc -l $file |awk '{print $1}' `
  5. hosts=(
  6. 192.168.56.1
  7. 192.168.56.2
  8. 192.168.56.3
  9. )
  10. hostnums=${#hosts[@]}
  11. #批次分页:所在页码,页大小
  12. page_size=2
  13. #当前数据所在的stop, start行号 (不需要改动)
  14. page_num=1
  15. start_num=1
  16. stop_num=0
  17. #遍历所有文件的数据
  18. while [ $stop_num -le $count ]
  19. do
  20. let stop_num+=${page_size}
  21. #根据页码来分配数据,取模1==>host1, 取模2==>host2
  22. flag=$(( page_num % hostnums ))
  23. index=$(( flag - 1 ))
  24. host=${hosts[$index]}
  25. sed -n "$start_num,$stop_num p" $file |awk -v host_=${host} '{print "clickhouse -host " host_ " <-- " $0}'
  26. #sed -n "$start_num,$stop_num p" $file | clickhouse-client -h ${host} --port 9000 -d default -m -u default --password 123456 --format_csv_delimiter=$'|' --query="insert into test_yh.product_id format CSV"
  27. let start_num=${stop_num}+1
  28. let page_num++
  29. done

转载:
截取分发指定行的数据:sed,clickhouse-client导入数据