1. 购买OSS存储表
2. 下载ClickHouse数据文件,并存储到OSS
:::info 通过阿里cloudshell(默认只让用1小时,1小时后开启新的实例)下载并传输到oss :::
wget http://prod.publicdata.landregistry.gov.uk.s3-website-eu-west-1.amazonaws.com/pp-complete.csv
aliyun oss cp pp-complete.csv oss://clickhouse-test-sinosoft/pp-complete.csv
:::warning 下载上传时间 1个小时之内 :::
处理完成后一定要删除 cloudshell里面得文件
3. 购买并导入数据
:::info 购买 ::: :::info 创建账户 ::: :::info 开通公网 ,并配置安全组 :::
3.1 创建表
CREATE TABLE uk_price_paid_local on cluster default
(
price UInt32,
date Date,
postcode1 LowCardinality(String),
postcode2 LowCardinality(String),
type Enum8('terraced' = 1, 'semi-detached' = 2, 'detached' = 3, 'flat' = 4, 'other' = 0),
is_new UInt8,
duration Enum8('freehold' = 1, 'leasehold' = 2, 'unknown' = 0),
addr1 String,
addr2 String,
street LowCardinality(String),
locality LowCardinality(String),
town LowCardinality(String),
district LowCardinality(String),
county LowCardinality(String),
category UInt8
) ENGINE = MergeTree ORDER BY (postcode1, postcode2, addr1, addr2);
CREATE TABLE uk_price_paid_distributed on cluster default
(
price UInt32,
date Date,
postcode1 LowCardinality(String),
postcode2 LowCardinality(String),
type Enum8('terraced' = 1, 'semi-detached' = 2, 'detached' = 3, 'flat' = 4, 'other' = 0),
is_new UInt8,
duration Enum8('freehold' = 1, 'leasehold' = 2, 'unknown' = 0),
addr1 String,
addr2 String,
street LowCardinality(String),
locality LowCardinality(String),
town LowCardinality(String),
district LowCardinality(String),
county LowCardinality(String),
category UInt8
) ENGINE = Distributed(default, default, uk_price_paid_local, rand());
3.2 查看OSS在线数据
SELECT * FROM oss('oss-cn-hangzhou-internal.aliyuncs.com', '<AK>', '<SK>', 'oss://clickhouse-test-sinosoft/pp-complete.csv', 'CSV','uuid String,price UInt32,time String,postcode String,a String,b String,c String,addr1 String,addr2 String,street String,locality String,town String,district String,county String,d String,e String');
3.3 导入数据
insert into uk_price_paid_distributed SELECT price,
toDate(concat(time,':00')) AS date,
splitByChar(' ', postcode)[1] AS postcode1,
splitByChar(' ', postcode)[2] AS postcode2,
transform(a, ['T', 'S', 'D', 'F', 'O'], ['terraced', 'semi-detached', 'detached', 'flat', 'other']) AS type,
b = 'Y' AS is_new,
transform(c, ['F', 'L', 'U'], ['freehold', 'leasehold', 'unknown']) AS duration,
addr1,
addr2,
street,
locality,
town,
district,
county,
d = 'B' AS category FROM oss('oss-cn-hangzhou-internal.aliyuncs.com', '<AK>', '<SK>', 'oss://clickhouse-test-sinosoft/pp-complete.csv', 'CSV','uuid String,price UInt32,time String,postcode String,a String,b String,c String,addr1 String,addr2 String,street String,locality String,town String,district String,county String,d String,e String');
:::info
注意: 1个实例分布式表 导入数据大概 250s 官方案例上面说的 40s (因为官方是单机器)
如果更多实例,这个时间上会有提升
:::
3.4 其他查询
SELECT count() FROM uk_price_paid_distributed;
SELECT formatReadableSize(total_bytes) FROM system.tables WHERE name = 'uk_price_paid_local';
-- 每年得平均价格 95ms
SELECT toYear(date) AS year, round(avg(price)) AS price, bar(price, 0, 1000000, 80) FROM uk_price_paid_local GROUP BY year ORDER BY year;
SELECT toYear(date) AS year, round(avg(price)) AS price, bar(price, 0, 1000000, 80) FROM uk_price_paid_distributed GROUP BY year ORDER BY year;
-- 每年得平均价格 伦敦
------------------ 48ms
SELECT toYear(date) AS year, round(avg(price)) AS price, bar(price, 0, 2000000, 100) FROM uk_price_paid_local WHERE town = 'LONDON' GROUP BY year ORDER BY year;
------------------62ms
SELECT toYear(date) AS year, round(avg(price)) AS price, bar(price, 0, 2000000, 100) FROM uk_price_paid_distributed WHERE town = 'LONDON' GROUP BY year ORDER BY year;
-- 查询昂贵的社区前100 148ms
SELECT
town,
district,
count() AS c,
round(avg(price)) AS price,
bar(price, 0, 5000000, 100)
FROM uk_price_paid_distributed
WHERE date >= '2020-01-01'
GROUP BY
town,
district
HAVING c >= 100
ORDER BY price DESC
LIMIT 100;