CRUSH stands for "Controlled Replication Under Scalable Hashing" and is one of Ceph's core designs. It is essentially the data distribution algorithm used by a Ceph storage cluster, comparable to the consistent-hashing ring that OpenStack Swift object storage uses to place data.
CRUSH takes a number of inputs (the object identifier, the cluster map and the placement rules) and computes where a client's object data should be stored, which solves the problem of dynamic data distribution. A Ceph client therefore does not have to consult a lookup table to obtain an index and then read the data through it; it simply runs the CRUSH calculation and talks directly to the resulting OSDs for reads and writes. In this way Ceph avoids the single point of failure, performance bottleneck and poor scalability inherent to a centralized lookup table, which is the main reason Ceph offers better scalability, availability and performance than many other distributed storage systems.
Addressing data in Ceph goes through at least the following three mappings (a command sketch follows the list):
- 1. File to Object mapping:
the file's data is sliced into object-sized chunks so that the data can be processed in parallel.
- 2. Object to PG mapping:
each Object produced by the split is assigned to a placement group (PG) by a simple hash.
- 3. PG to OSD mapping:
each PG is mapped by CRUSH onto the actual OSD data disks on the hosts.
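The second and third mappings can be observed directly from the command line. A minimal sketch, assuming a replicated pool named testpool already exists (the pool and object names here are only illustrative):
# Ask the cluster which PG the object name hashes into and which OSDs CRUSH selects for that PG.
ceph osd map testpool myobject
# The output shows the pool id, the PG id derived from hashing the object name,
# and the up/acting OSD set computed by CRUSH for that PG.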
CRUSH provides key features such as configurability, online changes and dynamic data rebalancing. How CRUSH places object data is controlled by the CRUSH Map and can be customized: the CRUSH Map is an abstract configuration of the cluster's physical topology, replication policy and failure domains, and with it data is distributed pseudo-randomly across the cluster's OSDs.
When an OSD fails, a failure domain is usually defined so that recovery does not turn into a failure storm: the replicas of a PG are kept in different failure domains (for example different hosts), so losing a single domain never takes out every copy.
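The hierarchy, rules and failure domain can be inspected without decompiling anything; a quick sketch using standard ceph CLI subcommands:
# Show the bucket hierarchy (root -> host -> osd) that CRUSH walks.
ceph osd crush tree
# List the placement rules and dump one of them; a "chooseleaf ... type host" step
# means the failure domain is the host, i.e. replicas never share a host.
ceph osd crush rule ls
ceph osd crush rule dump replicated_rule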
[root@ceph141 ~]# ceph osd tree
ID CLASS WEIGHT TYPE NAME STATUS REWEIGHT PRI-AFF
-1 5.34389 root default
-3 1.78130 host ceph141
0 hdd 0.29300 osd.0 up 1.00000 1.00000
1 hdd 0.48830 osd.1 up 1.00000 1.00000
2 hdd 1.00000 osd.2 up 1.00000 1.00000
-5 1.78130 host ceph142
3 hdd 0.29300 osd.3 up 1.00000 1.00000
4 hdd 0.48830 osd.4 up 1.00000 1.00000
5 hdd 1.00000 osd.5 up 1.00000 1.00000
-7 1.78130 host ceph143
6 hdd 0.29300 osd.6 up 1.00000 1.00000
7 hdd 0.48830 osd.7 up 1.00000 1.00000
8 hdd 1.00000 osd.8 up 1.00000 1.00000
[root@ceph141 ~]#
[root@ceph141 ~]# ceph osd getcrushmap -o lax-hdd.file
25
[root@ceph141 ~]#
[root@ceph141 ~]# file lax-hdd.file
lax-hdd.file: data
[root@ceph141 ~]#
[root@ceph141 ~]# ll lax-hdd.file
-rw-r--r-- 1 root root 1173 Apr 3 17:35 lax-hdd.file
[root@ceph141 ~]#
[root@ceph141 ~]#
[root@ceph141 ~]# apt -y install ceph-base
[root@ceph141 ~]#
[root@ceph141 ~]# crushtool -d lax-hdd.file -o lax-hdd-ssd.file
[root@ceph141 ~]#
[root@ceph141 ~]# file lax-hdd*
lax-hdd.file: data
lax-hdd-ssd.file: ASCII text
[root@ceph141 ~]#
[root@ceph141 ~]# cat lax-hdd-ssd.file
# begin crush map
tunable choose_local_tries 0
tunable choose_local_fallback_tries 0
tunable choose_total_tries 50
tunable chooseleaf_descend_once 1
tunable chooseleaf_vary_r 1
tunable chooseleaf_stable 1
tunable straw_calc_version 1
tunable allowed_bucket_algs 54
# devices
device 0 osd.0 class hdd
device 1 osd.1 class hdd
device 2 osd.2 class hdd
device 3 osd.3 class hdd
device 4 osd.4 class hdd
device 5 osd.5 class hdd
device 6 osd.6 class hdd
device 7 osd.7 class hdd
device 8 osd.8 class hdd
# types
type 0 osd
type 1 host
type 2 chassis
type 3 rack
type 4 row
type 5 pdu
type 6 pod
type 7 room
type 8 datacenter
type 9 zone
type 10 region
type 11 root
# buckets
host ceph141 {
id -3 # do not change unnecessarily
id -4 class hdd # do not change unnecessarily
# weight 1.78130
alg straw2
hash 0 # rjenkins1
item osd.0 weight 0.29300
item osd.1 weight 0.48830
item osd.2 weight 1.00000
}
host ceph142 {
id -5 # do not change unnecessarily
id -6 class hdd # do not change unnecessarily
# weight 1.78130
alg straw2
hash 0 # rjenkins1
item osd.3 weight 0.29300
item osd.4 weight 0.48830
item osd.5 weight 1.00000
}
host ceph143 {
id -7 # do not change unnecessarily
id -8 class hdd # do not change unnecessarily
# weight 1.78130
alg straw2
hash 0 # rjenkins1
item osd.6 weight 0.29300
item osd.7 weight 0.48830
item osd.8 weight 1.00000
}
root default {
id -1 # do not change unnecessarily
id -2 class hdd # do not change unnecessarily
# weight 5.34389
alg straw2
hash 0 # rjenkins1
item ceph141 weight 1.78130
item ceph142 weight 1.78130
item ceph143 weight 1.78130
}
# rules
rule replicated_rule {
id 0
type replicated
step take default
step chooseleaf firstn 0 type host
step emit
}
rule erasure-code {
id 1
type erasure
step set_chooseleaf_tries 5
step set_choose_tries 100
step take default
step chooseleaf indep 0 type host
step emit
}
# end crush map
[root@ceph141 ~]#

The decompiled map was then edited by hand (for example with vim, not shown here) to add three single-OSD host buckets (ceph141-hdd, ceph142-hdd, ceph143-hdd), a new root bucket ceph-hdd that groups them, and a new replicated rule violet_hdd_rule whose "step take" starts from ceph-hdd. After the edit the file looks like this:
[root@ceph141 ~]# cat lax-hdd-ssd.file
# begin crush map
tunable choose_local_tries 0
tunable choose_local_fallback_tries 0
tunable choose_total_tries 50
tunable chooseleaf_descend_once 1
tunable chooseleaf_vary_r 1
tunable chooseleaf_stable 1
tunable straw_calc_version 1
tunable allowed_bucket_algs 54
# devices
device 0 osd.0 class hdd
device 1 osd.1 class hdd
device 2 osd.2 class hdd
device 3 osd.3 class hdd
device 4 osd.4 class hdd
device 5 osd.5 class hdd
device 6 osd.6 class hdd
device 7 osd.7 class hdd
device 8 osd.8 class hdd
# types
type 0 osd
type 1 host
type 2 chassis
type 3 rack
type 4 row
type 5 pdu
type 6 pod
type 7 room
type 8 datacenter
type 9 zone
type 10 region
type 11 root
# buckets
host ceph141 {
id -3 # do not change unnecessarily
id -4 class hdd # do not change unnecessarily
# weight 1.78130
alg straw2
hash 0 # rjenkins1
item osd.0 weight 0.29300
item osd.1 weight 0.48830
item osd.2 weight 1.00000
}
host ceph142 {
id -5 # do not change unnecessarily
id -6 class hdd # do not change unnecessarily
# weight 1.78130
alg straw2
hash 0 # rjenkins1
item osd.3 weight 0.29300
item osd.4 weight 0.48830
item osd.5 weight 1.00000
}
host ceph143 {
id -7 # do not change unnecessarily
id -8 class hdd # do not change unnecessarily
# weight 1.78130
alg straw2
hash 0 # rjenkins1
item osd.6 weight 0.29300
item osd.7 weight 0.48830
item osd.8 weight 1.00000
}
host ceph141-hdd {
id -13 # do not change unnecessarily
id -14 class hdd # do not change unnecessarily
# weight 1.75789
alg straw2
hash 0 # rjenkins1
item osd.2 weight 0.97659
}
host ceph142-hdd {
id -15 # do not change unnecessarily
id -16 class hdd # do not change unnecessarily
# weight 2.75789
alg straw2
hash 0 # rjenkins1
item osd.5 weight 0.97659
}
host ceph143-hdd {
id -17 # do not change unnecessarily
id -18 class hdd # do not change unnecessarily
# weight 1.75789
alg straw2
hash 0 # rjenkins1
item osd.8 weight 0.97659
}
root default {
id -1 # do not change unnecessarily
id -2 class hdd # do not change unnecessarily
# weight 5.34389
alg straw2
hash 0 # rjenkins1
item ceph141 weight 1.78130
item ceph142 weight 1.78130
item ceph143 weight 1.78130
}
root ceph-hdd {
id -11 # do not change unnecessarily
id -12 class hdd # do not change unnecessarily
# weight 6.27367
alg straw2
hash 0 # rjenkins1
item ceph141-hdd weight 1.75789
item ceph142-hdd weight 2.75789
item ceph143-hdd weight 1.75789
}
# rules
rule replicated_rule {
id 0
type replicated
step take default
step chooseleaf firstn 0 type host
step emit
}
rule erasure-code {
id 1
type erasure
step set_chooseleaf_tries 5
step set_choose_tries 100
step take default
step chooseleaf indep 0 type host
step emit
}
rule violet_hdd_rule {
id 10
type replicated
step take ceph-hdd
step chooseleaf firstn 0 type host
step emit
}
# end crush map
[root@ceph141 ~]#
[root@ceph141 ~]# crushtool -c lax-hdd-ssd.file -o lax-hdd-ssd.crushmap
[root@ceph141 ~]#
[root@ceph141 ~]# file lax-hdd-ssd.crushmap
lax-hdd-ssd.crushmap: data
[root@ceph141 ~]#
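Before injecting the new map, crushtool can simulate placements offline to confirm that the new rule only ever selects osd.2, osd.5 and osd.8. A sketch (rule id 10 and 3 replicas taken from the map above):
# Simulate many placement inputs against rule 10 with 3 replicas.
crushtool -i lax-hdd-ssd.crushmap --test --rule 10 --num-rep 3 --show-mappings | head
# Summarize how the simulated data would spread across the OSDs.
crushtool -i lax-hdd-ssd.crushmap --test --rule 10 --num-rep 3 --show-utilization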
[root@ceph141 ~]# ceph osd setcrushmap -i lax-hdd-ssd.crushmap
26
[root@ceph141 ~]#
[root@ceph141 ~]# ceph osd tree
ID CLASS WEIGHT TYPE NAME STATUS REWEIGHT PRI-AFF
-11 6.27367 root ceph-hdd
-13 1.75789 host ceph141-hdd
2 hdd 0.97658 osd.2 up 1.00000 1.00000
-15 2.75789 host ceph142-hdd
5 hdd 0.97658 osd.5 up 1.00000 1.00000
-17 1.75789 host ceph143-hdd
8 hdd 0.97658 osd.8 up 1.00000 1.00000
-1 5.34389 root default
-3 1.78130 host ceph141
0 hdd 0.29300 osd.0 up 1.00000 1.00000
1 hdd 0.48830 osd.1 up 1.00000 1.00000
2 hdd 1.00000 osd.2 up 1.00000 1.00000
-5 1.78130 host ceph142
3 hdd 0.29300 osd.3 up 1.00000 1.00000
4 hdd 0.48830 osd.4 up 1.00000 1.00000
5 hdd 1.00000 osd.5 up 1.00000 1.00000
-7 1.78130 host ceph143
6 hdd 0.29300 osd.6 up 1.00000 1.00000
7 hdd 0.48830 osd.7 up 1.00000 1.00000
8 hdd 1.00000 osd.8 up 1.00000 1.00000
[root@ceph141 ~]#
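Optionally, the loaded hierarchy (including the per-device-class shadow trees) and the new rule can be reviewed with:
# Show the CRUSH hierarchy together with the device-class shadow buckets.
ceph osd crush tree --show-shadow
ceph osd crush rule ls     # violet_hdd_rule should now be listed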
9.1 Create a storage pool that uses the new rule
[root@ceph141 ~]# ceph osd pool create violet-hdd 8 8 replicated violet_hdd_rule
pool 'violet-hdd' created
[root@ceph141 ~]#
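The rule does not have to be chosen at pool creation time; it can also be attached to an existing pool. A sketch (the pool name here is illustrative):
# Switch an existing pool onto the new rule; Ceph will rebalance its PGs accordingly.
ceph osd pool set some-existing-pool crush_rule violet_hdd_rule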
9.2 Check the CRUSH rule bound to the pool (violet-hdd uses crush_rule 10, i.e. the violet_hdd_rule defined in the map above)
[root@ceph141 ~]# ceph osd pool ls detail | grep hdd
pool 21 'violet-hdd' replicated size 3 min_size 2 crush_rule 10 object_hash rjenkins pg_num 8 pgp_num 8 autoscale_mode on last_change 1270 flags hashpspool stripe_width 0 read_balance_score 1.50
[root@ceph141 ~]#
9.3 Check which OSDs the pool's PGs map to; they should be limited to osd.2, osd.5 and osd.8 (3 disks in total, the ones under root ceph-hdd)
[root@ceph141 ~]# ceph pg ls-by-pool violet-hdd
PG OBJECTS DEGRADED MISPLACED UNFOUND BYTES OMAP_BYTES* OMAP_KEYS* LOG LOG_DUPS STATE SINCE VERSION REPORTED UP ACTING SCRUB_STAMP DEEP_SCRUB_STAMP LAST_SCRUB_DURATION SCRUB_SCHEDULING
21.0 0 0 0 0 0 0 0 0 0 active+clean 43s 0'0 1270:13 [5,2,8]p5 [5,2,8]p5 2025-04-03T09:50:24.897815+0000 2025-04-03T09:50:24.897815+0000 0 periodic scrub scheduled @ 2025-04-04T19:04:34.938020+0000
21.1 0 0 0 0 0 0 0 0 0 active+clean 43s 0'0 1270:13 [2,5,8]p2 [2,5,8]p2 2025-04-03T09:50:24.897815+0000 2025-04-03T09:50:24.897815+0000 0 periodic scrub scheduled @ 2025-04-04T12:34:40.325695+0000
21.2 0 0 0 0 0 0 0 0 0 active+clean 43s 0'0 1270:13 [5,8,2]p5 [5,8,2]p5 2025-04-03T09:50:24.897815+0000 2025-04-03T09:50:24.897815+0000 0 periodic scrub scheduled @ 2025-04-04T16:51:12.557603+0000
21.3 0 0 0 0 0 0 0 0 0 active+clean 43s 0'0 1270:13 [5,2,8]p5 [5,2,8]p5 2025-04-03T09:50:24.897815+0000 2025-04-03T09:50:24.897815+0000 0 periodic scrub scheduled @ 2025-04-04T21:22:02.905517+0000
21.4 0 0 0 0 0 0 0 0 0 active+clean 43s 0'0 1270:13 [2,8,5]p2 [2,8,5]p2 2025-04-03T09:50:24.897815+0000 2025-04-03T09:50:24.897815+0000 0 periodic scrub scheduled @ 2025-04-04T14:12:50.656110+0000
21.5 0 0 0 0 0 0 0 0 0 active+clean 43s 0'0 1270:13 [5,8,2]p5 [5,8,2]p5 2025-04-03T09:50:24.897815+0000 2025-04-03T09:50:24.897815+0000 0 periodic scrub scheduled @ 2025-04-04T11:01:59.247146+0000
21.6 0 0 0 0 0 0 0 0 0 active+clean 43s 0'0 1270:13 [8,5,2]p8 [8,5,2]p8 2025-04-03T09:50:24.897815+0000 2025-04-03T09:50:24.897815+0000 0 periodic scrub scheduled @ 2025-04-04T16:40:00.911357+0000
21.7 0 0 0 0 0 0 0 0 0 active+clean 43s 0'0 1270:13 [2,5,8]p2 [2,5,8]p2 2025-04-03T09:50:24.897815+0000 2025-04-03T09:50:24.897815+0000 0 periodic scrub scheduled @ 2025-04-04T10:00:09.178814+0000
* NOTE: Omap statistics are gathered during deep scrub and may be inaccurate soon afterwards depending on utilization. See http://docs.ceph.com/en/latest/dev/placement-group/#omap-statistics for further details.
[root@ceph141 ~]#
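Individual PGs can also be queried; the up and acting sets should always stay within osd.2, osd.5 and osd.8:
# Show the mapping of a single PG from the pool above.
ceph pg map 21.0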
10.1 Write test data
[root@ceph141 ~]# ceph df
--- RAW STORAGE ---
CLASS SIZE AVAIL USED RAW USED %RAW USED
hdd 5.3 TiB 5.3 TiB 90 GiB 90 GiB 1.65
TOTAL 5.3 TiB 5.3 TiB 90 GiB 90 GiB 1.65
--- POOLS ---
POOL ID PGS STORED OBJECTS USED %USED MAX AVAIL
...
violet-hdd 21 8 0 B 0 0 B 0 957 GiB
[root@ceph141 ~]#
[root@ceph141 ~]# ll -h /etc/hosts
-rw-r--r-- 1 root root 312 Apr 3 11:50 /etc/hosts
[root@ceph141 ~]#
[root@ceph141 ~]#
[root@ceph141 ~]# rados put myhosts /etc/hosts -p violet-hdd
[root@ceph141 ~]#
[root@ceph141 ~]# ceph df
--- RAW STORAGE ---
CLASS SIZE AVAIL USED RAW USED %RAW USED
hdd 5.3 TiB 5.3 TiB 90 GiB 90 GiB 1.65
TOTAL 5.3 TiB 5.3 TiB 90 GiB 90 GiB 1.65
--- POOLS ---
POOL ID PGS STORED OBJECTS USED %USED MAX AVAIL
...
violet-hdd 21 8 312 B 1 12 KiB 0 957 GiB
[root@ceph141 ~]#
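To confirm that the object really landed in the pool and is readable, a short sketch:
# List the objects in the pool and read the test object back.
rados -p violet-hdd ls                         # should print "myhosts"
rados -p violet-hdd get myhosts /tmp/myhosts
diff /etc/hosts /tmp/myhosts && echo "content matches"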
10.2 Check the object-to-OSD mapping
[root@ceph141 ~]# ceph osd map violet-hdd myhosts
osdmap e1270 pool 'violet-hdd' (21) object 'myhosts' -> pg 21.5a39779b (21.3) -> up ([5,2,8], p5) acting ([5,2,8], p5)
[root@ceph141 ~]#
Recommended reading:
https://www.cnblogs.com/lax/p/18403765
- Scaling down (removing) OSDs in a Ceph cluster
Reference:
https://www.cnblogs.com/lax/p/18370804