From b409892ae5028965a6fe98dde1346594807e6e45 Mon Sep 17 00:00:00 2001
From: Rob Norris <robn@despairlabs.com>
Date: Mon, 27 May 2024 21:32:07 -0400
Subject: [PATCH] Linux 6.10: rework queue limits setup
Linux has started moving to a model where, instead of applying block
queue limits through individual modification functions, a complete
limits structure is built up and applied atomically, either when the
block device is opened or some time afterwards. As of 6.10 this
transition appears to be only partially complete.
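
For illustration, a minimal sketch of the new model (not part of this
patch): a complete queue_limits is filled in up front and handed to the
two-argument blk_alloc_disk(), rather than calling the blk_queue_*()
setters on the queue afterwards. The specific limit values here are
placeholders.

    struct queue_limits lim = {
            .max_hw_sectors   = 2048,       /* 1 MiB in 512-byte sectors */
            .max_segments     = 128,
            .max_segment_size = PAGE_SIZE,
            .io_opt           = 128 * 1024, /* optimal I/O size, bytes */
    };
    struct gendisk *disk = blk_alloc_disk(&lim, NUMA_NO_NODE);
    if (IS_ERR(disk))
            return (PTR_ERR(disk));
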
This commit matches that model within OpenZFS in a way that should work
for past and future kernels. We set up an OpenZFS queue limits structure
holding any limits whose individual modification functions have been
removed. For newer kernels, which can have limits applied when the block
device is opened (HAVE_BLK_ALLOC_DISK_2ARG), a conversion function turns
the OpenZFS queue limits structure into Linux's queue_limits structure,
which can then be passed in. For older kernels, an application function
simply calls the old modification function for each limit in the
structure.
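
Condensed, the flow this patch adds looks like the following (names as
in the diff below; error handling and the non-blk-mq variants elided):

    zvol_queue_limits_t limits;
    zvol_queue_limits_init(&limits, zv, zv->zv_zso->use_blk_mq);

    #ifdef HAVE_BLK_ALLOC_DISK_2ARG
            /* newer kernels: convert, then apply atomically at open */
            struct queue_limits qlimits;
            zvol_queue_limits_convert(&limits, &qlimits);
            struct gendisk *disk = blk_alloc_disk(&qlimits, NUMA_NO_NODE);
    #else
            /* older kernels: replay each limit through the old setters */
            zvol_queue_limits_apply(&limits, zso->zvo_queue);
    #endif
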
Signed-off-by: Rob Norris <robn@despairlabs.com>
Sponsored-by: https://despairlabs.com/sponsor/
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
---
config/kernel-blk-queue.m4 | 4 +-
module/os/linux/zfs/zvol_os.c | 186 +++++++++++++++++++++-------------
2 files changed, 118 insertions(+), 72 deletions(-)
diff --git a/config/kernel-blk-queue.m4 b/config/kernel-blk-queue.m4
index 15dbe1c7dff0..2f0b386e6637 100644
--- a/config/kernel-blk-queue.m4
+++ b/config/kernel-blk-queue.m4
@@ -332,7 +332,7 @@ AC_DEFUN([ZFS_AC_KERNEL_BLK_QUEUE_MAX_HW_SECTORS], [
ZFS_LINUX_TEST_RESULT([blk_queue_max_hw_sectors], [
AC_MSG_RESULT(yes)
],[
- ZFS_LINUX_TEST_ERROR([blk_queue_max_hw_sectors])
+ AC_MSG_RESULT(no)
])
])
@@ -355,7 +355,7 @@ AC_DEFUN([ZFS_AC_KERNEL_BLK_QUEUE_MAX_SEGMENTS], [
ZFS_LINUX_TEST_RESULT([blk_queue_max_segments], [
AC_MSG_RESULT(yes)
], [
- ZFS_LINUX_TEST_ERROR([blk_queue_max_segments])
+ AC_MSG_RESULT(no)
])
])
diff --git a/module/os/linux/zfs/zvol_os.c b/module/os/linux/zfs/zvol_os.c
index 1d5d54b80ea1..c01caa6da8b4 100644
--- a/module/os/linux/zfs/zvol_os.c
+++ b/module/os/linux/zfs/zvol_os.c
@@ -1076,8 +1076,106 @@ static const struct block_device_operations zvol_ops = {
#endif
};
+typedef struct zvol_queue_limits {
+ unsigned int zql_max_hw_sectors;
+ unsigned short zql_max_segments;
+ unsigned int zql_max_segment_size;
+ unsigned int zql_io_opt;
+} zvol_queue_limits_t;
+
+static void
+zvol_queue_limits_init(zvol_queue_limits_t *limits, zvol_state_t *zv,
+ boolean_t use_blk_mq)
+{
+ limits->zql_max_hw_sectors = (DMU_MAX_ACCESS / 4) >> 9;
+
+ if (use_blk_mq) {
+ /*
+ * IO requests can be really big (1MB). When an IO request
+ * comes in, it is passed off to zvol_read() or zvol_write()
+ * in a new thread, where it is chunked up into 'volblocksize'
+ * sized pieces and processed. So for example, if the request
+ * is a 1MB write and your volblocksize is 128k, one zvol_write
+ * thread will take that request and sequentially do ten 128k
+ * IOs. This is due to the fact that the thread needs to lock
+ * each volblocksize sized block. So you might be wondering:
+ * "instead of passing the whole 1MB request to one thread,
+ * why not pass ten individual 128k chunks to ten threads and
+ * process the whole write in parallel?" The short answer is
+ * that there's a sweet spot number of chunks that balances
+ * the greater parallelism with the added overhead of more
+ * threads. The sweet spot can be different depending on if you
+ * have a read or write heavy workload. Writes typically want
+ * high chunk counts while reads typically want lower ones. On
+ * a test pool with 6 NVMe drives in a 3x 2-disk mirror
+ * configuration, with volblocksize=8k, the sweet spot for good
+ * sequential reads and writes was at 8 chunks.
+ */
+
+ /*
+ * Below we tell the kernel how big we want our requests
+ * to be. You would think that blk_queue_io_opt() would be
+ * used to do this since it is used to "set optimal request
+ * size for the queue", but that doesn't seem to do
+ * anything - the kernel still gives you huge requests
+ * with tons of little PAGE_SIZE segments contained within it.
+ *
+ * Knowing that the kernel will just give you PAGE_SIZE segments
+ * no matter what, you can say "ok, I want PAGE_SIZE byte
+ * segments, and I want 'N' of them per request", where N is
+ * the correct number of segments for the volblocksize and
+ * number of chunks you want.
+ */
+#ifdef HAVE_BLK_MQ
+ if (zvol_blk_mq_blocks_per_thread != 0) {
+ unsigned int chunks;
+ chunks = MIN(zvol_blk_mq_blocks_per_thread, UINT16_MAX);
+
+ limits->zql_max_segment_size = PAGE_SIZE;
+ limits->zql_max_segments =
+ (zv->zv_volblocksize * chunks) / PAGE_SIZE;
+ } else {
+ /*
+ * Special case: zvol_blk_mq_blocks_per_thread = 0
+ * Max everything out.
+ */
+ limits->zql_max_segments = UINT16_MAX;
+ limits->zql_max_segment_size = UINT_MAX;
+ }
+ } else {
+#endif
+ limits->zql_max_segments = UINT16_MAX;
+ limits->zql_max_segment_size = UINT_MAX;
+ }
+
+ limits->zql_io_opt = zv->zv_volblocksize;
+}
+
+#ifdef HAVE_BLK_ALLOC_DISK_2ARG
+static void
+zvol_queue_limits_convert(zvol_queue_limits_t *limits,
+ struct queue_limits *qlimits)
+{
+ memset(qlimits, 0, sizeof (struct queue_limits));
+ qlimits->max_hw_sectors = limits->zql_max_hw_sectors;
+ qlimits->max_segments = limits->zql_max_segments;
+ qlimits->max_segment_size = limits->zql_max_segment_size;
+ qlimits->io_opt = limits->zql_io_opt;
+}
+#else
+static void
+zvol_queue_limits_apply(zvol_queue_limits_t *limits,
+ struct request_queue *queue)
+{
+ blk_queue_max_hw_sectors(queue, limits->zql_max_hw_sectors);
+ blk_queue_max_segments(queue, limits->zql_max_segments);
+ blk_queue_max_segment_size(queue, limits->zql_max_segment_size);
+ blk_queue_io_opt(queue, limits->zql_io_opt);
+}
+#endif
+
static int
-zvol_alloc_non_blk_mq(struct zvol_state_os *zso)
+zvol_alloc_non_blk_mq(struct zvol_state_os *zso, zvol_queue_limits_t *limits)
{
#if defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS)
#if defined(HAVE_BLK_ALLOC_DISK)
@@ -1087,8 +1185,11 @@ zvol_alloc_non_blk_mq(struct zvol_state_os *zso)
zso->zvo_disk->minors = ZVOL_MINORS;
zso->zvo_queue = zso->zvo_disk->queue;
+ zvol_queue_limits_apply(limits, zso->zvo_queue);
#elif defined(HAVE_BLK_ALLOC_DISK_2ARG)
- struct gendisk *disk = blk_alloc_disk(NULL, NUMA_NO_NODE);
+ struct queue_limits qlimits;
+ zvol_queue_limits_convert(limits, &qlimits);
+ struct gendisk *disk = blk_alloc_disk(&qlimits, NUMA_NO_NODE);
if (IS_ERR(disk)) {
zso->zvo_disk = NULL;
return (1);
@@ -1109,6 +1210,7 @@ zvol_alloc_non_blk_mq(struct zvol_state_os *zso)
}
zso->zvo_disk->queue = zso->zvo_queue;
+ zvol_queue_limits_apply(limits, zso->zvo_queue);
#endif /* HAVE_BLK_ALLOC_DISK */
#else
zso->zvo_queue = blk_generic_alloc_queue(zvol_request, NUMA_NO_NODE);
@@ -1122,13 +1224,14 @@ zvol_alloc_non_blk_mq(struct zvol_state_os *zso)
}
zso->zvo_disk->queue = zso->zvo_queue;
+ zvol_queue_limits_apply(limits, zso->zvo_queue);
#endif /* HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS */
return (0);
}
static int
-zvol_alloc_blk_mq(zvol_state_t *zv)
+zvol_alloc_blk_mq(zvol_state_t *zv, zvol_queue_limits_t *limits)
{
#ifdef HAVE_BLK_MQ
struct zvol_state_os *zso = zv->zv_zso;
@@ -1144,9 +1247,12 @@ zvol_alloc_blk_mq(zvol_state_t *zv)
return (1);
}
zso->zvo_queue = zso->zvo_disk->queue;
+ zvol_queue_limits_apply(limits, zso->zvo_queue);
zso->zvo_disk->minors = ZVOL_MINORS;
#elif defined(HAVE_BLK_ALLOC_DISK_2ARG)
- struct gendisk *disk = blk_mq_alloc_disk(&zso->tag_set, NULL, zv);
+ struct queue_limits qlimits;
+ zvol_queue_limits_convert(limits, &qlimits);
+ struct gendisk *disk = blk_mq_alloc_disk(&zso->tag_set, &qlimits, zv);
if (IS_ERR(disk)) {
zso->zvo_disk = NULL;
blk_mq_free_tag_set(&zso->tag_set);
@@ -1172,6 +1278,7 @@ zvol_alloc_blk_mq(zvol_state_t *zv)
/* Our queue is now created, assign it to our disk */
zso->zvo_disk->queue = zso->zvo_queue;
+ zvol_queue_limits_apply(limits, zso->zvo_queue);
#endif
#endif
@@ -1211,6 +1318,9 @@ zvol_alloc(dev_t dev, const char *name)
zv->zv_zso->use_blk_mq = zvol_use_blk_mq;
#endif
+ zvol_queue_limits_t limits;
+ zvol_queue_limits_init(&limits, zv, zv->zv_zso->use_blk_mq);
+
/*
* The block layer has 3 interfaces for getting BIOs:
*
@@ -1227,10 +1337,10 @@ zvol_alloc(dev_t dev, const char *name)
* disk and the queue separately. (5.13 kernel or older)
*/
if (zv->zv_zso->use_blk_mq) {
- ret = zvol_alloc_blk_mq(zv);
+ ret = zvol_alloc_blk_mq(zv, &limits);
zso->zvo_disk->fops = &zvol_ops_blk_mq;
} else {
- ret = zvol_alloc_non_blk_mq(zso);
+ ret = zvol_alloc_non_blk_mq(zso, &limits);
zso->zvo_disk->fops = &zvol_ops;
}
if (ret != 0)
@@ -1514,74 +1624,10 @@ zvol_os_create_minor(const char *name)
set_capacity(zv->zv_zso->zvo_disk, zv->zv_volsize >> 9);
- blk_queue_max_hw_sectors(zv->zv_zso->zvo_queue,
- (DMU_MAX_ACCESS / 4) >> 9);
- if (zv->zv_zso->use_blk_mq) {
- /*
- * IO requests can be really big (1MB). When an IO request
- * comes in, it is passed off to zvol_read() or zvol_write()
- * in a new thread, where it is chunked up into 'volblocksize'
- * sized pieces and processed. So for example, if the request
- * is a 1MB write and your volblocksize is 128k, one zvol_write
- * thread will take that request and sequentially do ten 128k
- * IOs. This is due to the fact that the thread needs to lock
- * each volblocksize sized block. So you might be wondering:
- * "instead of passing the whole 1MB request to one thread,
- * why not pass ten individual 128k chunks to ten threads and
- * process the whole write in parallel?" The short answer is
- * that there's a sweet spot number of chunks that balances
- * the greater parallelism with the added overhead of more
- * threads. The sweet spot can be different depending on if you
- * have a read or write heavy workload. Writes typically want
- * high chunk counts while reads typically want lower ones. On
- * a test pool with 6 NVMe drives in a 3x 2-disk mirror
- * configuration, with volblocksize=8k, the sweet spot for good
- * sequential reads and writes was at 8 chunks.
- */
-
- /*
- * Below we tell the kernel how big we want our requests
- * to be. You would think that blk_queue_io_opt() would be
- * used to do this since it is used to "set optimal request
- * size for the queue", but that doesn't seem to do
- * anything - the kernel still gives you huge requests
- * with tons of little PAGE_SIZE segments contained within it.
- *
- * Knowing that the kernel will just give you PAGE_SIZE segments
- * no matter what, you can say "ok, I want PAGE_SIZE byte
- * segments, and I want 'N' of them per request", where N is
- * the correct number of segments for the volblocksize and
- * number of chunks you want.
- */
-#ifdef HAVE_BLK_MQ
- if (zvol_blk_mq_blocks_per_thread != 0) {
- unsigned int chunks;
- chunks = MIN(zvol_blk_mq_blocks_per_thread, UINT16_MAX);
-
- blk_queue_max_segment_size(zv->zv_zso->zvo_queue,
- PAGE_SIZE);
- blk_queue_max_segments(zv->zv_zso->zvo_queue,
- (zv->zv_volblocksize * chunks) / PAGE_SIZE);
- } else {
- /*
- * Special case: zvol_blk_mq_blocks_per_thread = 0
- * Max everything out.
- */
- blk_queue_max_segments(zv->zv_zso->zvo_queue,
- UINT16_MAX);
- blk_queue_max_segment_size(zv->zv_zso->zvo_queue,
- UINT_MAX);
- }
-#endif
- } else {
- blk_queue_max_segments(zv->zv_zso->zvo_queue, UINT16_MAX);
- blk_queue_max_segment_size(zv->zv_zso->zvo_queue, UINT_MAX);
- }
blk_queue_physical_block_size(zv->zv_zso->zvo_queue,
zv->zv_volblocksize);
- blk_queue_io_opt(zv->zv_zso->zvo_queue, zv->zv_volblocksize);
blk_queue_max_discard_sectors(zv->zv_zso->zvo_queue,
(zvol_max_discard_blocks * zv->zv_volblocksize) >> 9);
blk_queue_discard_granularity(zv->zv_zso->zvo_queue,