@@ -49,6 +49,7 @@ struct dax_region {
  * @region - parent region
  * @dev - device backing the character device
  * @kref - enable this data to be tracked in filp->private_data
+ * @alive - !alive + rcu grace period == no new mappings can be established
  * @id - child id in the region
  * @num_resources - number of physical address extents in this device
  * @res - array of physical address ranges
@@ -57,6 +58,7 @@ struct dax_dev {
         struct dax_region *region;
         struct device *dev;
         struct kref kref;
+        bool alive;
         int id;
         int num_resources;
         struct resource res[0];
@@ -150,6 +152,16 @@ static void unregister_dax_dev(void *_dev)
 
         dev_dbg(dev, "%s\n", __func__);
 
+        /*
+         * Note, rcu is not protecting the liveness of dax_dev, rcu is
+         * ensuring that any fault handlers that might have seen
+         * dax_dev->alive == true, have completed.  Any fault handlers
+         * that start after synchronize_rcu() has started will abort
+         * upon seeing dax_dev->alive == false.
+         */
+        dax_dev->alive = false;
+        synchronize_rcu();
+
         get_device(dev);
         device_unregister(dev);
         ida_simple_remove(&dax_region->ida, dax_dev->id);
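The new comment in this hunk is the whole synchronization story for ->alive: the flag is not protected by a lock; fault handlers sample it under rcu_read_lock(), and unregister_dax_dev() clears it and then waits one RCU grace period so that any handler which saw alive == true has finished before the device goes away. A minimal kernel-style sketch of that idiom, with illustrative names that are not part of this patch:

/* illustrative sketch only -- not part of the patch */
#include <linux/rcupdate.h>

struct gate {
        bool alive;             /* cleared once by teardown, read by handlers */
};

/* reader side: the shape of dax_dev_fault() */
static int handler(struct gate *g)
{
        int rc = -ENXIO;

        rcu_read_lock();
        if (g->alive)
                rc = 0;         /* safe to establish a mapping here */
        rcu_read_unlock();
        return rc;
}

/* teardown side: the shape of unregister_dax_dev() */
static void shutdown(struct gate *g)
{
        g->alive = false;
        synchronize_rcu();      /* every handler that saw alive == true is done */
}

RCU here only orders teardown against in-flight faults; object lifetime is still managed by the kref.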
@@ -173,6 +185,7 @@ int devm_create_dax_dev(struct dax_region *dax_region, struct resource *res,
         memcpy(dax_dev->res, res, sizeof(*res) * count);
         dax_dev->num_resources = count;
         kref_init(&dax_dev->kref);
+        dax_dev->alive = true;
         dax_dev->region = dax_region;
         kref_get(&dax_region->kref);
 
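The memcpy() in this hunk copies count resources into the zero-length res[0] array declared at the end of struct dax_dev, so it relies on the earlier allocation (outside this hunk) having reserved that trailing space. A sketch of the sizing pattern being assumed; the helper name is made up for illustration:

/* illustrative sketch only -- the real allocation is outside this hunk */
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/ioport.h>

static struct dax_dev *dax_dev_alloc_sketch(struct resource *res, int count)
{
        struct dax_dev *dax_dev;

        /* one header plus 'count' trailing struct resource entries */
        dax_dev = kzalloc(sizeof(*dax_dev) + sizeof(*res) * count, GFP_KERNEL);
        if (!dax_dev)
                return NULL;

        memcpy(dax_dev->res, res, sizeof(*res) * count);
        dax_dev->num_resources = count;
        return dax_dev;
}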
@@ -217,9 +230,318 @@ int devm_create_dax_dev(struct dax_region *dax_region, struct resource *res,
 }
 EXPORT_SYMBOL_GPL(devm_create_dax_dev);
 
+/* return an unmapped area aligned to the dax region specified alignment */
+static unsigned long dax_dev_get_unmapped_area(struct file *filp,
+                unsigned long addr, unsigned long len, unsigned long pgoff,
+                unsigned long flags)
+{
+        unsigned long off, off_end, off_align, len_align, addr_align, align;
+        struct dax_dev *dax_dev = filp ? filp->private_data : NULL;
+        struct dax_region *dax_region;
+
+        if (!dax_dev || addr)
+                goto out;
+
+        dax_region = dax_dev->region;
+        align = dax_region->align;
+        off = pgoff << PAGE_SHIFT;
+        off_end = off + len;
+        off_align = round_up(off, align);
+
+        if ((off_end <= off_align) || ((off_end - off_align) < align))
+                goto out;
+
+        len_align = len + align;
+        if ((off + len_align) < off)
+                goto out;
+
+        addr_align = current->mm->get_unmapped_area(filp, addr, len_align,
+                        pgoff, flags);
+        if (!IS_ERR_VALUE(addr_align)) {
+                addr_align += (off - addr_align) & (align - 1);
+                return addr_align;
+        }
+ out:
+        return current->mm->get_unmapped_area(filp, addr, len, pgoff, flags);
+}
+
+static int __match_devt(struct device *dev, const void *data)
+{
+        const dev_t *devt = data;
+
+        return dev->devt == *devt;
+}
+
+static struct device *dax_dev_find(dev_t dev_t)
+{
+        return class_find_device(dax_class, NULL, &dev_t, __match_devt);
+}
+
+static int dax_dev_open(struct inode *inode, struct file *filp)
+{
+        struct dax_dev *dax_dev = NULL;
+        struct device *dev;
+
+        dev = dax_dev_find(inode->i_rdev);
+        if (!dev)
+                return -ENXIO;
+
+        device_lock(dev);
+        dax_dev = dev_get_drvdata(dev);
+        if (dax_dev) {
+                dev_dbg(dev, "%s\n", __func__);
+                filp->private_data = dax_dev;
+                kref_get(&dax_dev->kref);
+                inode->i_flags = S_DAX;
+        }
+        device_unlock(dev);
+
+        if (!dax_dev) {
+                put_device(dev);
+                return -ENXIO;
+        }
+        return 0;
+}
+
+static int dax_dev_release(struct inode *inode, struct file *filp)
+{
+        struct dax_dev *dax_dev = filp->private_data;
+        struct device *dev = dax_dev->dev;
+
+        dev_dbg(dax_dev->dev, "%s\n", __func__);
+        dax_dev_put(dax_dev);
+        put_device(dev);
+
+        return 0;
+}
+
+static int check_vma(struct dax_dev *dax_dev, struct vm_area_struct *vma,
+                const char *func)
+{
+        struct dax_region *dax_region = dax_dev->region;
+        struct device *dev = dax_dev->dev;
+        unsigned long mask;
+
+        if (!dax_dev->alive)
+                return -ENXIO;
+
+        /* prevent private / writable mappings from being established */
+        if ((vma->vm_flags & (VM_NORESERVE|VM_SHARED|VM_WRITE)) == VM_WRITE) {
+                dev_info(dev, "%s: %s: fail, attempted private mapping\n",
+                                current->comm, func);
+                return -EINVAL;
+        }
+
+        mask = dax_region->align - 1;
+        if (vma->vm_start & mask || vma->vm_end & mask) {
+                dev_info(dev, "%s: %s: fail, unaligned vma (%#lx - %#lx, %#lx)\n",
+                                current->comm, func, vma->vm_start, vma->vm_end,
+                                mask);
+                return -EINVAL;
+        }
+
+        if ((dax_region->pfn_flags & (PFN_DEV|PFN_MAP)) == PFN_DEV
+                        && (vma->vm_flags & VM_DONTCOPY) == 0) {
+                dev_info(dev, "%s: %s: fail, dax range requires MADV_DONTFORK\n",
+                                current->comm, func);
+                return -EINVAL;
+        }
+
+        if (!vma_is_dax(vma)) {
+                dev_info(dev, "%s: %s: fail, vma is not DAX capable\n",
+                                current->comm, func);
+                return -EINVAL;
+        }
+
+        return 0;
+}
+
+static phys_addr_t pgoff_to_phys(struct dax_dev *dax_dev, pgoff_t pgoff,
+                unsigned long size)
+{
+        struct resource *res;
+        phys_addr_t phys;
+        int i;
+
+        for (i = 0; i < dax_dev->num_resources; i++) {
+                res = &dax_dev->res[i];
+                phys = pgoff * PAGE_SIZE + res->start;
+                if (phys >= res->start && phys <= res->end)
+                        break;
+                pgoff -= PHYS_PFN(resource_size(res));
+        }
+
+        if (i < dax_dev->num_resources) {
+                res = &dax_dev->res[i];
+                if (phys + size - 1 <= res->end)
+                        return phys;
+        }
+
+        return -1;
+}
+
+static int __dax_dev_fault(struct dax_dev *dax_dev, struct vm_area_struct *vma,
+                struct vm_fault *vmf)
+{
+        unsigned long vaddr = (unsigned long) vmf->virtual_address;
+        struct device *dev = dax_dev->dev;
+        struct dax_region *dax_region;
+        int rc = VM_FAULT_SIGBUS;
+        phys_addr_t phys;
+        pfn_t pfn;
+
+        if (check_vma(dax_dev, vma, __func__))
+                return VM_FAULT_SIGBUS;
+
+        dax_region = dax_dev->region;
+        if (dax_region->align > PAGE_SIZE) {
+                dev_dbg(dev, "%s: alignment > fault size\n", __func__);
+                return VM_FAULT_SIGBUS;
+        }
+
+        phys = pgoff_to_phys(dax_dev, vmf->pgoff, PAGE_SIZE);
+        if (phys == -1) {
+                dev_dbg(dev, "%s: phys_to_pgoff(%#lx) failed\n", __func__,
+                                vmf->pgoff);
+                return VM_FAULT_SIGBUS;
+        }
+
+        pfn = phys_to_pfn_t(phys, dax_region->pfn_flags);
+
+        rc = vm_insert_mixed(vma, vaddr, pfn);
+
+        if (rc == -ENOMEM)
+                return VM_FAULT_OOM;
+        if (rc < 0 && rc != -EBUSY)
+                return VM_FAULT_SIGBUS;
+
+        return VM_FAULT_NOPAGE;
+}
+
+static int dax_dev_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+        int rc;
+        struct file *filp = vma->vm_file;
+        struct dax_dev *dax_dev = filp->private_data;
+
+        dev_dbg(dax_dev->dev, "%s: %s: %s (%#lx - %#lx)\n", __func__,
+                        current->comm, (vmf->flags & FAULT_FLAG_WRITE)
+                        ? "write" : "read", vma->vm_start, vma->vm_end);
+        rcu_read_lock();
+        rc = __dax_dev_fault(dax_dev, vma, vmf);
+        rcu_read_unlock();
+
+        return rc;
+}
+
+static int __dax_dev_pmd_fault(struct dax_dev *dax_dev,
+                struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd,
+                unsigned int flags)
+{
+        unsigned long pmd_addr = addr & PMD_MASK;
+        struct device *dev = dax_dev->dev;
+        struct dax_region *dax_region;
+        phys_addr_t phys;
+        pgoff_t pgoff;
+        pfn_t pfn;
+
+        if (check_vma(dax_dev, vma, __func__))
+                return VM_FAULT_SIGBUS;
+
+        dax_region = dax_dev->region;
+        if (dax_region->align > PMD_SIZE) {
+                dev_dbg(dev, "%s: alignment > fault size\n", __func__);
+                return VM_FAULT_SIGBUS;
+        }
+
+        /* dax pmd mappings require pfn_t_devmap() */
+        if ((dax_region->pfn_flags & (PFN_DEV|PFN_MAP)) != (PFN_DEV|PFN_MAP)) {
+                dev_dbg(dev, "%s: alignment > fault size\n", __func__);
+                return VM_FAULT_SIGBUS;
+        }
+
+        pgoff = linear_page_index(vma, pmd_addr);
+        phys = pgoff_to_phys(dax_dev, pgoff, PAGE_SIZE);
+        if (phys == -1) {
+                dev_dbg(dev, "%s: phys_to_pgoff(%#lx) failed\n", __func__,
+                                pgoff);
+                return VM_FAULT_SIGBUS;
+        }
+
+        pfn = phys_to_pfn_t(phys, dax_region->pfn_flags);
+
+        return vmf_insert_pfn_pmd(vma, addr, pmd, pfn,
+                        flags & FAULT_FLAG_WRITE);
+}
+
+static int dax_dev_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
+                pmd_t *pmd, unsigned int flags)
+{
+        int rc;
+        struct file *filp = vma->vm_file;
+        struct dax_dev *dax_dev = filp->private_data;
+
+        dev_dbg(dax_dev->dev, "%s: %s: %s (%#lx - %#lx)\n", __func__,
+                        current->comm, (flags & FAULT_FLAG_WRITE)
+                        ? "write" : "read", vma->vm_start, vma->vm_end);
+
+        rcu_read_lock();
+        rc = __dax_dev_pmd_fault(dax_dev, vma, addr, pmd, flags);
+        rcu_read_unlock();
+
+        return rc;
+}
+
+static void dax_dev_vm_open(struct vm_area_struct *vma)
+{
+        struct file *filp = vma->vm_file;
+        struct dax_dev *dax_dev = filp->private_data;
+
+        dev_dbg(dax_dev->dev, "%s\n", __func__);
+        kref_get(&dax_dev->kref);
+}
+
+static void dax_dev_vm_close(struct vm_area_struct *vma)
+{
+        struct file *filp = vma->vm_file;
+        struct dax_dev *dax_dev = filp->private_data;
+
+        dev_dbg(dax_dev->dev, "%s\n", __func__);
+        dax_dev_put(dax_dev);
+}
+
+static const struct vm_operations_struct dax_dev_vm_ops = {
+        .fault = dax_dev_fault,
+        .pmd_fault = dax_dev_pmd_fault,
+        .open = dax_dev_vm_open,
+        .close = dax_dev_vm_close,
+};
+
+static int dax_dev_mmap(struct file *filp, struct vm_area_struct *vma)
+{
+        struct dax_dev *dax_dev = filp->private_data;
+        int rc;
+
+        dev_dbg(dax_dev->dev, "%s\n", __func__);
+
+        rc = check_vma(dax_dev, vma, __func__);
+        if (rc)
+                return rc;
+
+        kref_get(&dax_dev->kref);
+        vma->vm_ops = &dax_dev_vm_ops;
+        vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE;
+        return 0;
+
+}
+
 static const struct file_operations dax_fops = {
         .llseek = noop_llseek,
         .owner = THIS_MODULE,
+        .open = dax_dev_open,
+        .release = dax_dev_release,
+        .get_unmapped_area = dax_dev_get_unmapped_area,
+        .mmap = dax_dev_mmap,
 };
 
 static int __init dax_init(void)
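Taken together, the new file operations define the userspace contract for the dax character device: mappings must be MAP_SHARED (check_vma() rejects private writable mappings), must begin and end on the region alignment (dax_dev_get_unmapped_area() picks an aligned start address when the requested length allows it), and must use MADV_DONTFORK when the region is PFN_DEV without PFN_MAP. A hedged userspace sketch of such a consumer; the device path and the 2 MiB alignment are assumptions for illustration, not something this patch guarantees:

/* illustrative userspace sketch -- assumes /dev/dax0.0 exists and the
 * region alignment is 2 MiB
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

#define ALIGN_2M        (2UL << 20)

int main(void)
{
        int fd = open("/dev/dax0.0", O_RDWR);
        if (fd < 0) {
                perror("open");
                return EXIT_FAILURE;
        }

        /* MAP_SHARED is mandatory: private writable mappings are rejected */
        void *p = mmap(NULL, ALIGN_2M, PROT_READ | PROT_WRITE,
                       MAP_SHARED, fd, 0);
        if (p == MAP_FAILED) {
                perror("mmap");
                close(fd);
                return EXIT_FAILURE;
        }

        /* required when the region is PFN_DEV without PFN_MAP */
        if (madvise(p, ALIGN_2M, MADV_DONTFORK))
                perror("madvise");

        memset(p, 0, ALIGN_2M); /* loads/stores go straight to device memory */

        munmap(p, ALIGN_2M);
        close(fd);
        return EXIT_SUCCESS;
}

With a 2 MiB aligned mapping over a devmap-capable region, such an access can be satisfied by dax_dev_pmd_fault() with a single PMD entry rather than 512 base-page faults.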