fio on ZFS

These are some fio runs investigating the effects of different ZFS dataset parameters. This page exists as my own reference, since ZFS ‘tuning advice’ is so often copypasta’d without any real evidence.

I am primarily concerned with pure synchronous IOPS, since that workload is the most revealing of the underlying persistent media.

Unless otherwise stated, the pool is a stripe of two generic SanDisk SATA3 SSDs, with primarycache=none, sync=always, and ashift=12.
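
To confirm the baseline is actually in effect, the properties can be read back, assuming a pool named stripe:

sudo zpool get ashift stripe
sudo zfs get sync,primarycache,recordsize stripe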

To ensure consistency and avoid configuration drift, the pool is dropped and recreated for each test.

Some test cases may have base parameters that differ from others, but in each case only a single variable changes.

The test system is a Xeon W-1290 with 4x 2666 MHz DIMMs, TDP capped at 50 W.

ashift

The ‘common knowledge’ here appears true: too small costs a little, too large costs a lot.
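
Choosing ashift starts with the drive’s sector size; lsblk can show what the drives advertise, though consumer SSDs often report 512b regardless of their true flash page size:

lsblk -o NAME,PHY-SEC,LOG-SEC /dev/sdb /dev/sdd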

ashift=9 - 512b

sudo zpool destroy stripe
sudo zpool create stripe /dev/sdb /dev/sdd -o ashift=9
sudo zfs set recordsize=4k sync=always primarycache=none stripe
sudo fio --filename=/stripe/f --size=1GB --direct=1 --rw=randrw --bs=4k --runtime=120 --numjobs=8 --time_based --group_reporting --name=fio | grep IOPS
  read: IOPS=1416, BW=5664KiB/s (5800kB/s)(664MiB/120045msec)
  write: IOPS=1415, BW=5663KiB/s (5799kB/s)(664MiB/120045msec); 0 zone resets

ashift=12 - 4k

sudo zpool destroy stripe
sudo zpool create stripe /dev/sdb /dev/sdd -o ashift=12
sudo zfs set recordsize=4k sync=always primarycache=none stripe
sudo fio --filename=/stripe/f --size=1GB --direct=1 --rw=randrw --bs=4k --runtime=120 --numjobs=8 --time_based --group_reporting --name=fio | grep IOPS
  read: IOPS=1559, BW=6238KiB/s (6388kB/s)(731MiB/120005msec)
  write: IOPS=1559, BW=6237KiB/s (6387kB/s)(731MiB/120005msec); 0 zone resets

ashift=13 - 8k

sudo zpool destroy stripe
sudo zpool create stripe /dev/sdb /dev/sdd -o ashift=13
sudo zfs set recordsize=4k sync=always primarycache=none stripe
sudo fio --filename=/stripe/f --size=1GB --direct=1 --rw=randrw --bs=4k --runtime=120 --numjobs=8 --time_based --group_reporting --name=fio | grep IOPS
  read: IOPS=884, BW=3538KiB/s (3623kB/s)(415MiB/120005msec)
  write: IOPS=884, BW=3538KiB/s (3623kB/s)(415MiB/120005msec); 0 zone resets

ashift=14 - 16k

sudo zpool destroy stripe
sudo zpool create stripe /dev/sdb /dev/sdd -o ashift=14
sudo zfs set recordsize=4k sync=always primarycache=none stripe
sudo fio --filename=/stripe/f --size=1GB --direct=1 --rw=randrw --bs=4k --runtime=120 --numjobs=8 --time_based --group_reporting --name=fio | grep IOPS
  read: IOPS=747, BW=2988KiB/s (3060kB/s)(350MiB/120003msec)
  write: IOPS=747, BW=2990KiB/s (3062kB/s)(350MiB/120003msec); 0 zone resets

recordsize

Ideally recordsize matches the workload’s I/O size, but a mismatch is not a huge deal.
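
recordsize is per-dataset and a change only affects newly written blocks, so mixed workloads can simply get their own datasets; a hypothetical example:

sudo zfs create stripe/db
sudo zfs set recordsize=8k stripe/db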

recordsize=2k

sudo zpool destroy stripe
sudo zpool create stripe /dev/sdb /dev/sdd -o ashift=12
sudo zfs set recordsize=2k sync=always primarycache=none stripe
sudo fio --filename=/stripe/f --size=1GB --direct=1 --rw=randrw --bs=4k --runtime=120 --numjobs=8 --time_based --group_reporting --name=fio | grep IOPS
  read: IOPS=1267, BW=5068KiB/s (5190kB/s)(594MiB/120019msec)
  write: IOPS=1267, BW=5069KiB/s (5191kB/s)(594MiB/120019msec); 0 zone resets

recordsize=4k

sudo zpool destroy stripe
sudo zpool create stripe /dev/sdb /dev/sdd -o ashift=12
sudo zfs set recordsize=4k sync=always primarycache=none stripe
sudo fio --filename=/stripe/f --size=1GB --direct=1 --rw=randrw --bs=4k --runtime=120 --numjobs=8 --time_based --group_reporting --name=fio | grep IOPS
  read: IOPS=1621, BW=6488KiB/s (6643kB/s)(760MiB/120011msec)
  write: IOPS=1622, BW=6490KiB/s (6645kB/s)(761MiB/120011msec); 0 zone resets

recordsize=8k

sudo zpool destroy stripe
sudo zpool create stripe /dev/sdb /dev/sdd -o ashift=12
sudo zfs set recordsize=8k sync=always primarycache=none stripe
sudo fio --filename=/stripe/f --size=1GB --direct=1 --rw=randrw --bs=4k --runtime=120 --numjobs=8 --time_based --group_reporting --name=fio | grep IOPS
  read: IOPS=1219, BW=4876KiB/s (4993kB/s)(571MiB/120002msec)
  write: IOPS=1220, BW=4882KiB/s (5000kB/s)(572MiB/120002msec); 0 zone resets

atime

atime=off. Average: 1562 IOPS across three runs.

sudo zpool destroy stripe
sudo zpool create stripe /dev/sdb /dev/sdd -o ashift=12
sudo zfs set recordsize=4k sync=always primarycache=none atime=off stripe
sudo fio --filename=/stripe/f --size=1GB --direct=1 --rw=randrw --bs=4k --runtime=120 --numjobs=8 --time_based --group_reporting --name=fio | grep IOPS
  read: IOPS=1563, BW=6254KiB/s (6404kB/s)(733MiB/120006msec)
  write: IOPS=1563, BW=6255KiB/s (6405kB/s)(733MiB/120006msec); 0 zone resets
  read: IOPS=1607, BW=6432KiB/s (6586kB/s)(754MiB/120002msec)
  write: IOPS=1607, BW=6431KiB/s (6585kB/s)(754MiB/120002msec); 0 zone resets
  read: IOPS=1516, BW=6064KiB/s (6210kB/s)(711MiB/120009msec)
  write: IOPS=1515, BW=6063KiB/s (6208kB/s)(711MiB/120009msec); 0 zone resets

atime=on. Average: 1548 IOPS across three runs.

sudo zpool destroy stripe
sudo zpool create stripe /dev/sdb /dev/sdd -o ashift=12
sudo zfs set recordsize=4k sync=always primarycache=none atime=on stripe
sudo fio --filename=/stripe/f --size=1GB --direct=1 --rw=randrw --bs=4k --runtime=120 --numjobs=8 --time_based --group_reporting --name=fio | grep IOPS
  read: IOPS=1505, BW=6021KiB/s (6166kB/s)(706MiB/120002msec)
  write: IOPS=1504, BW=6018KiB/s (6163kB/s)(705MiB/120002msec); 0 zone resets
  read: IOPS=1528, BW=6115KiB/s (6262kB/s)(717MiB/120004msec)
  write: IOPS=1528, BW=6114KiB/s (6261kB/s)(716MiB/120004msec); 0 zone resets
  read: IOPS=1610, BW=6443KiB/s (6598kB/s)(755MiB/120001msec)
  write: IOPS=1610, BW=6443KiB/s (6598kB/s)(755MiB/120001msec); 0 zone resets
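
The difference is within run-to-run noise on this hardware. If atime semantics are still wanted, relatime on OpenZFS is a middle ground:

sudo zfs set relatime=on stripe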

stripe / mirror

The mirror comes in at roughly half the IOPS of the stripe, which is expected for writes: every block must be committed to both drives.

stripe

sudo zpool destroy stripe
sudo zpool create stripe /dev/sdb /dev/sdd -o ashift=12
sudo zfs set recordsize=4k sync=always primarycache=none stripe
sudo fio --filename=/stripe/f --size=1GB --direct=1 --rw=randrw --bs=4k --runtime=120 --numjobs=8 --time_based --group_reporting --name=fio | grep IOPS
  read: IOPS=1450, BW=5802KiB/s (5942kB/s)(680MiB/120007msec)
  write: IOPS=1450, BW=5803KiB/s (5943kB/s)(680MiB/120007msec); 0 zone resets

mirror

sudo zpool destroy stripe
sudo zpool create stripe mirror /dev/sdb /dev/sdd -o ashift=12
sudo zfs set recordsize=4k sync=always primarycache=none stripe
sudo fio --filename=/stripe/f --size=1GB --direct=1 --rw=randrw --bs=4k --runtime=120 --numjobs=8 --time_based --group_reporting --name=fio | grep IOPS
  read: IOPS=758, BW=3036KiB/s (3109kB/s)(356MiB/120002msec)
  write: IOPS=759, BW=3039KiB/s (3112kB/s)(356MiB/120002msec); 0 zone resets

slog

single 5400rpm hdd

sudo zpool destroy pool
sudo zpool create pool /dev/sdb -o ashift=12
sudo zfs set recordsize=4k sync=always primarycache=none pool
sudo fio --filename=/pool/f --size=1GB --direct=1 --rw=randrw --bs=4k --runtime=120 --numjobs=8 --time_based --group_reporting --name=fio | grep IOPS
  read: IOPS=35, BW=140KiB/s (143kB/s)(16.4MiB/120028msec)
  write: IOPS=36, BW=146KiB/s (149kB/s)(17.1MiB/120028msec); 0 zone resets

single 5400rpm hdd + single sata ssd slog

Interestingly enough, with the slog the HDD’s activity light clearly shows it flushing for a moment every 5 seconds.
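
That 5-second cadence matches zfs_txg_timeout, the default transaction group commit interval; on Linux the current value can be read from the module parameters:

cat /sys/module/zfs/parameters/zfs_txg_timeout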

sudo zpool destroy pool
sudo zpool create pool /dev/sdb log /dev/sdd -o ashift=12
sudo zfs set recordsize=4k sync=always primarycache=none pool
sudo fio --filename=/pool/f --size=1GB --direct=1 --rw=randrw --bs=4k --runtime=120 --numjobs=8 --time_based --group_reporting --name=fio | grep IOPS
  read: IOPS=92, BW=368KiB/s (377kB/s)(43.2MiB/120063msec)
  write: IOPS=94, BW=377KiB/s (386kB/s)(44.2MiB/120063msec); 0 zone resets

drive write-caching
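
hdparm toggles the drives’ volatile write cache here; running -W with no value reads back the current setting, which is worth verifying before each run:

sudo hdparm -W /dev/sdb /dev/sdd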

single 5400rpm hdd + single sata ssd slog; ‘drive write-caching’ on:

sudo hdparm -W 1 /dev/sdb ; sudo hdparm -W 1 /dev/sdd
sudo zpool destroy pool
sudo zpool create pool /dev/sdb log /dev/sdd -o ashift=12
sudo zfs set recordsize=4k sync=always primarycache=none pool
sudo fio --filename=/pool/f --size=1GB --direct=1 --rw=randrw --bs=4k --runtime=120 --numjobs=8 --time_based --group_reporting --name=fio | grep IOPS
  read: IOPS=98, BW=395KiB/s (405kB/s)(46.3MiB/120063msec)
  write: IOPS=100, BW=404KiB/s (414kB/s)(47.3MiB/120063msec); 0 zone resets

single 5400rpm hdd + single sata ssd slog; ‘drive write-caching’ off:

sudo hdparm -W 0 /dev/sdb ; sudo hdparm -W 0 /dev/sdd
sudo zpool destroy pool
sudo zpool create pool /dev/sdb log /dev/sdd -o ashift=12
sudo zfs set recordsize=4k sync=always primarycache=none pool
sudo fio --filename=/pool/f --size=1GB --direct=1 --rw=randrw --bs=4k --runtime=120 --numjobs=8 --time_based --group_reporting --name=fio | grep IOPS
  read: IOPS=184, BW=737KiB/s (754kB/s)(86.4MiB/120037msec)
  write: IOPS=187, BW=750KiB/s (768kB/s)(87.9MiB/120037msec); 0 zone resets

single 5400rpm hdd; ‘drive write-caching’ on:

sudo hdparm -W 1 /dev/sdb
sudo zpool destroy pool
sudo zpool create pool /dev/sdb -o ashift=12
sudo zfs set recordsize=4k sync=always primarycache=none pool
sudo fio --filename=/pool/f --size=1GB --direct=1 --rw=randrw --bs=4k --runtime=120 --numjobs=8 --time_based --group_reporting --name=fio | grep IOPS
  read: IOPS=35, BW=144KiB/s (147kB/s)(16.9MiB/120050msec)
  write: IOPS=37, BW=150KiB/s (153kB/s)(17.5MiB/120050msec); 0 zone resets

single 5400rpm hdd; ‘drive write-caching’ off:

sudo hdparm -W 0 /dev/sdb
sudo zpool destroy pool
sudo zpool create pool /dev/sdb -o ashift=12
sudo zfs set recordsize=4k sync=always primarycache=none pool
sudo fio --filename=/pool/f --size=1GB --direct=1 --rw=randrw --bs=4k --runtime=120 --numjobs=8 --time_based --group_reporting --name=fio | grep IOPS
  read: IOPS=53, BW=213KiB/s (218kB/s)(24.0MiB/120068msec)
  write: IOPS=55, BW=223KiB/s (229kB/s)(26.2MiB/120068msec); 0 zone resets

single sata3 ssd; ‘drive write-caching’ on:

sudo hdparm -W 1 /dev/sdd
sudo zpool destroy pool
sudo zpool create pool /dev/sdd -o ashift=12
sudo zfs set recordsize=4k sync=always primarycache=none pool
sudo fio --filename=/pool/f --size=1GB --direct=1 --rw=randrw --bs=4k --runtime=120 --numjobs=8 --time_based --group_reporting --name=fio | grep IOPS
  read: IOPS=1726, BW=6905KiB/s (7071kB/s)(809MiB/120008msec)
  write: IOPS=1725, BW=6901KiB/s (7066kB/s)(809MiB/120008msec); 0 zone resets

single sata3 ssd; ‘drive write-caching’ off:

sudo hdparm -W 0 /dev/sdd
sudo zpool destroy pool
sudo zpool create pool /dev/sdd -o ashift=12
sudo zfs set recordsize=4k sync=always primarycache=none pool
sudo fio --filename=/pool/f --size=1GB --direct=1 --rw=randrw --bs=4k --runtime=120 --numjobs=8 --time_based --group_reporting --name=fio | grep IOPS
  read: IOPS=1323, BW=5294KiB/s (5421kB/s)(620MiB/120002msec)
  write: IOPS=1324, BW=5298KiB/s (5425kB/s)(621MiB/120002msec); 0 zone resets

two sata3 ssd stripe; ‘drive write-caching’ on:

sudo hdparm -W 1 /dev/sdb ; sudo hdparm -W 1 /dev/sdd
sudo zpool destroy pool
sudo zpool create pool /dev/sdb /dev/sdd -o ashift=12
sudo zfs set recordsize=4k sync=always primarycache=none pool
sudo fio --filename=/pool/f --size=1GB --direct=1 --rw=randrw --bs=4k --runtime=120 --numjobs=8 --time_based --group_reporting --name=fio | grep IOPS
  read: IOPS=1675, BW=6701KiB/s (6862kB/s)(785MiB/120002msec)
  write: IOPS=1674, BW=6700KiB/s (6861kB/s)(785MiB/120002msec); 0 zone resets

two sata3 ssd stripe; ‘drive write-caching’ off:

sudo hdparm -W 0 /dev/sdb ; sudo hdparm -W 0 /dev/sdd
sudo zpool destroy pool
sudo zpool create pool /dev/sdb /dev/sdd -o ashift=12
sudo zfs set recordsize=4k sync=always primarycache=none pool
sudo fio --filename=/pool/f --size=1GB --direct=1 --rw=randrw --bs=4k --runtime=120 --numjobs=8 --time_based --group_reporting --name=fio | grep IOPS
  read: IOPS=1541, BW=6168KiB/s (6316kB/s)(723MiB/120002msec)
  write: IOPS=1542, BW=6168KiB/s (6316kB/s)(723MiB/120002msec); 0 zone resets

Nathan Hensel

2023-05-04