Skip to content

Commit

Permalink
Implement SA based xattrs
Browse files Browse the repository at this point in the history
The current ZFS implementation stores xattrs on disk using a hidden
directory.  In this directory a file name represents the xattr name
and the file contexts are the xattr binary data.  This approach is
very flexible and allows for arbitrarily large xattrs.  However,
it also suffers from a significant performance penalty.  Accessing
a single xattr can requires up to three disk seeks.

  1) Lookup the dnode object.
  2) Lookup the dnodes's xattr directory object.
  3) Lookup the xattr object in the directory.

To avoid this performance penalty Linux filesystems such as ext3
and xfs try to store the xattr as part of the inode on disk.  When
the xattr is to large to store in the inode then a single external
block is allocated for them.  In practice most xattrs are small
and this approach works well.

The addition of System Attributes (SA) to zfs provides us a clean
way to make this optimization.  When the dataset property 'xattr=sa'
is set then xattrs will be preferentially stored as System Attributes.
This allows tiny xattrs (~100 bytes) to be stored with the dnode and
up to 64k of xattrs to be stored in the spill block.  If additional
xattr space is required, which is unlikely under Linux, they will be
stored using the traditional directory approach.

This optimization results in roughly a 3x performance improvement
when accessing xattrs which brings zfs roughly to parity with ext4
and xfs (see table below).  When multiple xattrs are stored per-file
the performance improvements are even greater because all of the
xattrs stored in the spill block will be cached.

However, by default SA based xattrs are disabled in the Linux port
to maximize compatibility with other implementations.  If you do
enable SA based xattrs then they will not be visible on platforms
which do not support this feature.

----------------------------------------------------------------------
   Time in seconds to get/set one xattr of N bytes on 100,000 files
------+--------------------------------+------------------------------
      |            setxattr            |            getxattr
bytes |  ext4     xfs zfs-dir  zfs-sa  |  ext4     xfs zfs-dir  zfs-sa
------+--------------------------------+------------------------------
1     |  2.33   31.88   21.50    4.57  |  2.35    2.64    6.29    2.43
32    |  2.79   30.68   21.98    4.60  |  2.44    2.59    6.78    2.48
256   |  3.25   31.99   21.36    5.92  |  2.32    2.71    6.22    3.14
1024  |  3.30   32.61   22.83    8.45  |  2.40    2.79    6.24    3.27
4096  |  3.57  317.46   22.52   10.73  |  2.78   28.62    6.90    3.94
16384 |   n/a 2342.39   34.30   19.20  |   n/a   45.44  145.90    7.55
65536 |   n/a 2941.39  128.15  131.32* |   n/a  141.92  256.85  262.12*

Legend:
* ext4      - Stock RHEL6.1 ext4 mounted with '-o user_xattr'.
* xfs       - Stock RHEL6.1 xfs mounted with default options.
* zfs-dir   - Directory based xattrs only.
* zfs-sa    - Prefer SAs but spill in to directories as needed, a
              trailing * indicates overflow in to directories occured.

NOTE: Ext4 supports 4096 bytes of xattr name/value pairs per file.
NOTE: XFS and ZFS have no limit on xattr name/value pairs per file.
NOTE: Linux limits individual name/value pairs to 65536 bytes.
NOTE: All setattr/getattr's were done after dropping the cache.
NOTE: All tests were run against a single hard drive.

Signed-off-by: Brian Behlendorf <[email protected]>
Issue #443
  • Loading branch information
behlendorf committed Nov 28, 2011
1 parent e89236f commit 82a3718
Show file tree
Hide file tree
Showing 12 changed files with 425 additions and 76 deletions.
5 changes: 5 additions & 0 deletions include/sys/fs/zfs.h
Original file line number Diff line number Diff line change
Expand Up @@ -309,6 +309,11 @@ typedef enum {
ZFS_SYNC_DISABLED = 2
} zfs_sync_type_t;

typedef enum {
ZFS_XATTR_OFF = 0,
ZFS_XATTR_DIR = 1,
ZFS_XATTR_SA = 2
} zfs_xattr_type_t;

/*
* On-disk version number.
Expand Down
2 changes: 2 additions & 0 deletions include/sys/sa.h
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,8 @@ int sa_replace_all_by_template_locked(sa_handle_t *, sa_bulk_attr_t *,
boolean_t sa_enabled(objset_t *);
void sa_cache_init(void);
void sa_cache_fini(void);
void *sa_spill_alloc(int);
void sa_spill_free(void *);
int sa_set_sa_object(objset_t *, uint64_t);
int sa_hdrsize(void *);
void sa_handle_lock(sa_handle_t *);
Expand Down
9 changes: 9 additions & 0 deletions include/sys/zfs_sa.h
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@ typedef enum zpl_attr {
ZPL_SYMLINK,
ZPL_SCANSTAMP,
ZPL_DACL_ACES,
ZPL_DXATTR,
ZPL_END
} zpl_attr_t;

Expand Down Expand Up @@ -126,12 +127,20 @@ typedef struct znode_phys {
} znode_phys_t;

#ifdef _KERNEL

#define DXATTR_MAX_ENTRY_SIZE (32768)
#define DXATTR_MAX_SA_SIZE (SPA_MAXBLOCKSIZE >> 1)

int zfs_sa_readlink(struct znode *, uio_t *);
void zfs_sa_symlink(struct znode *, char *link, int len, dmu_tx_t *);
void zfs_sa_get_scanstamp(struct znode *, xvattr_t *);
void zfs_sa_set_scanstamp(struct znode *, xvattr_t *, dmu_tx_t *);
int zfs_sa_get_xattr(struct znode *);
int zfs_sa_set_xattr(struct znode *);
void zfs_sa_upgrade(struct sa_handle *, dmu_tx_t *);
void zfs_sa_upgrade_txholds(dmu_tx_t *, struct znode *);
void zfs_sa_init(void);
void zfs_sa_fini(void);
#endif

#ifdef __cplusplus
Expand Down
1 change: 1 addition & 0 deletions include/sys/zfs_vfsops.h
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@ typedef struct zfs_sb {
boolean_t z_use_fuids; /* version allows fuids */
boolean_t z_replay; /* set during ZIL replay */
boolean_t z_use_sa; /* version allow system attributes */
boolean_t z_xattr_sa; /* allow xattrs to be stores as SA */
uint64_t z_version; /* ZPL version */
uint64_t z_shares_dir; /* hidden shares dir */
kmutex_t z_lock;
Expand Down
3 changes: 3 additions & 0 deletions include/sys/zfs_znode.h
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,7 @@ extern "C" {
#define SA_ZPL_FLAGS(z) z->z_attr_table[ZPL_FLAGS]
#define SA_ZPL_SIZE(z) z->z_attr_table[ZPL_SIZE]
#define SA_ZPL_ZNODE_ACL(z) z->z_attr_table[ZPL_ZNODE_ACL]
#define SA_ZPL_DXATTR(z) z->z_attr_table[ZPL_DXATTR]
#define SA_ZPL_PAD(z) z->z_attr_table[ZPL_PAD]

/*
Expand Down Expand Up @@ -206,6 +207,8 @@ typedef struct znode {
uint32_t z_sync_cnt; /* synchronous open count */
kmutex_t z_acl_lock; /* acl data lock */
zfs_acl_t *z_acl_cached; /* cached acl */
krwlock_t z_xattr_lock; /* xattr data lock */
nvlist_t *z_xattr_cached;/* cached xattrs */
list_node_t z_link_node; /* all znodes in fs link */
sa_handle_t *z_sa_hdl; /* handle to sa data */
boolean_t z_is_sa; /* are we native sa? */
Expand Down
2 changes: 1 addition & 1 deletion module/nvpair/nvpair_alloc_spl.c
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@
static void *
nv_alloc_sleep_spl(nv_alloc_t *nva, size_t size)
{
return (kmem_alloc(size, KM_SLEEP));
return (kmem_alloc(size, KM_SLEEP | KM_NODEBUG));
}

static void *
Expand Down
17 changes: 12 additions & 5 deletions module/zcommon/zfs_prop.c
Original file line number Diff line number Diff line change
Expand Up @@ -186,6 +186,14 @@ zfs_prop_init(void)
{ NULL }
};

static zprop_index_t xattr_table[] = {
{ "off", ZFS_XATTR_OFF },
{ "on", ZFS_XATTR_DIR },
{ "sa", ZFS_XATTR_SA },
{ "dir", ZFS_XATTR_DIR },
{ NULL }
};

/* inherit index properties */
zprop_register_index(ZFS_PROP_SYNC, "sync", ZFS_SYNC_STANDARD,
PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
Expand Down Expand Up @@ -226,6 +234,9 @@ zfs_prop_init(void)
zprop_register_index(ZFS_PROP_LOGBIAS, "logbias", ZFS_LOGBIAS_LATENCY,
PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
"latency | throughput", "LOGBIAS", logbias_table);
zprop_register_index(ZFS_PROP_XATTR, "xattr", ZFS_XATTR_DIR,
PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT,
"on | off | dir | sa", "XATTR", xattr_table);

/* inherit index (boolean) properties */
zprop_register_index(ZFS_PROP_ATIME, "atime", 1, PROP_INHERIT,
Expand All @@ -244,12 +255,8 @@ zfs_prop_init(void)
boolean_table);
zprop_register_index(ZFS_PROP_ZONED, "zoned", 0, PROP_INHERIT,
ZFS_TYPE_FILESYSTEM, "on | off", "ZONED", boolean_table);
zprop_register_index(ZFS_PROP_XATTR, "xattr", 1, PROP_INHERIT,
ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, "on | off", "XATTR",
boolean_table);
zprop_register_index(ZFS_PROP_VSCAN, "vscan", 0, PROP_INHERIT,
ZFS_TYPE_FILESYSTEM, "on | off", "VSCAN",
boolean_table);
ZFS_TYPE_FILESYSTEM, "on | off", "VSCAN", boolean_table);
zprop_register_index(ZFS_PROP_NBMAND, "nbmand", 0, PROP_INHERIT,
ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, "on | off", "NBMAND",
boolean_table);
Expand Down
28 changes: 24 additions & 4 deletions module/zfs/sa.c
Original file line number Diff line number Diff line change
Expand Up @@ -201,6 +201,7 @@ sa_attr_type_t sa_dummy_zpl_layout[] = { 0 };

static int sa_legacy_attr_count = 16;
static kmem_cache_t *sa_cache = NULL;
static kmem_cache_t *spill_cache = NULL;

/*ARGSUSED*/
static int
Expand Down Expand Up @@ -232,13 +233,30 @@ sa_cache_init(void)
sa_cache = kmem_cache_create("sa_cache",
sizeof (sa_handle_t), 0, sa_cache_constructor,
sa_cache_destructor, NULL, NULL, NULL, 0);
spill_cache = kmem_cache_create("spill_cache",
SPA_MAXBLOCKSIZE, 0, NULL, NULL, NULL, NULL, NULL, 0);
}

void
sa_cache_fini(void)
{
if (sa_cache)
kmem_cache_destroy(sa_cache);

if (spill_cache)
kmem_cache_destroy(spill_cache);
}

void *
sa_spill_alloc(int flags)
{
return kmem_cache_alloc(spill_cache, flags);
}

void
sa_spill_free(void *obj)
{
kmem_cache_free(spill_cache, obj);
}

static int
Expand Down Expand Up @@ -1618,7 +1636,7 @@ sa_modify_attrs(sa_handle_t *hdl, sa_attr_type_t newattr,
sa_bulk_attr_t *attr_desc;
void *old_data[2];
int bonus_attr_count = 0;
int bonus_data_size = 0, spill_data_size = 0;
int bonus_data_size = 0;
int spill_attr_count = 0;
int error;
uint16_t length;
Expand Down Expand Up @@ -1648,8 +1666,8 @@ sa_modify_attrs(sa_handle_t *hdl, sa_attr_type_t newattr,
/* Bring spill buffer online if it isn't currently */

if ((error = sa_get_spill(hdl)) == 0) {
spill_data_size = hdl->sa_spill->db_size;
old_data[1] = kmem_alloc(spill_data_size, KM_SLEEP);
ASSERT3U(hdl->sa_spill->db_size, <=, SPA_MAXBLOCKSIZE);
old_data[1] = sa_spill_alloc(KM_SLEEP);
bcopy(hdl->sa_spill->db_data, old_data[1],
hdl->sa_spill->db_size);
spill_attr_count =
Expand Down Expand Up @@ -1729,7 +1747,7 @@ sa_modify_attrs(sa_handle_t *hdl, sa_attr_type_t newattr,
if (old_data[0])
kmem_free(old_data[0], bonus_data_size);
if (old_data[1])
kmem_free(old_data[1], spill_data_size);
sa_spill_free(old_data[1]);
kmem_free(attr_desc, sizeof (sa_bulk_attr_t) * attr_count);

return (error);
Expand Down Expand Up @@ -1998,6 +2016,8 @@ EXPORT_SYMBOL(sa_replace_all_by_template_locked);
EXPORT_SYMBOL(sa_enabled);
EXPORT_SYMBOL(sa_cache_init);
EXPORT_SYMBOL(sa_cache_fini);
EXPORT_SYMBOL(sa_spill_alloc);
EXPORT_SYMBOL(sa_spill_free);
EXPORT_SYMBOL(sa_set_sa_object);
EXPORT_SYMBOL(sa_hdrsize);
EXPORT_SYMBOL(sa_handle_lock);
Expand Down
80 changes: 80 additions & 0 deletions module/zfs/zfs_sa.c
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ sa_attr_reg_t zfs_attr_table[ZPL_END+1] = {
{"ZPL_SYMLINK", 0, SA_UINT8_ARRAY, 0},
{"ZPL_SCANSTAMP", 32, SA_UINT8_ARRAY, 0},
{"ZPL_DACL_ACES", 0, SA_ACL, 0},
{"ZPL_DXATTR", 0, SA_UINT8_ARRAY, 0},
{NULL, 0, 0, 0}
};

Expand Down Expand Up @@ -183,6 +184,83 @@ zfs_sa_set_scanstamp(znode_t *zp, xvattr_t *xvap, dmu_tx_t *tx)
}
}

int
zfs_sa_get_xattr(znode_t *zp)
{
zfs_sb_t *zsb = ZTOZSB(zp);
char *obj;
int size;
int error;

ASSERT(RW_LOCK_HELD(&zp->z_xattr_lock));
ASSERT(!zp->z_xattr_cached);
ASSERT(zp->z_is_sa);

error = sa_size(zp->z_sa_hdl, SA_ZPL_DXATTR(zsb), &size);
if (error) {
if (error == ENOENT)
return nvlist_alloc(&zp->z_xattr_cached,
NV_UNIQUE_NAME, KM_SLEEP);
else
return (error);
}

obj = sa_spill_alloc(KM_SLEEP);

error = sa_lookup(zp->z_sa_hdl, SA_ZPL_DXATTR(zsb), obj, size);
if (error == 0)
error = nvlist_unpack(obj, size, &zp->z_xattr_cached, KM_SLEEP);

sa_spill_free(obj);

return (error);
}

int
zfs_sa_set_xattr(znode_t *zp)
{
zfs_sb_t *zsb = ZTOZSB(zp);
dmu_tx_t *tx;
char *obj;
size_t size;
int error;

ASSERT(RW_WRITE_HELD(&zp->z_xattr_lock));
ASSERT(zp->z_xattr_cached);
ASSERT(zp->z_is_sa);

error = nvlist_size(zp->z_xattr_cached, &size, NV_ENCODE_XDR);
if (error)
goto out;

obj = sa_spill_alloc(KM_SLEEP);

error = nvlist_pack(zp->z_xattr_cached, &obj, &size,
NV_ENCODE_XDR, KM_SLEEP);
if (error)
goto out_free;

tx = dmu_tx_create(zsb->z_os);
dmu_tx_hold_sa_create(tx, size);
dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);

error = dmu_tx_assign(tx, TXG_WAIT);
if (error) {
dmu_tx_abort(tx);
} else {
error = sa_update(zp->z_sa_hdl, SA_ZPL_DXATTR(zsb),
obj, size, tx);
if (error)
dmu_tx_abort(tx);
else
dmu_tx_commit(tx);
}
out_free:
sa_spill_free(obj);
out:
return (error);
}

/*
* I'm not convinced we should do any of this upgrade.
* since the SA code can read both old/new znode formats
Expand Down Expand Up @@ -338,6 +416,8 @@ EXPORT_SYMBOL(zfs_sa_readlink);
EXPORT_SYMBOL(zfs_sa_symlink);
EXPORT_SYMBOL(zfs_sa_get_scanstamp);
EXPORT_SYMBOL(zfs_sa_set_scanstamp);
EXPORT_SYMBOL(zfs_sa_get_xattr);
EXPORT_SYMBOL(zfs_sa_set_xattr);
EXPORT_SYMBOL(zfs_sa_upgrade);
EXPORT_SYMBOL(zfs_sa_upgrade_txholds);

Expand Down
16 changes: 13 additions & 3 deletions module/zfs/zfs_vfsops.c
Original file line number Diff line number Diff line change
Expand Up @@ -140,10 +140,16 @@ xattr_changed_cb(void *arg, uint64_t newval)
{
zfs_sb_t *zsb = arg;

if (newval == TRUE)
zsb->z_flags |= ZSB_XATTR;
else
if (newval == ZFS_XATTR_OFF) {
zsb->z_flags &= ~ZSB_XATTR;
} else {
zsb->z_flags |= ZSB_XATTR;

if (newval == ZFS_XATTR_SA)
zsb->z_xattr_sa = B_TRUE;
else
zsb->z_xattr_sa = B_FALSE;
}
}

static void
Expand Down Expand Up @@ -641,6 +647,10 @@ zfs_sb_create(const char *osname, zfs_sb_t **zsbp)
&sa_obj);
if (error)
goto out;

error = zfs_get_zplprop(os, ZFS_PROP_XATTR, &zval);
if ((error == 0) && (zval == ZFS_XATTR_SA))
zsb->z_xattr_sa = B_TRUE;
} else {
/*
* Pre SA versions file systems should never touch
Expand Down
9 changes: 9 additions & 0 deletions module/zfs/zfs_znode.c
Original file line number Diff line number Diff line change
Expand Up @@ -106,13 +106,15 @@ zfs_znode_cache_constructor(void *buf, void *arg, int kmflags)
rw_init(&zp->z_parent_lock, NULL, RW_DEFAULT, NULL);
rw_init(&zp->z_name_lock, NULL, RW_DEFAULT, NULL);
mutex_init(&zp->z_acl_lock, NULL, MUTEX_DEFAULT, NULL);
rw_init(&zp->z_xattr_lock, NULL, RW_DEFAULT, NULL);

mutex_init(&zp->z_range_lock, NULL, MUTEX_DEFAULT, NULL);
avl_create(&zp->z_range_avl, zfs_range_compare,
sizeof (rl_t), offsetof(rl_t, r_node));

zp->z_dirlocks = NULL;
zp->z_acl_cached = NULL;
zp->z_xattr_cached = NULL;
zp->z_moved = 0;
return (0);
}
Expand All @@ -128,11 +130,13 @@ zfs_znode_cache_destructor(void *buf, void *arg)
rw_destroy(&zp->z_parent_lock);
rw_destroy(&zp->z_name_lock);
mutex_destroy(&zp->z_acl_lock);
rw_destroy(&zp->z_xattr_lock);
avl_destroy(&zp->z_range_avl);
mutex_destroy(&zp->z_range_lock);

ASSERT(zp->z_dirlocks == NULL);
ASSERT(zp->z_acl_cached == NULL);
ASSERT(zp->z_xattr_cached == NULL);
}

void
Expand Down Expand Up @@ -272,6 +276,11 @@ zfs_inode_destroy(struct inode *ip)
zp->z_acl_cached = NULL;
}

if (zp->z_xattr_cached) {
nvlist_free(zp->z_xattr_cached);
zp->z_xattr_cached = NULL;
}

kmem_cache_free(znode_cache, zp);
}

Expand Down
Loading

0 comments on commit 82a3718

Please sign in to comment.