ext4: return 32/64-bit dir name hash according to usage type
author Fan Yong <yong.fan@whamcloud.com>
Mon, 19 Mar 2012 02:44:40 +0000 (22:44 -0400)
committer Theodore Ts'o <tytso@mit.edu>
Mon, 19 Mar 2012 02:44:40 +0000 (22:44 -0400)
Traditionally ext2/3/4 has returned a 32-bit hash value from llseek()
to appease NFSv2, which can only handle a 32-bit cookie for seekdir()
and telldir().  However, this causes problems if there are 32-bit hash
collisions, since the NFSv2 server can get stuck resending the same
entries from the directory repeatedly.

Allow ext4 to return a full 64-bit hash (both major and minor) for
telldir to decrease the chance of hash collisions.  This still needs
integration on the NFS side.

Patch-updated-by: Bernd Schubert <bernd.schubert@itwm.fraunhofer.de>
(blame me if something is not correct)

Signed-off-by: Fan Yong <yong.fan@whamcloud.com>
Signed-off-by: Andreas Dilger <adilger@whamcloud.com>
Signed-off-by: Bernd Schubert <bernd.schubert@itwm.fraunhofer.de>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
fs/ext4/dir.c
fs/ext4/ext4.h
fs/ext4/hash.c

index 164c56092e5865a99238893c5717efc60a7aea4e..689d1b1a3f457ebde95ca5591a0081f66418d3c2 100644 (file)
@@ -32,24 +32,8 @@ static unsigned char ext4_filetype_table[] = {
        DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
 };
 
-static int ext4_readdir(struct file *, void *, filldir_t);
 static int ext4_dx_readdir(struct file *filp,
                           void *dirent, filldir_t filldir);
-static int ext4_release_dir(struct inode *inode,
-                               struct file *filp);
-
-const struct file_operations ext4_dir_operations = {
-       .llseek         = ext4_llseek,
-       .read           = generic_read_dir,
-       .readdir        = ext4_readdir,         /* we take BKL. needed?*/
-       .unlocked_ioctl = ext4_ioctl,
-#ifdef CONFIG_COMPAT
-       .compat_ioctl   = ext4_compat_ioctl,
-#endif
-       .fsync          = ext4_sync_file,
-       .release        = ext4_release_dir,
-};
-
 
 static unsigned char get_dtype(struct super_block *sb, int filetype)
 {
@@ -60,6 +44,26 @@ static unsigned char get_dtype(struct super_block *sb, int filetype)
        return (ext4_filetype_table[filetype]);
 }
 
+/**
+ * Check if the given dir-inode refers to an htree-indexed directory
+ * (or a directory which could potentially get converted to use htree
+ * indexing).
+ *
+ * Return 1 if it is a dx dir, 0 if not
+ */
+static int is_dx_dir(struct inode *inode)
+{
+       struct super_block *sb = inode->i_sb;
+
+       if (EXT4_HAS_COMPAT_FEATURE(inode->i_sb,
+                    EXT4_FEATURE_COMPAT_DIR_INDEX) &&
+           ((ext4_test_inode_flag(inode, EXT4_INODE_INDEX)) ||
+            ((inode->i_size >> sb->s_blocksize_bits) == 1)))
+               return 1;
+
+       return 0;
+}
+
 /*
  * Return 0 if the directory entry is OK, and 1 if there is a problem
  *
@@ -115,18 +119,13 @@ static int ext4_readdir(struct file *filp,
        unsigned int offset;
        int i, stored;
        struct ext4_dir_entry_2 *de;
-       struct super_block *sb;
        int err;
        struct inode *inode = filp->f_path.dentry->d_inode;
+       struct super_block *sb = inode->i_sb;
        int ret = 0;
        int dir_has_error = 0;
 
-       sb = inode->i_sb;
-
-       if (EXT4_HAS_COMPAT_FEATURE(inode->i_sb,
-                                   EXT4_FEATURE_COMPAT_DIR_INDEX) &&
-           ((ext4_test_inode_flag(inode, EXT4_INODE_INDEX)) ||
-            ((inode->i_size >> sb->s_blocksize_bits) == 1))) {
+       if (is_dx_dir(inode)) {
                err = ext4_dx_readdir(filp, dirent, filldir);
                if (err != ERR_BAD_DX_DIR) {
                        ret = err;
@@ -254,22 +253,134 @@ out:
        return ret;
 }
 
+static inline int is_32bit_api(void)
+{
+#ifdef CONFIG_COMPAT
+       return is_compat_task();
+#else
+       return (BITS_PER_LONG == 32);
+#endif
+}
+
 /*
  * These functions convert from the major/minor hash to an f_pos
- * value.
+ * value for dx directories
+ *
+ * Upper layers (for example NFS) should specify FMODE_32BITHASH or
+ * FMODE_64BITHASH explicitly. On the other hand, ext4 may also be mounted
+ * directly on both 32-bit and 64-bit nodes; in that case neither
+ * FMODE_32BITHASH nor FMODE_64BITHASH is specified.
+ */
+static inline loff_t hash2pos(struct file *filp, __u32 major, __u32 minor)
+{
+       if ((filp->f_mode & FMODE_32BITHASH) ||
+           (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api()))
+               return major >> 1;
+       else
+               return ((__u64)(major >> 1) << 32) | (__u64)minor;
+}
+
+static inline __u32 pos2maj_hash(struct file *filp, loff_t pos)
+{
+       if ((filp->f_mode & FMODE_32BITHASH) ||
+           (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api()))
+               return (pos << 1) & 0xffffffff;
+       else
+               return ((pos >> 32) << 1) & 0xffffffff;
+}
+
+static inline __u32 pos2min_hash(struct file *filp, loff_t pos)
+{
+       if ((filp->f_mode & FMODE_32BITHASH) ||
+           (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api()))
+               return 0;
+       else
+               return pos & 0xffffffff;
+}
+
+/*
+ * Return 32- or 64-bit end-of-file for dx directories
+ */
+static inline loff_t ext4_get_htree_eof(struct file *filp)
+{
+       if ((filp->f_mode & FMODE_32BITHASH) ||
+           (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api()))
+               return EXT4_HTREE_EOF_32BIT;
+       else
+               return EXT4_HTREE_EOF_64BIT;
+}
+
+
+/*
+ * ext4_dir_llseek() based on generic_file_llseek() to handle both
+ * non-htree and htree directories, where the "offset" is in terms
+ * of the filename hash value instead of the byte offset.
  *
- * Currently we only use major hash numer.  This is unfortunate, but
- * on 32-bit machines, the same VFS interface is used for lseek and
- * llseek, so if we use the 64 bit offset, then the 32-bit versions of
- * lseek/telldir/seekdir will blow out spectacularly, and from within
- * the ext2 low-level routine, we don't know if we're being called by
- * a 64-bit version of the system call or the 32-bit version of the
- * system call.  Worse yet, NFSv2 only allows for a 32-bit readdir
- * cookie.  Sigh.
+ * NOTE: offsets obtained *before* ext4_set_inode_flag(dir, EXT4_INODE_INDEX)
+ *       become invalid once the directory is converted into a dx directory
  */
-#define hash2pos(major, minor) (major >> 1)
-#define pos2maj_hash(pos)      ((pos << 1) & 0xffffffff)
-#define pos2min_hash(pos)      (0)
+loff_t ext4_dir_llseek(struct file *file, loff_t offset, int origin)
+{
+       struct inode *inode = file->f_mapping->host;
+       loff_t ret = -EINVAL;
+       int dx_dir = is_dx_dir(inode);
+
+       mutex_lock(&inode->i_mutex);
+
+       /* NOTE: relative offsets with dx directories might not work
+        *       as expected, as it is difficult to figure out the
+        *       correct offset between dx hashes */
+
+       switch (origin) {
+       case SEEK_END:
+               if (unlikely(offset > 0))
+                       goto out_err; /* not supported for directories */
+
+               /* so only negative offsets are left, does that have a
+                * meaning for directories at all? */
+               if (dx_dir)
+                       offset += ext4_get_htree_eof(file);
+               else
+                       offset += inode->i_size;
+               break;
+       case SEEK_CUR:
+               /*
+                * Here we special-case the lseek(fd, 0, SEEK_CUR)
+                * position-querying operation.  Avoid rewriting the "same"
+                * f_pos value back to the file because a concurrent read(),
+                * write() or lseek() might have altered it
+                */
+               if (offset == 0) {
+                       offset = file->f_pos;
+                       goto out_ok;
+               }
+
+               offset += file->f_pos;
+               break;
+       }
+
+       if (unlikely(offset < 0))
+               goto out_err;
+
+       if (!dx_dir) {
+               if (offset > inode->i_sb->s_maxbytes)
+                       goto out_err;
+       } else if (offset > ext4_get_htree_eof(file))
+               goto out_err;
+
+       /* Special lock needed here? */
+       if (offset != file->f_pos) {
+               file->f_pos = offset;
+               file->f_version = 0;
+       }
+
+out_ok:
+       ret = offset;
+out_err:
+       mutex_unlock(&inode->i_mutex);
+
+       return ret;
+}
 
 /*
  * This structure holds the nodes of the red-black tree used to store
@@ -330,15 +441,16 @@ static void free_rb_tree_fname(struct rb_root *root)
 }
 
 
-static struct dir_private_info *ext4_htree_create_dir_info(loff_t pos)
+static struct dir_private_info *ext4_htree_create_dir_info(struct file *filp,
+                                                          loff_t pos)
 {
        struct dir_private_info *p;
 
        p = kzalloc(sizeof(struct dir_private_info), GFP_KERNEL);
        if (!p)
                return NULL;
-       p->curr_hash = pos2maj_hash(pos);
-       p->curr_minor_hash = pos2min_hash(pos);
+       p->curr_hash = pos2maj_hash(filp, pos);
+       p->curr_minor_hash = pos2min_hash(filp, pos);
        return p;
 }
 
@@ -429,7 +541,7 @@ static int call_filldir(struct file *filp, void *dirent,
                       "null fname?!?\n");
                return 0;
        }
-       curr_pos = hash2pos(fname->hash, fname->minor_hash);
+       curr_pos = hash2pos(filp, fname->hash, fname->minor_hash);
        while (fname) {
                error = filldir(dirent, fname->name,
                                fname->name_len, curr_pos,
@@ -454,13 +566,13 @@ static int ext4_dx_readdir(struct file *filp,
        int     ret;
 
        if (!info) {
-               info = ext4_htree_create_dir_info(filp->f_pos);
+               info = ext4_htree_create_dir_info(filp, filp->f_pos);
                if (!info)
                        return -ENOMEM;
                filp->private_data = info;
        }
 
-       if (filp->f_pos == EXT4_HTREE_EOF)
+       if (filp->f_pos == ext4_get_htree_eof(filp))
                return 0;       /* EOF */
 
        /* Some one has messed with f_pos; reset the world */
@@ -468,8 +580,8 @@ static int ext4_dx_readdir(struct file *filp,
                free_rb_tree_fname(&info->root);
                info->curr_node = NULL;
                info->extra_fname = NULL;
-               info->curr_hash = pos2maj_hash(filp->f_pos);
-               info->curr_minor_hash = pos2min_hash(filp->f_pos);
+               info->curr_hash = pos2maj_hash(filp, filp->f_pos);
+               info->curr_minor_hash = pos2min_hash(filp, filp->f_pos);
        }
 
        /*
@@ -501,7 +613,7 @@ static int ext4_dx_readdir(struct file *filp,
                        if (ret < 0)
                                return ret;
                        if (ret == 0) {
-                               filp->f_pos = EXT4_HTREE_EOF;
+                               filp->f_pos = ext4_get_htree_eof(filp);
                                break;
                        }
                        info->curr_node = rb_first(&info->root);
@@ -521,7 +633,7 @@ static int ext4_dx_readdir(struct file *filp,
                        info->curr_minor_hash = fname->minor_hash;
                } else {
                        if (info->next_hash == ~0) {
-                               filp->f_pos = EXT4_HTREE_EOF;
+                               filp->f_pos = ext4_get_htree_eof(filp);
                                break;
                        }
                        info->curr_hash = info->next_hash;
@@ -540,3 +652,15 @@ static int ext4_release_dir(struct inode *inode, struct file *filp)
 
        return 0;
 }
+
+const struct file_operations ext4_dir_operations = {
+       .llseek         = ext4_dir_llseek,
+       .read           = generic_read_dir,
+       .readdir        = ext4_readdir,
+       .unlocked_ioctl = ext4_ioctl,
+#ifdef CONFIG_COMPAT
+       .compat_ioctl   = ext4_compat_ioctl,
+#endif
+       .fsync          = ext4_sync_file,
+       .release        = ext4_release_dir,
+};
index 513004fc3d840ee03586a4fedcdb133d8031c642..8b64a00502a0fee29921534175dd42a68d5adad7 100644 (file)
@@ -1612,7 +1612,11 @@ struct dx_hash_info
        u32             *seed;
 };
 
-#define EXT4_HTREE_EOF 0x7fffffff
+
+/* 32 and 64 bit signed EOF for dx directories */
+#define EXT4_HTREE_EOF_32BIT   ((1UL  << (32 - 1)) - 1)
+#define EXT4_HTREE_EOF_64BIT   ((1ULL << (64 - 1)) - 1)
+
 
 /*
  * Control parameters used by ext4_htree_next_block
index ac8f168c8ab435fcd868854cfdbc4d5d0787c6ad..fa8e4911d3545cdc78b1142cf74026785daa0840 100644 (file)
@@ -200,8 +200,8 @@ int ext4fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo)
                return -1;
        }
        hash = hash & ~1;
-       if (hash == (EXT4_HTREE_EOF << 1))
-               hash = (EXT4_HTREE_EOF-1) << 1;
+       if (hash == (EXT4_HTREE_EOF_32BIT << 1))
+               hash = (EXT4_HTREE_EOF_32BIT - 1) << 1;
        hinfo->hash = hash;
        hinfo->minor_hash = minor_hash;
        return 0;