From 074ca44283bf031678e67af7d82668bb03c55a55 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Fri, 6 Feb 2009 16:23:37 -0500 Subject: [PATCH 001/478] ext4: Remove stale block allocator references from ext4.h Remove some leftovers from when the old block allocator was removed (c2ea3fde). ext4_sb_info is now a bit lighter. Also remove a dangling read_block_bitmap() prototype. Signed-off-by: Mike Snitzer Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 10 ---------- fs/ext4/ext4_i.h | 3 --- fs/ext4/ext4_sb.h | 4 ---- fs/ext4/mballoc.h | 1 - 4 files changed, 18 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 6083bb38057b..0db01421da3e 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -32,14 +32,6 @@ */ #undef EXT4FS_DEBUG -/* - * Define EXT4_RESERVATION to reserve data blocks for expanding files - */ -#define EXT4_DEFAULT_RESERVE_BLOCKS 8 -/*max window size: 1024(direct blocks) + 3([t,d]indirect blocks) */ -#define EXT4_MAX_RESERVE_BLOCKS 1027 -#define EXT4_RESERVE_WINDOW_NOT_ALLOCATED 0 - /* * Debug code */ @@ -54,8 +46,6 @@ #define ext4_debug(f, a...) do {} while (0) #endif -#define EXT4_MULTIBLOCK_ALLOCATOR 1 - /* prefer goal again. length */ #define EXT4_MB_HINT_MERGE 1 /* blocks already reserved */ diff --git a/fs/ext4/ext4_i.h b/fs/ext4/ext4_i.h index e69acc16f5c4..2d516c0a22af 100644 --- a/fs/ext4/ext4_i.h +++ b/fs/ext4/ext4_i.h @@ -33,9 +33,6 @@ typedef __u32 ext4_lblk_t; /* data type for block group number */ typedef unsigned int ext4_group_t; -#define rsv_start rsv_window._rsv_start -#define rsv_end rsv_window._rsv_end - /* * storage for cached extent */ diff --git a/fs/ext4/ext4_sb.h b/fs/ext4/ext4_sb.h index 039b6ea1a042..e318f486cc24 100644 --- a/fs/ext4/ext4_sb.h +++ b/fs/ext4/ext4_sb.h @@ -65,10 +65,6 @@ struct ext4_sb_info { struct blockgroup_lock s_blockgroup_lock; struct proc_dir_entry *s_proc; - /* root of the per fs reservation window tree */ - spinlock_t s_rsv_window_lock; - struct rb_root s_rsv_window_root; - /* Journaling */ struct inode *s_journal_inode; struct journal_s *s_journal; diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h index 10a2921baf14..7acf5ef24537 100644 --- a/fs/ext4/mballoc.h +++ b/fs/ext4/mballoc.h @@ -247,7 +247,6 @@ static inline void ext4_mb_store_history(struct ext4_allocation_context *ac) #define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) -struct buffer_head *read_block_bitmap(struct super_block *, ext4_group_t); static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb, struct ext4_free_extent *fex) { From e187c6588d6ef3169db53c389b3de9dfde3b16cc Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 6 Feb 2009 16:23:37 -0500 Subject: [PATCH 002/478] ext4: remove call to ext4_group_desc() in ext4_group_used_meta_blocks() The static function ext4_group_used_meta_blocks() only has one caller, who already has access to the block group's group descriptor. So it's better to have ext4_init_block_bitmap() pass the group descriptor to ext4_group_used_meta_blocks(), so it doesn't need to call ext4_group_desc(). Previously this function did not check if ext4_group_desc() returned NULL due to an error, potentially causing a kernel OOPS report. This avoids the issue entirely. Signed-off-by: Thadeu Lima de Souza Cascardo Signed-off-by: "Theodore Ts'o" --- fs/ext4/balloc.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 38f40d55899c..b37b12875582 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -55,7 +55,8 @@ static int ext4_block_in_group(struct super_block *sb, ext4_fsblk_t block, } static int ext4_group_used_meta_blocks(struct super_block *sb, - ext4_group_t block_group) + ext4_group_t block_group, + struct ext4_group_desc *gdp) { ext4_fsblk_t tmp; struct ext4_sb_info *sbi = EXT4_SB(sb); @@ -63,10 +64,6 @@ static int ext4_group_used_meta_blocks(struct super_block *sb, int used_blocks = sbi->s_itb_per_group + 2; if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) { - struct ext4_group_desc *gdp; - struct buffer_head *bh; - - gdp = ext4_get_group_desc(sb, block_group, &bh); if (!ext4_block_in_group(sb, ext4_block_bitmap(sb, gdp), block_group)) used_blocks--; @@ -177,7 +174,7 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh, */ mark_bitmap_end(group_blocks, sb->s_blocksize * 8, bh->b_data); } - return free_blocks - ext4_group_used_meta_blocks(sb, block_group); + return free_blocks - ext4_group_used_meta_blocks(sb, block_group, gdp); } From 8bad4597c2d71365adfa846ea1ca6cf99161a455 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sat, 14 Feb 2009 21:46:54 -0500 Subject: [PATCH 003/478] ext4: Use unsigned int for blocksize in dx_make_map() and dx_pack_dirents() Signed-off-by: Wei Yongjun Signed-off-by: "Theodore Ts'o" --- fs/ext4/namei.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 83410244d3ee..04824958cba5 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -161,12 +161,12 @@ static struct dx_frame *dx_probe(const struct qstr *d_name, struct dx_frame *frame, int *err); static void dx_release(struct dx_frame *frames); -static int dx_make_map(struct ext4_dir_entry_2 *de, int size, +static int dx_make_map(struct ext4_dir_entry_2 *de, unsigned blocksize, struct dx_hash_info *hinfo, struct dx_map_entry map[]); static void dx_sort_map(struct dx_map_entry *map, unsigned count); static struct ext4_dir_entry_2 *dx_move_dirents(char *from, char *to, struct dx_map_entry *offsets, int count); -static struct ext4_dir_entry_2* dx_pack_dirents(char *base, int size); +static struct ext4_dir_entry_2* dx_pack_dirents(char *base, unsigned blocksize); static void dx_insert_block(struct dx_frame *frame, u32 hash, ext4_lblk_t block); static int ext4_htree_next_block(struct inode *dir, __u32 hash, @@ -713,15 +713,15 @@ errout: * Create map of hash values, offsets, and sizes, stored at end of block. * Returns number of entries mapped. */ -static int dx_make_map (struct ext4_dir_entry_2 *de, int size, - struct dx_hash_info *hinfo, struct dx_map_entry *map_tail) +static int dx_make_map(struct ext4_dir_entry_2 *de, unsigned blocksize, + struct dx_hash_info *hinfo, + struct dx_map_entry *map_tail) { int count = 0; char *base = (char *) de; struct dx_hash_info h = *hinfo; - while ((char *) de < base + size) - { + while ((char *) de < base + blocksize) { if (de->name_len && de->inode) { ext4fs_dirhash(de->name, de->name_len, &h); map_tail--; @@ -1130,13 +1130,13 @@ dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count) * Compact each dir entry in the range to the minimal rec_len. * Returns pointer to last entry in range. */ -static struct ext4_dir_entry_2* dx_pack_dirents(char *base, int size) +static struct ext4_dir_entry_2* dx_pack_dirents(char *base, unsigned blocksize) { struct ext4_dir_entry_2 *next, *to, *prev, *de = (struct ext4_dir_entry_2 *) base; unsigned rec_len = 0; prev = to = de; - while ((char*)de < base + size) { + while ((char*)de < base + blocksize) { next = ext4_next_entry(de); if (de->inode && de->name_len) { rec_len = EXT4_DIR_REC_LEN(de->name_len); From 3d0518f4758eca4339e75e5b9dbb7e06a5ce08b4 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Sat, 14 Feb 2009 23:01:36 -0500 Subject: [PATCH 004/478] ext4: New rec_len encoding for very large blocksizes The rec_len field in the directory entry is 16 bits, so to encode blocksizes larger than 64k becomes problematic. This patch allows us to supprot block sizes up to 256k, by using the low 2 bits to extend the range of rec_len to 2**18-1 (since valid rec_len sizes must be a multiple of 4). We use the convention that a rec_len of 0 or 65535 means the filesystem block size, for compatibility with older kernels. It's unlikely we'll see VM pages of up to 256k, but at some point we might find that the Linux VM has been enhanced to support filesystem block sizes > than the VM page size, at which point it might be useful for some applications to allow very large filesystem block sizes. Signed-off-by: Wei Yongjun Signed-off-by: "Theodore Ts'o" --- fs/ext4/dir.c | 16 +++--- fs/ext4/ext4.h | 21 ++------ fs/ext4/namei.c | 130 +++++++++++++++++++++++++++++++----------------- 3 files changed, 98 insertions(+), 69 deletions(-) diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index 2df2e40b01af..b64789929a65 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -67,7 +67,8 @@ int ext4_check_dir_entry(const char *function, struct inode *dir, unsigned int offset) { const char *error_msg = NULL; - const int rlen = ext4_rec_len_from_disk(de->rec_len); + const int rlen = ext4_rec_len_from_disk(de->rec_len, + dir->i_sb->s_blocksize); if (rlen < EXT4_DIR_REC_LEN(1)) error_msg = "rec_len is smaller than minimal"; @@ -178,10 +179,11 @@ revalidate: * least that it is non-zero. A * failure will be detected in the * dirent test below. */ - if (ext4_rec_len_from_disk(de->rec_len) - < EXT4_DIR_REC_LEN(1)) + if (ext4_rec_len_from_disk(de->rec_len, + sb->s_blocksize) < EXT4_DIR_REC_LEN(1)) break; - i += ext4_rec_len_from_disk(de->rec_len); + i += ext4_rec_len_from_disk(de->rec_len, + sb->s_blocksize); } offset = i; filp->f_pos = (filp->f_pos & ~(sb->s_blocksize - 1)) @@ -203,7 +205,8 @@ revalidate: ret = stored; goto out; } - offset += ext4_rec_len_from_disk(de->rec_len); + offset += ext4_rec_len_from_disk(de->rec_len, + sb->s_blocksize); if (le32_to_cpu(de->inode)) { /* We might block in the next section * if the data destination is @@ -225,7 +228,8 @@ revalidate: goto revalidate; stored++; } - filp->f_pos += ext4_rec_len_from_disk(de->rec_len); + filp->f_pos += ext4_rec_len_from_disk(de->rec_len, + sb->s_blocksize); } offset = 0; brelse(bh); diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 0db01421da3e..d5193b55ca94 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -855,24 +855,6 @@ struct ext4_dir_entry_2 { ~EXT4_DIR_ROUND) #define EXT4_MAX_REC_LEN ((1<<16)-1) -static inline unsigned ext4_rec_len_from_disk(__le16 dlen) -{ - unsigned len = le16_to_cpu(dlen); - - if (len == EXT4_MAX_REC_LEN || len == 0) - return 1 << 16; - return len; -} - -static inline __le16 ext4_rec_len_to_disk(unsigned len) -{ - if (len == (1 << 16)) - return cpu_to_le16(EXT4_MAX_REC_LEN); - else if (len > (1 << 16)) - BUG(); - return cpu_to_le16(len); -} - /* * Hash Tree Directory indexing * (c) Daniel Phillips, 2001 @@ -1097,7 +1079,10 @@ extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long); /* migrate.c */ extern int ext4_ext_migrate(struct inode *); + /* namei.c */ +extern unsigned int ext4_rec_len_from_disk(__le16 dlen, unsigned blocksize); +extern __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize); extern int ext4_orphan_add(handle_t *, struct inode *); extern int ext4_orphan_del(handle_t *, struct inode *); extern int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 04824958cba5..a5ba1a858094 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -165,7 +165,7 @@ static int dx_make_map(struct ext4_dir_entry_2 *de, unsigned blocksize, struct dx_hash_info *hinfo, struct dx_map_entry map[]); static void dx_sort_map(struct dx_map_entry *map, unsigned count); static struct ext4_dir_entry_2 *dx_move_dirents(char *from, char *to, - struct dx_map_entry *offsets, int count); + struct dx_map_entry *offsets, int count, unsigned blocksize); static struct ext4_dir_entry_2* dx_pack_dirents(char *base, unsigned blocksize); static void dx_insert_block(struct dx_frame *frame, u32 hash, ext4_lblk_t block); @@ -180,14 +180,38 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, struct inode *inode); +unsigned int ext4_rec_len_from_disk(__le16 dlen, unsigned blocksize) +{ + unsigned len = le16_to_cpu(dlen); + + if (len == EXT4_MAX_REC_LEN || len == 0) + return blocksize; + return (len & 65532) | ((len & 3) << 16); +} + +__le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize) +{ + if ((len > blocksize) || (blocksize > (1 << 18)) || (len & 3)) + BUG(); + if (len < 65536) + return cpu_to_le16(len); + if (len == blocksize) { + if (blocksize == 65536) + return cpu_to_le16(EXT4_MAX_REC_LEN); + else + return cpu_to_le16(0); + } + return cpu_to_le16((len & 65532) | ((len >> 16) & 3)); +} + /* * p is at least 6 bytes before the end of page */ static inline struct ext4_dir_entry_2 * -ext4_next_entry(struct ext4_dir_entry_2 *p) +ext4_next_entry(struct ext4_dir_entry_2 *p, unsigned long blocksize) { return (struct ext4_dir_entry_2 *)((char *)p + - ext4_rec_len_from_disk(p->rec_len)); + ext4_rec_len_from_disk(p->rec_len, blocksize)); } /* @@ -294,7 +318,7 @@ static struct stats dx_show_leaf(struct dx_hash_info *hinfo, struct ext4_dir_ent space += EXT4_DIR_REC_LEN(de->name_len); names++; } - de = ext4_next_entry(de); + de = ext4_next_entry(de, size); } printk("(%i)\n", names); return (struct stats) { names, space, 1 }; @@ -585,7 +609,7 @@ static int htree_dirblock_to_tree(struct file *dir_file, top = (struct ext4_dir_entry_2 *) ((char *) de + dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(0)); - for (; de < top; de = ext4_next_entry(de)) { + for (; de < top; de = ext4_next_entry(de, dir->i_sb->s_blocksize)) { if (!ext4_check_dir_entry("htree_dirblock_to_tree", dir, de, bh, (block<i_sb)) +((char *)de - bh->b_data))) { @@ -663,7 +687,7 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, } if (start_hash < 2 || (start_hash ==2 && start_minor_hash==0)) { de = (struct ext4_dir_entry_2 *) frames[0].bh->b_data; - de = ext4_next_entry(de); + de = ext4_next_entry(de, dir->i_sb->s_blocksize); if ((err = ext4_htree_store_dirent(dir_file, 2, 0, de)) != 0) goto errout; count++; @@ -732,7 +756,7 @@ static int dx_make_map(struct ext4_dir_entry_2 *de, unsigned blocksize, cond_resched(); } /* XXX: do we need to check rec_len == 0 case? -Chris */ - de = ext4_next_entry(de); + de = ext4_next_entry(de, blocksize); } return count; } @@ -832,7 +856,8 @@ static inline int search_dirblock(struct buffer_head *bh, return 1; } /* prevent looping on a bad block */ - de_len = ext4_rec_len_from_disk(de->rec_len); + de_len = ext4_rec_len_from_disk(de->rec_len, + dir->i_sb->s_blocksize); if (de_len <= 0) return -1; offset += de_len; @@ -996,7 +1021,7 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct q de = (struct ext4_dir_entry_2 *) bh->b_data; top = (struct ext4_dir_entry_2 *) ((char *) de + sb->s_blocksize - EXT4_DIR_REC_LEN(0)); - for (; de < top; de = ext4_next_entry(de)) { + for (; de < top; de = ext4_next_entry(de, sb->s_blocksize)) { int off = (block << EXT4_BLOCK_SIZE_BITS(sb)) + ((char *) de - bh->b_data); @@ -1109,7 +1134,8 @@ static inline void ext4_set_de_type(struct super_block *sb, * Returns pointer to last entry moved. */ static struct ext4_dir_entry_2 * -dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count) +dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count, + unsigned blocksize) { unsigned rec_len = 0; @@ -1118,7 +1144,7 @@ dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count) rec_len = EXT4_DIR_REC_LEN(de->name_len); memcpy (to, de, rec_len); ((struct ext4_dir_entry_2 *) to)->rec_len = - ext4_rec_len_to_disk(rec_len); + ext4_rec_len_to_disk(rec_len, blocksize); de->inode = 0; map++; to += rec_len; @@ -1137,12 +1163,12 @@ static struct ext4_dir_entry_2* dx_pack_dirents(char *base, unsigned blocksize) prev = to = de; while ((char*)de < base + blocksize) { - next = ext4_next_entry(de); + next = ext4_next_entry(de, blocksize); if (de->inode && de->name_len) { rec_len = EXT4_DIR_REC_LEN(de->name_len); if (de > to) memmove(to, de, rec_len); - to->rec_len = ext4_rec_len_to_disk(rec_len); + to->rec_len = ext4_rec_len_to_disk(rec_len, blocksize); prev = to; to = (struct ext4_dir_entry_2 *) (((char *) to) + rec_len); } @@ -1215,10 +1241,12 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, hash2, split, count-split)); /* Fancy dance to stay within two buffers */ - de2 = dx_move_dirents(data1, data2, map + split, count - split); + de2 = dx_move_dirents(data1, data2, map + split, count - split, blocksize); de = dx_pack_dirents(data1, blocksize); - de->rec_len = ext4_rec_len_to_disk(data1 + blocksize - (char *) de); - de2->rec_len = ext4_rec_len_to_disk(data2 + blocksize - (char *) de2); + de->rec_len = ext4_rec_len_to_disk(data1 + blocksize - (char *) de, + blocksize); + de2->rec_len = ext4_rec_len_to_disk(data2 + blocksize - (char *) de2, + blocksize); dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data1, blocksize, 1)); dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data2, blocksize, 1)); @@ -1268,6 +1296,7 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, const char *name = dentry->d_name.name; int namelen = dentry->d_name.len; unsigned int offset = 0; + unsigned int blocksize = dir->i_sb->s_blocksize; unsigned short reclen; int nlen, rlen, err; char *top; @@ -1275,7 +1304,7 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, reclen = EXT4_DIR_REC_LEN(namelen); if (!de) { de = (struct ext4_dir_entry_2 *)bh->b_data; - top = bh->b_data + dir->i_sb->s_blocksize - reclen; + top = bh->b_data + blocksize - reclen; while ((char *) de <= top) { if (!ext4_check_dir_entry("ext4_add_entry", dir, de, bh, offset)) { @@ -1287,7 +1316,7 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, return -EEXIST; } nlen = EXT4_DIR_REC_LEN(de->name_len); - rlen = ext4_rec_len_from_disk(de->rec_len); + rlen = ext4_rec_len_from_disk(de->rec_len, blocksize); if ((de->inode? rlen - nlen: rlen) >= reclen) break; de = (struct ext4_dir_entry_2 *)((char *)de + rlen); @@ -1306,11 +1335,11 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, /* By now the buffer is marked for journaling */ nlen = EXT4_DIR_REC_LEN(de->name_len); - rlen = ext4_rec_len_from_disk(de->rec_len); + rlen = ext4_rec_len_from_disk(de->rec_len, blocksize); if (de->inode) { struct ext4_dir_entry_2 *de1 = (struct ext4_dir_entry_2 *)((char *)de + nlen); - de1->rec_len = ext4_rec_len_to_disk(rlen - nlen); - de->rec_len = ext4_rec_len_to_disk(nlen); + de1->rec_len = ext4_rec_len_to_disk(rlen - nlen, blocksize); + de->rec_len = ext4_rec_len_to_disk(nlen, blocksize); de = de1; } de->file_type = EXT4_FT_UNKNOWN; @@ -1380,7 +1409,7 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry, /* The 0th block becomes the root, move the dirents out */ fde = &root->dotdot; de = (struct ext4_dir_entry_2 *)((char *)fde + - ext4_rec_len_from_disk(fde->rec_len)); + ext4_rec_len_from_disk(fde->rec_len, blocksize)); if ((char *) de >= (((char *) root) + blocksize)) { ext4_error(dir->i_sb, __func__, "invalid rec_len for '..' in inode %lu", @@ -1402,12 +1431,14 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry, memcpy (data1, de, len); de = (struct ext4_dir_entry_2 *) data1; top = data1 + len; - while ((char *)(de2 = ext4_next_entry(de)) < top) + while ((char *)(de2 = ext4_next_entry(de, blocksize)) < top) de = de2; - de->rec_len = ext4_rec_len_to_disk(data1 + blocksize - (char *) de); + de->rec_len = ext4_rec_len_to_disk(data1 + blocksize - (char *) de, + blocksize); /* Initialize the root; the dot dirents already exist */ de = (struct ext4_dir_entry_2 *) (&root->dotdot); - de->rec_len = ext4_rec_len_to_disk(blocksize - EXT4_DIR_REC_LEN(2)); + de->rec_len = ext4_rec_len_to_disk(blocksize - EXT4_DIR_REC_LEN(2), + blocksize); memset (&root->info, 0, sizeof(root->info)); root->info.info_length = sizeof(root->info); root->info.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version; @@ -1488,7 +1519,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, return retval; de = (struct ext4_dir_entry_2 *) bh->b_data; de->inode = 0; - de->rec_len = ext4_rec_len_to_disk(blocksize); + de->rec_len = ext4_rec_len_to_disk(blocksize, blocksize); return add_dirent_to_buf(handle, dentry, inode, de, bh); } @@ -1551,7 +1582,8 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, goto cleanup; node2 = (struct dx_node *)(bh2->b_data); entries2 = node2->entries; - node2->fake.rec_len = ext4_rec_len_to_disk(sb->s_blocksize); + node2->fake.rec_len = ext4_rec_len_to_disk(sb->s_blocksize, + sb->s_blocksize); node2->fake.inode = 0; BUFFER_TRACE(frame->bh, "get_write_access"); err = ext4_journal_get_write_access(handle, frame->bh); @@ -1639,6 +1671,7 @@ static int ext4_delete_entry(handle_t *handle, struct buffer_head *bh) { struct ext4_dir_entry_2 *de, *pde; + unsigned int blocksize = dir->i_sb->s_blocksize; int i; i = 0; @@ -1652,8 +1685,11 @@ static int ext4_delete_entry(handle_t *handle, ext4_journal_get_write_access(handle, bh); if (pde) pde->rec_len = ext4_rec_len_to_disk( - ext4_rec_len_from_disk(pde->rec_len) + - ext4_rec_len_from_disk(de->rec_len)); + ext4_rec_len_from_disk(pde->rec_len, + blocksize) + + ext4_rec_len_from_disk(de->rec_len, + blocksize), + blocksize); else de->inode = 0; dir->i_version++; @@ -1661,9 +1697,9 @@ static int ext4_delete_entry(handle_t *handle, ext4_handle_dirty_metadata(handle, dir, bh); return 0; } - i += ext4_rec_len_from_disk(de->rec_len); + i += ext4_rec_len_from_disk(de->rec_len, blocksize); pde = de; - de = ext4_next_entry(de); + de = ext4_next_entry(de, blocksize); } return -ENOENT; } @@ -1793,6 +1829,7 @@ static int ext4_mkdir(struct inode *dir, struct dentry *dentry, int mode) struct inode *inode; struct buffer_head *dir_block; struct ext4_dir_entry_2 *de; + unsigned int blocksize = dir->i_sb->s_blocksize; int err, retries = 0; if (EXT4_DIR_LINK_MAX(dir)) @@ -1824,13 +1861,14 @@ retry: de = (struct ext4_dir_entry_2 *) dir_block->b_data; de->inode = cpu_to_le32(inode->i_ino); de->name_len = 1; - de->rec_len = ext4_rec_len_to_disk(EXT4_DIR_REC_LEN(de->name_len)); + de->rec_len = ext4_rec_len_to_disk(EXT4_DIR_REC_LEN(de->name_len), + blocksize); strcpy(de->name, "."); ext4_set_de_type(dir->i_sb, de, S_IFDIR); - de = ext4_next_entry(de); + de = ext4_next_entry(de, blocksize); de->inode = cpu_to_le32(dir->i_ino); - de->rec_len = ext4_rec_len_to_disk(inode->i_sb->s_blocksize - - EXT4_DIR_REC_LEN(1)); + de->rec_len = ext4_rec_len_to_disk(blocksize - EXT4_DIR_REC_LEN(1), + blocksize); de->name_len = 2; strcpy(de->name, ".."); ext4_set_de_type(dir->i_sb, de, S_IFDIR); @@ -1885,7 +1923,7 @@ static int empty_dir(struct inode *inode) return 1; } de = (struct ext4_dir_entry_2 *) bh->b_data; - de1 = ext4_next_entry(de); + de1 = ext4_next_entry(de, sb->s_blocksize); if (le32_to_cpu(de->inode) != inode->i_ino || !le32_to_cpu(de1->inode) || strcmp(".", de->name) || @@ -1896,9 +1934,9 @@ static int empty_dir(struct inode *inode) brelse(bh); return 1; } - offset = ext4_rec_len_from_disk(de->rec_len) + - ext4_rec_len_from_disk(de1->rec_len); - de = ext4_next_entry(de1); + offset = ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize) + + ext4_rec_len_from_disk(de1->rec_len, sb->s_blocksize); + de = ext4_next_entry(de1, sb->s_blocksize); while (offset < inode->i_size) { if (!bh || (void *) de >= (void *) (bh->b_data+sb->s_blocksize)) { @@ -1927,8 +1965,8 @@ static int empty_dir(struct inode *inode) brelse(bh); return 0; } - offset += ext4_rec_len_from_disk(de->rec_len); - de = ext4_next_entry(de); + offset += ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize); + de = ext4_next_entry(de, sb->s_blocksize); } brelse(bh); return 1; @@ -2297,8 +2335,8 @@ retry: return err; } -#define PARENT_INO(buffer) \ - (ext4_next_entry((struct ext4_dir_entry_2 *)(buffer))->inode) +#define PARENT_INO(buffer, size) \ + (ext4_next_entry((struct ext4_dir_entry_2 *)(buffer), size)->inode) /* * Anybody can rename anything with this: the permission checks are left to the @@ -2358,7 +2396,8 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, dir_bh = ext4_bread(handle, old_inode, 0, 0, &retval); if (!dir_bh) goto end_rename; - if (le32_to_cpu(PARENT_INO(dir_bh->b_data)) != old_dir->i_ino) + if (le32_to_cpu(PARENT_INO(dir_bh->b_data, + old_dir->i_sb->s_blocksize)) != old_dir->i_ino) goto end_rename; retval = -EMLINK; if (!new_inode && new_dir != old_dir && @@ -2430,7 +2469,8 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, if (dir_bh) { BUFFER_TRACE(dir_bh, "get_write_access"); ext4_journal_get_write_access(handle, dir_bh); - PARENT_INO(dir_bh->b_data) = cpu_to_le32(new_dir->i_ino); + PARENT_INO(dir_bh->b_data, new_dir->i_sb->s_blocksize) = + cpu_to_le32(new_dir->i_ino); BUFFER_TRACE(dir_bh, "call ext4_handle_dirty_metadata"); ext4_handle_dirty_metadata(handle, old_dir, dir_bh); ext4_dec_count(handle, old_dir); From 705895b61133ef43d106fe6a6bbdb2eec923867e Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Sun, 15 Feb 2009 18:07:52 -0500 Subject: [PATCH 005/478] ext4: allocate ->s_blockgroup_lock separately As spotted by kmemtrace, struct ext4_sb_info is 17664 bytes on 64-bit which makes it a very bad fit for SLAB allocators. The culprit of the wasted memory is ->s_blockgroup_lock which can be as big as 16 KB when NR_CPUS >= 32. To fix that, allocate ->s_blockgroup_lock, which fits nicely in a order 2 page in the worst case, separately. This shinks down struct ext4_sb_info enough to fit a 2 KB slab cache so now we allocate 16 KB + 2 KB instead of 32 KB saving 14 KB of memory. Acked-by: Andreas Dilger Signed-off-by: Pekka Enberg Cc: Signed-off-by: Andrew Morton Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4_sb.h | 4 ++-- fs/ext4/super.c | 10 +++++++++- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/fs/ext4/ext4_sb.h b/fs/ext4/ext4_sb.h index e318f486cc24..4e4d9cc3f40d 100644 --- a/fs/ext4/ext4_sb.h +++ b/fs/ext4/ext4_sb.h @@ -62,7 +62,7 @@ struct ext4_sb_info { struct percpu_counter s_freeinodes_counter; struct percpu_counter s_dirs_counter; struct percpu_counter s_dirtyblocks_counter; - struct blockgroup_lock s_blockgroup_lock; + struct blockgroup_lock *s_blockgroup_lock; struct proc_dir_entry *s_proc; /* Journaling */ @@ -149,7 +149,7 @@ struct ext4_sb_info { static inline spinlock_t * sb_bgl_lock(struct ext4_sb_info *sbi, unsigned int block_group) { - return bgl_lock_ptr(&sbi->s_blockgroup_lock, block_group); + return bgl_lock_ptr(sbi->s_blockgroup_lock, block_group); } #endif /* _EXT4_SB */ diff --git a/fs/ext4/super.c b/fs/ext4/super.c index f7371a6a923d..a3768709ce05 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -615,6 +615,7 @@ static void ext4_put_super(struct super_block *sb) ext4_blkdev_remove(sbi); } sb->s_fs_info = NULL; + kfree(sbi->s_blockgroup_lock); kfree(sbi); return; } @@ -2021,6 +2022,13 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); if (!sbi) return -ENOMEM; + + sbi->s_blockgroup_lock = + kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL); + if (!sbi->s_blockgroup_lock) { + kfree(sbi); + return -ENOMEM; + } sb->s_fs_info = sbi; sbi->s_mount_opt = 0; sbi->s_resuid = EXT4_DEF_RESUID; @@ -2332,7 +2340,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) &sbi->s_inode_readahead_blks); #endif - bgl_lock_init(&sbi->s_blockgroup_lock); + bgl_lock_init(sbi->s_blockgroup_lock); for (i = 0; i < db_count; i++) { block = descriptor_loc(sb, logical_sb_block, i); From 8fa43a81b97853fc69417bb6054182e78f95cbeb Mon Sep 17 00:00:00 2001 From: Duane Griffin Date: Sun, 15 Feb 2009 18:57:26 -0500 Subject: [PATCH 006/478] ext4: don't inherit inappropriate inode flags from parent At present INDEX and EXTENTS are the only flags that new ext4 inodes do NOT inherit from their parent. In addition prevent the flags DIRTY, ECOMPR, IMAGIC, TOPDIR, HUGE_FILE and EXT_MIGRATE from being inherited. List inheritable flags explicitly to prevent future flags from accidentally being inherited. This fixes the TOPDIR flag inheritance bug reported at http://bugzilla.kernel.org/show_bug.cgi?id=9866. Signed-off-by: Duane Griffin Acked-by: Andreas Dilger Cc: Signed-off-by: Andrew Morton Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 7 +++++++ fs/ext4/ialloc.c | 2 +- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index d5193b55ca94..d4700d101ea7 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -239,6 +239,13 @@ struct flex_groups { #define EXT4_FL_USER_VISIBLE 0x000BDFFF /* User visible flags */ #define EXT4_FL_USER_MODIFIABLE 0x000B80FF /* User modifiable flags */ +/* Flags that should be inherited by new inodes from their parent. */ +#define EXT4_FL_INHERITED (EXT4_SECRM_FL | EXT4_UNRM_FL | EXT4_COMPR_FL |\ + EXT4_SYNC_FL | EXT4_IMMUTABLE_FL | EXT4_APPEND_FL |\ + EXT4_NODUMP_FL | EXT4_NOATIME_FL |\ + EXT4_NOCOMPR_FL | EXT4_JOURNAL_DATA_FL |\ + EXT4_NOTAIL_FL | EXT4_DIRSYNC_FL) + /* * Inode dynamic state flags */ diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index fb51b40e3e8f..1ff3df086e58 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -889,7 +889,7 @@ got: * newly created directory and file only if -o extent mount option is * specified */ - ei->i_flags = EXT4_I(dir)->i_flags & ~(EXT4_INDEX_FL|EXT4_EXTENTS_FL); + ei->i_flags = EXT4_I(dir)->i_flags & EXT4_FL_INHERITED; if (S_ISLNK(mode)) ei->i_flags &= ~(EXT4_IMMUTABLE_FL|EXT4_APPEND_FL); /* dirsync only applies to directories */ From 2dc6b0d48ca0599837df21b14bb8393d0804af57 Mon Sep 17 00:00:00 2001 From: Duane Griffin Date: Sun, 15 Feb 2009 18:09:20 -0500 Subject: [PATCH 007/478] ext4: tighten restrictions on inode flags At the moment there are few restrictions on which flags may be set on which inodes. Specifically DIRSYNC may only be set on directories and IMMUTABLE and APPEND may not be set on links. Tighten that to disallow TOPDIR being set on non-directories and only NODUMP and NOATIME to be set on non-regular file, non-directories. Introduces a flags masking function which masks flags based on mode and use it during inode creation and when flags are set via the ioctl to facilitate future consistency. Signed-off-by: Duane Griffin Acked-by: Andreas Dilger Cc: Signed-off-by: Andrew Morton Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 17 +++++++++++++++++ fs/ext4/ialloc.c | 14 +++++--------- fs/ext4/ioctl.c | 3 +-- 3 files changed, 23 insertions(+), 11 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index d4700d101ea7..2cbfc0b04d37 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -246,6 +246,23 @@ struct flex_groups { EXT4_NOCOMPR_FL | EXT4_JOURNAL_DATA_FL |\ EXT4_NOTAIL_FL | EXT4_DIRSYNC_FL) +/* Flags that are appropriate for regular files (all but dir-specific ones). */ +#define EXT4_REG_FLMASK (~(EXT4_DIRSYNC_FL | EXT4_TOPDIR_FL)) + +/* Flags that are appropriate for non-directories/regular files. */ +#define EXT4_OTHER_FLMASK (EXT4_NODUMP_FL | EXT4_NOATIME_FL) + +/* Mask out flags that are inappropriate for the given type of inode. */ +static inline __u32 ext4_mask_flags(umode_t mode, __u32 flags) +{ + if (S_ISDIR(mode)) + return flags; + else if (S_ISREG(mode)) + return flags & EXT4_REG_FLMASK; + else + return flags & EXT4_OTHER_FLMASK; +} + /* * Inode dynamic state flags */ diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 1ff3df086e58..ae3eb57dccdd 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -885,16 +885,12 @@ got: ei->i_disksize = 0; /* - * Don't inherit extent flag from directory. We set extent flag on - * newly created directory and file only if -o extent mount option is - * specified + * Don't inherit extent flag from directory, amongst others. We set + * extent flag on newly created directory and file only if -o extent + * mount option is specified */ - ei->i_flags = EXT4_I(dir)->i_flags & EXT4_FL_INHERITED; - if (S_ISLNK(mode)) - ei->i_flags &= ~(EXT4_IMMUTABLE_FL|EXT4_APPEND_FL); - /* dirsync only applies to directories */ - if (!S_ISDIR(mode)) - ei->i_flags &= ~EXT4_DIRSYNC_FL; + ei->i_flags = + ext4_mask_flags(mode, EXT4_I(dir)->i_flags & EXT4_FL_INHERITED); ei->i_file_acl = 0; ei->i_dtime = 0; ei->i_block_group = group; diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 42dc83fb247a..22dd29f3ebc9 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -48,8 +48,7 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) if (err) return err; - if (!S_ISDIR(inode->i_mode)) - flags &= ~EXT4_DIRSYNC_FL; + flags = ext4_mask_flags(inode->i_mode, flags); err = -EPERM; mutex_lock(&inode->i_mutex); From a4912123b688e057084e6557cef8924f7ae5bbde Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Thu, 12 Mar 2009 12:18:34 -0400 Subject: [PATCH 008/478] ext4: New inode/block allocation algorithms for flex_bg filesystems The find_group_flex() inode allocator is now only used if the filesystem is mounted using the "oldalloc" mount option. It is replaced with the original Orlov allocator that has been updated for flex_bg filesystems (it should behave the same way if flex_bg is disabled). The inode allocator now functions by taking into account each flex_bg group, instead of each block group, when deciding whether or not it's time to allocate a new directory into a fresh flex_bg. The block allocator has also been changed so that the first block group in each flex_bg is preferred for use for storing directory blocks. This keeps directory blocks close together, which is good for speeding up e2fsck since large directories are more likely to look like this: debugfs: stat /home/tytso/Maildir/cur Inode: 1844562 Type: directory Mode: 0700 Flags: 0x81000 Generation: 1132745781 Version: 0x00000000:0000ad71 User: 15806 Group: 15806 Size: 1060864 File ACL: 0 Directory ACL: 0 Links: 2 Blockcount: 2072 Fragment: Address: 0 Number: 0 Size: 0 ctime: 0x499c0ff4:164961f4 -- Wed Feb 18 08:41:08 2009 atime: 0x499c0ff4:00000000 -- Wed Feb 18 08:41:08 2009 mtime: 0x49957f51:00000000 -- Fri Feb 13 09:10:25 2009 crtime: 0x499c0f57:00d51440 -- Wed Feb 18 08:38:31 2009 Size of extra inode fields: 28 BLOCKS: (0):7348651, (1-258):7348654-7348911 TOTAL: 259 Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 6 ++ fs/ext4/ext4_i.h | 3 + fs/ext4/extents.c | 25 +++++- fs/ext4/ialloc.c | 213 ++++++++++++++++++++++++++++++++++------------ fs/ext4/inode.c | 18 +++- fs/ext4/mballoc.c | 7 ++ 6 files changed, 215 insertions(+), 57 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 2cbfc0b04d37..096456c8559b 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -827,6 +827,12 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino) #define EXT4_DEF_MIN_BATCH_TIME 0 #define EXT4_DEF_MAX_BATCH_TIME 15000 /* 15ms */ +/* + * Minimum number of groups in a flexgroup before we separate out + * directories into the first block group of a flexgroup + */ +#define EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME 4 + /* * Structure of a directory entry */ diff --git a/fs/ext4/ext4_i.h b/fs/ext4/ext4_i.h index 2d516c0a22af..4ce2187123aa 100644 --- a/fs/ext4/ext4_i.h +++ b/fs/ext4/ext4_i.h @@ -122,6 +122,9 @@ struct ext4_inode_info { struct list_head i_prealloc_list; spinlock_t i_prealloc_lock; + /* ialloc */ + ext4_group_t i_last_alloc_group; + /* allocation reservation info for delalloc */ unsigned int i_reserved_data_blocks; unsigned int i_reserved_meta_blocks; diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index e0aa4fe4f596..aa3431856c9a 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -152,6 +152,8 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode, ext4_fsblk_t bg_start; ext4_fsblk_t last_block; ext4_grpblk_t colour; + ext4_group_t block_group; + int flex_size = ext4_flex_bg_size(EXT4_SB(inode->i_sb)); int depth; if (path) { @@ -170,10 +172,31 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode, } /* OK. use inode's group */ - bg_start = (ei->i_block_group * EXT4_BLOCKS_PER_GROUP(inode->i_sb)) + + block_group = ei->i_block_group; + if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) { + /* + * If there are at least EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME + * block groups per flexgroup, reserve the first block + * group for directories and special files. Regular + * files will start at the second block group. This + * tends to speed up directory access and improves + * fsck times. + */ + block_group &= ~(flex_size-1); + if (S_ISREG(inode->i_mode)) + block_group++; + } + bg_start = (block_group * EXT4_BLOCKS_PER_GROUP(inode->i_sb)) + le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_first_data_block); last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1; + /* + * If we are doing delayed allocation, we don't need take + * colour into account. + */ + if (test_opt(inode->i_sb, DELALLOC)) + return bg_start; + if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block) colour = (current->pid % 16) * (EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16); diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index ae3eb57dccdd..617f5a2d800a 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -410,6 +410,43 @@ out: return 0; } +struct orlov_stats { + __u32 free_inodes; + __u32 free_blocks; + __u32 used_dirs; +}; + +/* + * Helper function for Orlov's allocator; returns critical information + * for a particular block group or flex_bg. If flex_size is 1, then g + * is a block group number; otherwise it is flex_bg number. + */ +void get_orlov_stats(struct super_block *sb, ext4_group_t g, + int flex_size, struct orlov_stats *stats) +{ + struct ext4_group_desc *desc; + ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count; + int i; + + stats->free_inodes = 0; + stats->free_blocks = 0; + stats->used_dirs = 0; + + g *= flex_size; + + for (i = 0; i < flex_size; i++) { + if (g >= ngroups) + break; + desc = ext4_get_group_desc(sb, g++, NULL); + if (!desc) + continue; + + stats->free_inodes += ext4_free_inodes_count(sb, desc); + stats->free_blocks += ext4_free_blks_count(sb, desc); + stats->used_dirs += ext4_used_dirs_count(sb, desc); + } +} + /* * Orlov's allocator for directories. * @@ -425,35 +462,34 @@ out: * it has too many directories already (max_dirs) or * it has too few free inodes left (min_inodes) or * it has too few free blocks left (min_blocks) or - * it's already running too large debt (max_debt). * Parent's group is preferred, if it doesn't satisfy these * conditions we search cyclically through the rest. If none * of the groups look good we just look for a group with more * free inodes than average (starting at parent's group). - * - * Debt is incremented each time we allocate a directory and decremented - * when we allocate an inode, within 0--255. */ -#define INODE_COST 64 -#define BLOCK_COST 256 - static int find_group_orlov(struct super_block *sb, struct inode *parent, - ext4_group_t *group) + ext4_group_t *group, int mode) { ext4_group_t parent_group = EXT4_I(parent)->i_block_group; struct ext4_sb_info *sbi = EXT4_SB(sb); - struct ext4_super_block *es = sbi->s_es; ext4_group_t ngroups = sbi->s_groups_count; int inodes_per_group = EXT4_INODES_PER_GROUP(sb); unsigned int freei, avefreei; ext4_fsblk_t freeb, avefreeb; - ext4_fsblk_t blocks_per_dir; unsigned int ndirs; - int max_debt, max_dirs, min_inodes; + int max_dirs, min_inodes; ext4_grpblk_t min_blocks; - ext4_group_t i; + ext4_group_t i, grp, g; struct ext4_group_desc *desc; + struct orlov_stats stats; + int flex_size = ext4_flex_bg_size(sbi); + + if (flex_size > 1) { + ngroups = (ngroups + flex_size - 1) >> + sbi->s_log_groups_per_flex; + parent_group >>= sbi->s_log_groups_per_flex; + } freei = percpu_counter_read_positive(&sbi->s_freeinodes_counter); avefreei = freei / ngroups; @@ -462,71 +498,97 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent, do_div(avefreeb, ngroups); ndirs = percpu_counter_read_positive(&sbi->s_dirs_counter); - if ((parent == sb->s_root->d_inode) || - (EXT4_I(parent)->i_flags & EXT4_TOPDIR_FL)) { + if (S_ISDIR(mode) && + ((parent == sb->s_root->d_inode) || + (EXT4_I(parent)->i_flags & EXT4_TOPDIR_FL))) { int best_ndir = inodes_per_group; - ext4_group_t grp; int ret = -1; get_random_bytes(&grp, sizeof(grp)); parent_group = (unsigned)grp % ngroups; for (i = 0; i < ngroups; i++) { - grp = (parent_group + i) % ngroups; - desc = ext4_get_group_desc(sb, grp, NULL); - if (!desc || !ext4_free_inodes_count(sb, desc)) + g = (parent_group + i) % ngroups; + get_orlov_stats(sb, g, flex_size, &stats); + if (!stats.free_inodes) continue; - if (ext4_used_dirs_count(sb, desc) >= best_ndir) + if (stats.used_dirs >= best_ndir) continue; - if (ext4_free_inodes_count(sb, desc) < avefreei) + if (stats.free_inodes < avefreei) continue; - if (ext4_free_blks_count(sb, desc) < avefreeb) + if (stats.free_blocks < avefreeb) continue; - *group = grp; + grp = g; ret = 0; - best_ndir = ext4_used_dirs_count(sb, desc); + best_ndir = stats.used_dirs; + } + if (ret) + goto fallback; + found_flex_bg: + if (flex_size == 1) { + *group = grp; + return 0; + } + + /* + * We pack inodes at the beginning of the flexgroup's + * inode tables. Block allocation decisions will do + * something similar, although regular files will + * start at 2nd block group of the flexgroup. See + * ext4_ext_find_goal() and ext4_find_near(). + */ + grp *= flex_size; + for (i = 0; i < flex_size; i++) { + if (grp+i >= sbi->s_groups_count) + break; + desc = ext4_get_group_desc(sb, grp+i, NULL); + if (desc && ext4_free_inodes_count(sb, desc)) { + *group = grp+i; + return 0; + } } - if (ret == 0) - return ret; goto fallback; } - blocks_per_dir = ext4_blocks_count(es) - freeb; - do_div(blocks_per_dir, ndirs); - max_dirs = ndirs / ngroups + inodes_per_group / 16; - min_inodes = avefreei - inodes_per_group / 4; - min_blocks = avefreeb - EXT4_BLOCKS_PER_GROUP(sb) / 4; + min_inodes = avefreei - inodes_per_group*flex_size / 4; + if (min_inodes < 1) + min_inodes = 1; + min_blocks = avefreeb - EXT4_BLOCKS_PER_GROUP(sb)*flex_size / 4; - max_debt = EXT4_BLOCKS_PER_GROUP(sb); - max_debt /= max_t(int, blocks_per_dir, BLOCK_COST); - if (max_debt * INODE_COST > inodes_per_group) - max_debt = inodes_per_group / INODE_COST; - if (max_debt > 255) - max_debt = 255; - if (max_debt == 0) - max_debt = 1; + /* + * Start looking in the flex group where we last allocated an + * inode for this parent directory + */ + if (EXT4_I(parent)->i_last_alloc_group != ~0) { + parent_group = EXT4_I(parent)->i_last_alloc_group; + if (flex_size > 1) + parent_group >>= sbi->s_log_groups_per_flex; + } for (i = 0; i < ngroups; i++) { - *group = (parent_group + i) % ngroups; - desc = ext4_get_group_desc(sb, *group, NULL); - if (!desc || !ext4_free_inodes_count(sb, desc)) + grp = (parent_group + i) % ngroups; + get_orlov_stats(sb, grp, flex_size, &stats); + if (stats.used_dirs >= max_dirs) continue; - if (ext4_used_dirs_count(sb, desc) >= max_dirs) + if (stats.free_inodes < min_inodes) continue; - if (ext4_free_inodes_count(sb, desc) < min_inodes) + if (stats.free_blocks < min_blocks) continue; - if (ext4_free_blks_count(sb, desc) < min_blocks) - continue; - return 0; + goto found_flex_bg; } fallback: + ngroups = sbi->s_groups_count; + avefreei = freei / ngroups; + parent_group = EXT4_I(parent)->i_block_group; for (i = 0; i < ngroups; i++) { - *group = (parent_group + i) % ngroups; - desc = ext4_get_group_desc(sb, *group, NULL); + grp = (parent_group + i) % ngroups; + desc = ext4_get_group_desc(sb, grp, NULL); if (desc && ext4_free_inodes_count(sb, desc) && - ext4_free_inodes_count(sb, desc) >= avefreei) + ext4_free_inodes_count(sb, desc) >= avefreei) { + *group = grp; return 0; + } } if (avefreei) { @@ -542,12 +604,51 @@ fallback: } static int find_group_other(struct super_block *sb, struct inode *parent, - ext4_group_t *group) + ext4_group_t *group, int mode) { ext4_group_t parent_group = EXT4_I(parent)->i_block_group; ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count; struct ext4_group_desc *desc; - ext4_group_t i; + ext4_group_t i, last; + int flex_size = ext4_flex_bg_size(EXT4_SB(sb)); + + /* + * Try to place the inode is the same flex group as its + * parent. If we can't find space, use the Orlov algorithm to + * find another flex group, and store that information in the + * parent directory's inode information so that use that flex + * group for future allocations. + */ + if (flex_size > 1) { + int retry = 0; + + try_again: + parent_group &= ~(flex_size-1); + last = parent_group + flex_size; + if (last > ngroups) + last = ngroups; + for (i = parent_group; i < last; i++) { + desc = ext4_get_group_desc(sb, i, NULL); + if (desc && ext4_free_inodes_count(sb, desc)) { + *group = i; + return 0; + } + } + if (!retry && EXT4_I(parent)->i_last_alloc_group != ~0) { + retry = 1; + parent_group = EXT4_I(parent)->i_last_alloc_group; + goto try_again; + } + /* + * If this didn't work, use the Orlov search algorithm + * to find a new flex group; we pass in the mode to + * avoid the topdir algorithms. + */ + *group = parent_group + flex_size; + if (*group > ngroups) + *group = 0; + return find_group_orlov(sb, parent, group, mode); + } /* * Try to place the inode in its parent directory @@ -716,10 +817,10 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode) sbi = EXT4_SB(sb); es = sbi->s_es; - if (sbi->s_log_groups_per_flex) { + if (sbi->s_log_groups_per_flex && test_opt(sb, OLDALLOC)) { ret2 = find_group_flex(sb, dir, &group); if (ret2 == -1) { - ret2 = find_group_other(sb, dir, &group); + ret2 = find_group_other(sb, dir, &group, mode); if (ret2 == 0 && once) once = 0; printk(KERN_NOTICE "ext4: find_group_flex " @@ -733,11 +834,12 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode) if (test_opt(sb, OLDALLOC)) ret2 = find_group_dir(sb, dir, &group); else - ret2 = find_group_orlov(sb, dir, &group); + ret2 = find_group_orlov(sb, dir, &group, mode); } else - ret2 = find_group_other(sb, dir, &group); + ret2 = find_group_other(sb, dir, &group, mode); got_group: + EXT4_I(dir)->i_last_alloc_group = group; err = -ENOSPC; if (ret2 == -1) goto out; @@ -894,6 +996,7 @@ got: ei->i_file_acl = 0; ei->i_dtime = 0; ei->i_block_group = group; + ei->i_last_alloc_group = ~0; ext4_set_inode_flags(inode); if (IS_DIRSYNC(inode)) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 71d3ecd5db79..25811507d2b0 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -459,6 +459,8 @@ static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind) ext4_fsblk_t bg_start; ext4_fsblk_t last_block; ext4_grpblk_t colour; + ext4_group_t block_group; + int flex_size = ext4_flex_bg_size(EXT4_SB(inode->i_sb)); /* Try to find previous block */ for (p = ind->p - 1; p >= start; p--) { @@ -474,9 +476,22 @@ static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind) * It is going to be referred to from the inode itself? OK, just put it * into the same cylinder group then. */ - bg_start = ext4_group_first_block_no(inode->i_sb, ei->i_block_group); + block_group = ei->i_block_group; + if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) { + block_group &= ~(flex_size-1); + if (S_ISREG(inode->i_mode)) + block_group++; + } + bg_start = ext4_group_first_block_no(inode->i_sb, block_group); last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1; + /* + * If we are doing delayed allocation, we don't need take + * colour into account. + */ + if (test_opt(inode->i_sb, DELALLOC)) + return bg_start; + if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block) colour = (current->pid % 16) * (EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16); @@ -4287,6 +4302,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) ei->i_disksize = inode->i_size; inode->i_generation = le32_to_cpu(raw_inode->i_generation); ei->i_block_group = iloc.block_group; + ei->i_last_alloc_group = ~0; /* * NOTE! The in-memory inode i_data array is in little-endian order * even on big-endian machines: we do NOT byteswap the block numbers! diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index b038188bd039..b0d6022eaa67 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -1726,6 +1726,7 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac, { unsigned free, fragments; unsigned i, bits; + int flex_size = ext4_flex_bg_size(EXT4_SB(ac->ac_sb)); struct ext4_group_desc *desc; struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group); @@ -1747,6 +1748,12 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac, if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) return 0; + /* Avoid using the first bg of a flexgroup for data files */ + if ((ac->ac_flags & EXT4_MB_HINT_DATA) && + (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) && + ((group % flex_size) == 0)) + return 0; + bits = ac->ac_sb->s_blocksize_bits + 1; for (i = ac->ac_2order; i <= bits; i++) if (grp->bb_counters[i] > 0) From e6f009b0b45220c004672d41a58865e94946104d Mon Sep 17 00:00:00 2001 From: Bryan Donlan Date: Sun, 22 Feb 2009 21:20:25 -0500 Subject: [PATCH 009/478] ext4: return -EIO not -ESTALE on directory traversal through deleted inode ext4_iget() returns -ESTALE if invoked on a deleted inode, in order to report errors to NFS properly. However, in ext4_lookup(), this -ESTALE can be propagated to userspace if the filesystem is corrupted such that a directory entry references a deleted inode. This leads to a misleading error message - "Stale NFS file handle" - and confusion on the part of the admin. The bug can be easily reproduced by creating a new filesystem, making a link to an unused inode using debugfs, then mounting and attempting to ls -l said link. This patch thus changes ext4_lookup to return -EIO if it receives -ESTALE from ext4_iget(), as ext4 does for other filesystem metadata corruption; and also invokes the appropriate ext*_error functions when this case is detected. Signed-off-by: Bryan Donlan Cc: Signed-off-by: Andrew Morton Signed-off-by: "Theodore Ts'o" --- fs/ext4/namei.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index a5ba1a858094..6e1ad68cdc7a 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1077,8 +1077,16 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, stru return ERR_PTR(-EIO); } inode = ext4_iget(dir->i_sb, ino); - if (IS_ERR(inode)) - return ERR_CAST(inode); + if (unlikely(IS_ERR(inode))) { + if (PTR_ERR(inode) == -ESTALE) { + ext4_error(dir->i_sb, __func__, + "deleted inode referenced: %u", + ino); + return ERR_PTR(-EIO); + } else { + return ERR_CAST(inode); + } + } } return d_splice_alias(inode, dentry); } From 56b19868aca856a7d7bf20c3a7a1030e4fd75b2b Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 12 Mar 2009 09:51:20 -0400 Subject: [PATCH 010/478] ext4: Add checks to validate extent entries. This patch adds checks to validate the extent entries along with extent headers, to avoid crashes caused by corrupt filesystems. Signed-off-by: Aneesh Kumar K.V Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 81 +++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 71 insertions(+), 10 deletions(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index aa3431856c9a..ee40b7c52c8c 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -324,7 +324,64 @@ ext4_ext_max_entries(struct inode *inode, int depth) return max; } -static int __ext4_ext_check_header(const char *function, struct inode *inode, +static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext) +{ + ext4_fsblk_t block = ext_pblock(ext); + int len = ext4_ext_get_actual_len(ext); + struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es; + if (unlikely(block < le32_to_cpu(es->s_first_data_block) || + ((block + len) > ext4_blocks_count(es)))) + return 0; + else + return 1; +} + +static int ext4_valid_extent_idx(struct inode *inode, + struct ext4_extent_idx *ext_idx) +{ + ext4_fsblk_t block = idx_pblock(ext_idx); + struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es; + if (unlikely(block < le32_to_cpu(es->s_first_data_block) || + (block > ext4_blocks_count(es)))) + return 0; + else + return 1; +} + +static int ext4_valid_extent_entries(struct inode *inode, + struct ext4_extent_header *eh, + int depth) +{ + struct ext4_extent *ext; + struct ext4_extent_idx *ext_idx; + unsigned short entries; + if (eh->eh_entries == 0) + return 1; + + entries = le16_to_cpu(eh->eh_entries); + + if (depth == 0) { + /* leaf entries */ + ext = EXT_FIRST_EXTENT(eh); + while (entries) { + if (!ext4_valid_extent(inode, ext)) + return 0; + ext++; + entries--; + } + } else { + ext_idx = EXT_FIRST_INDEX(eh); + while (entries) { + if (!ext4_valid_extent_idx(inode, ext_idx)) + return 0; + ext_idx++; + entries--; + } + } + return 1; +} + +static int __ext4_ext_check(const char *function, struct inode *inode, struct ext4_extent_header *eh, int depth) { @@ -352,11 +409,15 @@ static int __ext4_ext_check_header(const char *function, struct inode *inode, error_msg = "invalid eh_entries"; goto corrupted; } + if (!ext4_valid_extent_entries(inode, eh, depth)) { + error_msg = "invalid extent entries"; + goto corrupted; + } return 0; corrupted: ext4_error(inode->i_sb, function, - "bad header in inode #%lu: %s - magic %x, " + "bad header/extent in inode #%lu: %s - magic %x, " "entries %u, max %u(%u), depth %u(%u)", inode->i_ino, error_msg, le16_to_cpu(eh->eh_magic), le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max), @@ -365,8 +426,8 @@ corrupted: return -EIO; } -#define ext4_ext_check_header(inode, eh, depth) \ - __ext4_ext_check_header(__func__, inode, eh, depth) +#define ext4_ext_check(inode, eh, depth) \ + __ext4_ext_check(__func__, inode, eh, depth) #ifdef EXT_DEBUG static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path) @@ -570,7 +631,7 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block, eh = ext_inode_hdr(inode); depth = ext_depth(inode); - if (ext4_ext_check_header(inode, eh, depth)) + if (ext4_ext_check(inode, eh, depth)) return ERR_PTR(-EIO); @@ -607,7 +668,7 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block, path[ppos].p_hdr = eh; i--; - if (ext4_ext_check_header(inode, eh, i)) + if (ext4_ext_check(inode, eh, i)) goto err; } @@ -1204,7 +1265,7 @@ got_index: return -EIO; eh = ext_block_hdr(bh); /* subtract from p_depth to get proper eh_depth */ - if (ext4_ext_check_header(inode, eh, path->p_depth - depth)) { + if (ext4_ext_check(inode, eh, path->p_depth - depth)) { put_bh(bh); return -EIO; } @@ -1217,7 +1278,7 @@ got_index: if (bh == NULL) return -EIO; eh = ext_block_hdr(bh); - if (ext4_ext_check_header(inode, eh, path->p_depth - depth)) { + if (ext4_ext_check(inode, eh, path->p_depth - depth)) { put_bh(bh); return -EIO; } @@ -2160,7 +2221,7 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start) return -ENOMEM; } path[0].p_hdr = ext_inode_hdr(inode); - if (ext4_ext_check_header(inode, path[0].p_hdr, depth)) { + if (ext4_ext_check(inode, path[0].p_hdr, depth)) { err = -EIO; goto out; } @@ -2214,7 +2275,7 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start) err = -EIO; break; } - if (ext4_ext_check_header(inode, ext_block_hdr(bh), + if (ext4_ext_check(inode, ext_block_hdr(bh), depth - i - 1)) { err = -EIO; break; From 7a262f7c69163cd4811f2f838faef5c5b18439c9 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Fri, 27 Mar 2009 16:39:58 -0400 Subject: [PATCH 011/478] ext4: Validate extent details only when read from the disk Make sure we validate extent details only when read from the disk. Signed-off-by: Aneesh Kumar K.V Signed-off-by: Thiemo Nagel Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4_extents.h | 1 + fs/ext4/extents.c | 25 ++++++++++++++++++------- fs/ext4/inode.c | 10 ++++++++++ 3 files changed, 29 insertions(+), 7 deletions(-) diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h index 18cb67b2cbbc..f0c3ec85bd48 100644 --- a/fs/ext4/ext4_extents.h +++ b/fs/ext4/ext4_extents.h @@ -241,5 +241,6 @@ extern int ext4_ext_search_left(struct inode *, struct ext4_ext_path *, extern int ext4_ext_search_right(struct inode *, struct ext4_ext_path *, ext4_lblk_t *, ext4_fsblk_t *); extern void ext4_ext_drop_refs(struct ext4_ext_path *); +extern int ext4_ext_check_inode(struct inode *inode); #endif /* _EXT4_EXTENTS */ diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index ee40b7c52c8c..ac77d8b8251d 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -429,6 +429,11 @@ corrupted: #define ext4_ext_check(inode, eh, depth) \ __ext4_ext_check(__func__, inode, eh, depth) +int ext4_ext_check_inode(struct inode *inode) +{ + return ext4_ext_check(inode, ext_inode_hdr(inode), ext_depth(inode)); +} + #ifdef EXT_DEBUG static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path) { @@ -631,9 +636,6 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block, eh = ext_inode_hdr(inode); depth = ext_depth(inode); - if (ext4_ext_check(inode, eh, depth)) - return ERR_PTR(-EIO); - /* account possible depth increase */ if (!path) { @@ -649,6 +651,8 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block, i = depth; /* walk through the tree */ while (i) { + int need_to_validate = 0; + ext_debug("depth %d: num %d, max %d\n", ppos, le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max)); @@ -657,10 +661,17 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block, path[ppos].p_depth = i; path[ppos].p_ext = NULL; - bh = sb_bread(inode->i_sb, path[ppos].p_block); - if (!bh) + bh = sb_getblk(inode->i_sb, path[ppos].p_block); + if (unlikely(!bh)) goto err; - + if (!bh_uptodate_or_lock(bh)) { + if (bh_submit_read(bh) < 0) { + put_bh(bh); + goto err; + } + /* validate the extent entries */ + need_to_validate = 1; + } eh = ext_block_hdr(bh); ppos++; BUG_ON(ppos > depth); @@ -668,7 +679,7 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block, path[ppos].p_hdr = eh; i--; - if (ext4_ext_check(inode, eh, i)) + if (need_to_validate && ext4_ext_check(inode, eh, i)) goto err; } diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 25811507d2b0..cd0399db0ef1 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4345,6 +4345,16 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) (__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32; } + if (ei->i_flags & EXT4_EXTENTS_FL) { + /* Validate extent which is part of inode */ + ret = ext4_ext_check_inode(inode); + if (ret) { + brelse(bh); + goto bad_inode; + } + + } + if (S_ISREG(inode->i_mode)) { inode->i_op = &ext4_file_inode_operations; inode->i_fop = &ext4_file_operations; From 722bde6875bfb49a0c84e5601eb82dd7ac02d27c Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 23 Feb 2009 00:51:57 -0500 Subject: [PATCH 012/478] ext4: Add fine print for the 32000 subdirectory limit Some poeple are reading the ext4 feature list too literally and create dubious test cases involving very long filenames and 1k blocksize and then complain when they run into an htree-imposed limit. So add fine print to the "fix 32000 subdirectory limit" ext4 feature. Signed-off-by: "Theodore Ts'o" --- Documentation/filesystems/ext4.txt | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4.txt index cec829bc7291..5c484aec2bab 100644 --- a/Documentation/filesystems/ext4.txt +++ b/Documentation/filesystems/ext4.txt @@ -85,7 +85,7 @@ Note: More extensive information for getting started with ext4 can be * extent format more robust in face of on-disk corruption due to magics, * internal redundancy in tree * improved file allocation (multi-block alloc) -* fix 32000 subdirectory limit +* lift 32000 subdirectory limit imposed by i_links_count[1] * nsec timestamps for mtime, atime, ctime, create time * inode version field on disk (NFSv4, Lustre) * reduced e2fsck time via uninit_bg feature @@ -100,6 +100,9 @@ Note: More extensive information for getting started with ext4 can be * efficent new ordered mode in JBD2 and ext4(avoid using buffer head to force the ordering) +[1] Filesystems with a block size of 1k may see a limit imposed by the +directory hash tree having a maximum depth of two. + 2.2 Candidate features for future inclusion * Online defrag (patches available but not well tested) From ed5bde0bf8995d7d8c0b5a9c33e624a945f333ef Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 23 Feb 2009 10:48:07 -0500 Subject: [PATCH 013/478] ext4: Simplify delalloc implementation by removing mpd.get_block This parameter was always set to ext4_da_get_block_write(). Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 125 ++++++++++++++++++++++-------------------------- 1 file changed, 58 insertions(+), 67 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index cd0399db0ef1..924ba8afc227 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1705,7 +1705,6 @@ struct mpage_da_data { struct inode *inode; struct buffer_head lbh; /* extent of blocks */ unsigned long first_page, next_page; /* extent of pages */ - get_block_t *get_block; struct writeback_control *wbc; int io_done; int pages_written; @@ -1719,7 +1718,6 @@ struct mpage_da_data { * @mpd->inode: inode * @mpd->first_page: first page of the extent * @mpd->next_page: page after the last page of the extent - * @mpd->get_block: the filesystem's block mapper function * * By the time mpage_da_submit_io() is called we expect all blocks * to be allocated. this may be wrong if allocation failed. @@ -1929,16 +1927,60 @@ static void ext4_print_free_blocks(struct inode *inode) return; } +#define EXT4_DELALLOC_RSVED 1 +static int ext4_da_get_block_write(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) +{ + int ret; + unsigned max_blocks = bh_result->b_size >> inode->i_blkbits; + loff_t disksize = EXT4_I(inode)->i_disksize; + handle_t *handle = NULL; + + handle = ext4_journal_current_handle(); + BUG_ON(!handle); + ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks, + bh_result, create, 0, EXT4_DELALLOC_RSVED); + if (ret <= 0) + return ret; + + bh_result->b_size = (ret << inode->i_blkbits); + + if (ext4_should_order_data(inode)) { + int retval; + retval = ext4_jbd2_file_inode(handle, inode); + if (retval) + /* + * Failed to add inode for ordered mode. Don't + * update file size + */ + return retval; + } + + /* + * Update on-disk size along with block allocation we don't + * use 'extend_disksize' as size may change within already + * allocated block -bzzz + */ + disksize = ((loff_t) iblock + ret) << inode->i_blkbits; + if (disksize > i_size_read(inode)) + disksize = i_size_read(inode); + if (disksize > EXT4_I(inode)->i_disksize) { + ext4_update_i_disksize(inode, disksize); + ret = ext4_mark_inode_dirty(handle, inode); + return ret; + } + return 0; +} + /* * mpage_da_map_blocks - go through given space * * @mpd->lbh - bh describing space - * @mpd->get_block - the filesystem's block mapper function * * The function skips space we know is already mapped to disk blocks. * */ -static int mpage_da_map_blocks(struct mpage_da_data *mpd) +static int mpage_da_map_blocks(struct mpage_da_data *mpd) { int err = 0; struct buffer_head new; @@ -1960,30 +2002,29 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd) */ if (!new.b_size) return 0; - err = mpd->get_block(mpd->inode, next, &new, 1); - if (err) { - /* If get block returns with error - * we simply return. Later writepage - * will redirty the page and writepages - * will find the dirty page again + err = ext4_da_get_block_write(mpd->inode, next, &new, 1); + if (err) { + /* + * If get block returns with error we simply + * return. Later writepage will redirty the page and + * writepages will find the dirty page again */ if (err == -EAGAIN) return 0; if (err == -ENOSPC && - ext4_count_free_blocks(mpd->inode->i_sb)) { + ext4_count_free_blocks(mpd->inode->i_sb)) { mpd->retval = err; return 0; } /* - * get block failure will cause us - * to loop in writepages. Because - * a_ops->writepage won't be able to - * make progress. The page will be redirtied - * by writepage and writepages will again - * try to write the same. + * get block failure will cause us to loop in + * writepages, because a_ops->writepage won't be able + * to make progress. The page will be redirtied by + * writepage and writepages will again try to write + * the same. */ printk(KERN_EMERG "%s block allocation failed for inode %lu " "at logical offset %llu with max blocks " @@ -2212,7 +2253,6 @@ static int __mpage_da_writepage(struct page *page, * * @mapping: address space structure to write * @wbc: subtract the number of written pages from *@wbc->nr_to_write - * @get_block: the filesystem's block mapper function. * * This is a library function, which implements the writepages() * address_space_operation. @@ -2223,9 +2263,6 @@ static int mpage_da_writepages(struct address_space *mapping, { int ret; - if (!mpd->get_block) - return generic_writepages(mapping, wbc); - mpd->lbh.b_size = 0; mpd->lbh.b_state = 0; mpd->lbh.b_blocknr = 0; @@ -2289,51 +2326,6 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, return ret; } -#define EXT4_DELALLOC_RSVED 1 -static int ext4_da_get_block_write(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create) -{ - int ret; - unsigned max_blocks = bh_result->b_size >> inode->i_blkbits; - loff_t disksize = EXT4_I(inode)->i_disksize; - handle_t *handle = NULL; - - handle = ext4_journal_current_handle(); - BUG_ON(!handle); - ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks, - bh_result, create, 0, EXT4_DELALLOC_RSVED); - if (ret > 0) { - - bh_result->b_size = (ret << inode->i_blkbits); - - if (ext4_should_order_data(inode)) { - int retval; - retval = ext4_jbd2_file_inode(handle, inode); - if (retval) - /* - * Failed to add inode for ordered - * mode. Don't update file size - */ - return retval; - } - - /* - * Update on-disk size along with block allocation - * we don't use 'extend_disksize' as size may change - * within already allocated block -bzzz - */ - disksize = ((loff_t) iblock + ret) << inode->i_blkbits; - if (disksize > i_size_read(inode)) - disksize = i_size_read(inode); - if (disksize > EXT4_I(inode)->i_disksize) { - ext4_update_i_disksize(inode, disksize); - ret = ext4_mark_inode_dirty(handle, inode); - return ret; - } - ret = 0; - } - return ret; -} static int ext4_bh_unmapped_or_delay(handle_t *handle, struct buffer_head *bh) { @@ -2584,7 +2576,6 @@ retry: dump_stack(); goto out_writepages; } - mpd.get_block = ext4_da_get_block_write; ret = mpage_da_writepages(mapping, wbc, &mpd); ext4_journal_stop(handle); From 8dc207c0e7a259e7122ddfaf56b8bbbc3c92d685 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 23 Feb 2009 06:46:01 -0500 Subject: [PATCH 014/478] ext4: Save stack space by removing fake buffer heads Struct mpage_da_data and mpage_add_bh_to_extent() use a fake struct buffer_head which is 104 bytes on an x86_64 system, but only use 24 bytes of the structure. On systems that use a spinlock for atomic_t, the stack savings will be even greater. It turns out that using a fake struct buffer_head doesn't even save that much code, and it makes the code more confusing since it's not used as a "real" buffer head. So just store pass b_size and b_state in mpage_add_bh_to_extent(), and store b_size, b_state, and b_block_nr in the mpage_da_data structure. Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 83 +++++++++++++++++++++++-------------------------- 1 file changed, 39 insertions(+), 44 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 924ba8afc227..008b28a859d0 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1703,7 +1703,9 @@ static void ext4_da_page_release_reservation(struct page *page, struct mpage_da_data { struct inode *inode; - struct buffer_head lbh; /* extent of blocks */ + sector_t b_blocknr; /* start block number of extent */ + size_t b_size; /* size of extent */ + unsigned long b_state; /* state of the extent */ unsigned long first_page, next_page; /* extent of pages */ struct writeback_control *wbc; int io_done; @@ -1737,7 +1739,7 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd) /* * We need to start from the first_page to the next_page - 1 * to make sure we also write the mapped dirty buffer_heads. - * If we look at mpd->lbh.b_blocknr we would only be looking + * If we look at mpd->b_blocknr we would only be looking * at the currently mapped buffer_heads. */ index = mpd->first_page; @@ -1975,7 +1977,7 @@ static int ext4_da_get_block_write(struct inode *inode, sector_t iblock, /* * mpage_da_map_blocks - go through given space * - * @mpd->lbh - bh describing space + * @mpd - bh describing space * * The function skips space we know is already mapped to disk blocks. * @@ -1984,18 +1986,18 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd) { int err = 0; struct buffer_head new; - struct buffer_head *lbh = &mpd->lbh; sector_t next; /* * We consider only non-mapped and non-allocated blocks */ - if (buffer_mapped(lbh) && !buffer_delay(lbh)) + if ((mpd->b_state & (1 << BH_Mapped)) && + !(mpd->b_state & (1 << BH_Delay))) return 0; - new.b_state = lbh->b_state; + new.b_state = mpd->b_state; new.b_blocknr = 0; - new.b_size = lbh->b_size; - next = lbh->b_blocknr; + new.b_size = mpd->b_size; + next = mpd->b_blocknr; /* * If we didn't accumulate anything * to write simply return @@ -2031,7 +2033,7 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd) "%zd with error %d\n", __func__, mpd->inode->i_ino, (unsigned long long)next, - lbh->b_size >> mpd->inode->i_blkbits, err); + mpd->b_size >> mpd->inode->i_blkbits, err); printk(KERN_EMERG "This should not happen.!! " "Data will be lost\n"); if (err == -ENOSPC) { @@ -2039,7 +2041,7 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd) } /* invlaidate all the pages */ ext4_da_block_invalidatepages(mpd, next, - lbh->b_size >> mpd->inode->i_blkbits); + mpd->b_size >> mpd->inode->i_blkbits); return err; } BUG_ON(new.b_size == 0); @@ -2051,7 +2053,8 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd) * If blocks are delayed marked, we need to * put actual blocknr and drop delayed bit */ - if (buffer_delay(lbh) || buffer_unwritten(lbh)) + if ((mpd->b_state & (1 << BH_Delay)) || + (mpd->b_state & (1 << BH_Unwritten))) mpage_put_bnr_to_bhs(mpd, next, &new); return 0; @@ -2070,12 +2073,11 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd) * the function is used to collect contig. blocks in same state */ static void mpage_add_bh_to_extent(struct mpage_da_data *mpd, - sector_t logical, struct buffer_head *bh) + sector_t logical, size_t b_size, + unsigned long b_state) { sector_t next; - size_t b_size = bh->b_size; - struct buffer_head *lbh = &mpd->lbh; - int nrblocks = lbh->b_size >> mpd->inode->i_blkbits; + int nrblocks = mpd->b_size >> mpd->inode->i_blkbits; /* check if thereserved journal credits might overflow */ if (!(EXT4_I(mpd->inode)->i_flags & EXT4_EXTENTS_FL)) { @@ -2102,19 +2104,19 @@ static void mpage_add_bh_to_extent(struct mpage_da_data *mpd, /* * First block in the extent */ - if (lbh->b_size == 0) { - lbh->b_blocknr = logical; - lbh->b_size = b_size; - lbh->b_state = bh->b_state & BH_FLAGS; + if (mpd->b_size == 0) { + mpd->b_blocknr = logical; + mpd->b_size = b_size; + mpd->b_state = b_state & BH_FLAGS; return; } - next = lbh->b_blocknr + nrblocks; + next = mpd->b_blocknr + nrblocks; /* * Can we merge the block to our big extent? */ - if (logical == next && (bh->b_state & BH_FLAGS) == lbh->b_state) { - lbh->b_size += b_size; + if (logical == next && (b_state & BH_FLAGS) == mpd->b_state) { + mpd->b_size += b_size; return; } @@ -2143,7 +2145,7 @@ static int __mpage_da_writepage(struct page *page, { struct mpage_da_data *mpd = data; struct inode *inode = mpd->inode; - struct buffer_head *bh, *head, fake; + struct buffer_head *bh, *head; sector_t logical; if (mpd->io_done) { @@ -2185,9 +2187,9 @@ static int __mpage_da_writepage(struct page *page, /* * ... and blocks */ - mpd->lbh.b_size = 0; - mpd->lbh.b_state = 0; - mpd->lbh.b_blocknr = 0; + mpd->b_size = 0; + mpd->b_state = 0; + mpd->b_blocknr = 0; } mpd->next_page = page->index + 1; @@ -2195,16 +2197,8 @@ static int __mpage_da_writepage(struct page *page, (PAGE_CACHE_SHIFT - inode->i_blkbits); if (!page_has_buffers(page)) { - /* - * There is no attached buffer heads yet (mmap?) - * we treat the page asfull of dirty blocks - */ - bh = &fake; - bh->b_size = PAGE_CACHE_SIZE; - bh->b_state = 0; - set_buffer_dirty(bh); - set_buffer_uptodate(bh); - mpage_add_bh_to_extent(mpd, logical, bh); + mpage_add_bh_to_extent(mpd, logical, PAGE_CACHE_SIZE, + (1 << BH_Dirty) | (1 << BH_Uptodate)); if (mpd->io_done) return MPAGE_DA_EXTENT_TAIL; } else { @@ -2222,8 +2216,10 @@ static int __mpage_da_writepage(struct page *page, * with the page in ext4_da_writepage */ if (buffer_dirty(bh) && - (!buffer_mapped(bh) || buffer_delay(bh))) { - mpage_add_bh_to_extent(mpd, logical, bh); + (!buffer_mapped(bh) || buffer_delay(bh))) { + mpage_add_bh_to_extent(mpd, logical, + bh->b_size, + bh->b_state); if (mpd->io_done) return MPAGE_DA_EXTENT_TAIL; } else if (buffer_dirty(bh) && (buffer_mapped(bh))) { @@ -2235,9 +2231,8 @@ static int __mpage_da_writepage(struct page *page, * unmapped buffer_head later we need to * use the b_state flag of that buffer_head. */ - if (mpd->lbh.b_size == 0) - mpd->lbh.b_state = - bh->b_state & BH_FLAGS; + if (mpd->b_size == 0) + mpd->b_state = bh->b_state & BH_FLAGS; } logical++; } while ((bh = bh->b_this_page) != head); @@ -2263,9 +2258,9 @@ static int mpage_da_writepages(struct address_space *mapping, { int ret; - mpd->lbh.b_size = 0; - mpd->lbh.b_state = 0; - mpd->lbh.b_blocknr = 0; + mpd->b_size = 0; + mpd->b_state = 0; + mpd->b_blocknr = 0; mpd->first_page = 0; mpd->next_page = 0; mpd->io_done = 0; From f63e6005bc63acc0a6bc3bdb8f971dcfbd827185 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 23 Feb 2009 16:42:39 -0500 Subject: [PATCH 015/478] ext4: Simplify delalloc code by removing mpage_da_writepages() The mpage_da_writepages() function is only used in one place, so inline it to simplify the call stack and make the code easier to understand. Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 74 +++++++++++++++++++++---------------------------- 1 file changed, 32 insertions(+), 42 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 008b28a859d0..24d0f9d2b320 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2241,47 +2241,6 @@ static int __mpage_da_writepage(struct page *page, return 0; } -/* - * mpage_da_writepages - walk the list of dirty pages of the given - * address space, allocates non-allocated blocks, maps newly-allocated - * blocks to existing bhs and issue IO them - * - * @mapping: address space structure to write - * @wbc: subtract the number of written pages from *@wbc->nr_to_write - * - * This is a library function, which implements the writepages() - * address_space_operation. - */ -static int mpage_da_writepages(struct address_space *mapping, - struct writeback_control *wbc, - struct mpage_da_data *mpd) -{ - int ret; - - mpd->b_size = 0; - mpd->b_state = 0; - mpd->b_blocknr = 0; - mpd->first_page = 0; - mpd->next_page = 0; - mpd->io_done = 0; - mpd->pages_written = 0; - mpd->retval = 0; - - ret = write_cache_pages(mapping, wbc, __mpage_da_writepage, mpd); - /* - * Handle last extent of pages - */ - if (!mpd->io_done && mpd->next_page != mpd->first_page) { - if (mpage_da_map_blocks(mpd) == 0) - mpage_da_submit_io(mpd); - - mpd->io_done = 1; - ret = MPAGE_DA_EXTENT_TAIL; - } - wbc->nr_to_write -= mpd->pages_written; - return ret; -} - /* * this is a special callback for ->write_begin() only * it's intention is to return mapped block or reserve space @@ -2571,7 +2530,38 @@ retry: dump_stack(); goto out_writepages; } - ret = mpage_da_writepages(mapping, wbc, &mpd); + + /* + * Now call __mpage_da_writepage to find the next + * contiguous region of logical blocks that need + * blocks to be allocated by ext4. We don't actually + * submit the blocks for I/O here, even though + * write_cache_pages thinks it will, and will set the + * pages as clean for write before calling + * __mpage_da_writepage(). + */ + mpd.b_size = 0; + mpd.b_state = 0; + mpd.b_blocknr = 0; + mpd.first_page = 0; + mpd.next_page = 0; + mpd.io_done = 0; + mpd.pages_written = 0; + mpd.retval = 0; + ret = write_cache_pages(mapping, wbc, __mpage_da_writepage, + &mpd); + /* + * If we have a contigous extent of pages and we + * haven't done the I/O yet, map the blocks and submit + * them for I/O. + */ + if (!mpd.io_done && mpd.next_page != mpd.first_page) { + if (mpage_da_map_blocks(&mpd) == 0) + mpage_da_submit_io(&mpd); + mpd.io_done = 1; + ret = MPAGE_DA_EXTENT_TAIL; + } + wbc->nr_to_write -= mpd.pages_written; ext4_journal_stop(handle); From ccd2506bd43113659aa904d5bea5d1300605e2a6 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Thu, 26 Feb 2009 01:04:07 -0500 Subject: [PATCH 016/478] ext4: add EXT4_IOC_ALLOC_DA_BLKS ioctl Add an ioctl which forces all of the delay allocated blocks to be allocated. This also provides a function ext4_alloc_da_blocks() which will be used by the following commits to force files to be fully allocated to preserve application-expected ext3 behaviour. Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 3 +++ fs/ext4/inode.c | 42 ++++++++++++++++++++++++++++++++++++++++++ fs/ext4/ioctl.c | 14 ++++++++++++++ 3 files changed, 59 insertions(+) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 096456c8559b..b0ea70cc94db 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -317,7 +317,9 @@ struct ext4_new_group_data { #define EXT4_IOC_GROUP_EXTEND _IOW('f', 7, unsigned long) #define EXT4_IOC_GROUP_ADD _IOW('f', 8, struct ext4_new_group_input) #define EXT4_IOC_MIGRATE _IO('f', 9) + /* note ioctl 10 reserved for an early version of the FIEMAP ioctl */ /* note ioctl 11 reserved for filesystem-independent FIEMAP ioctl */ +#define EXT4_IOC_ALLOC_DA_BLKS _IO('f', 12) /* * ioctl commands in 32 bit emulation @@ -1094,6 +1096,7 @@ extern int ext4_can_truncate(struct inode *inode); extern void ext4_truncate(struct inode *); extern void ext4_set_inode_flags(struct inode *); extern void ext4_get_inode_flags(struct ext4_inode_info *); +extern int ext4_alloc_da_blocks(struct inode *inode); extern void ext4_set_aops(struct inode *inode); extern int ext4_writepage_trans_blocks(struct inode *); extern int ext4_meta_trans_blocks(struct inode *, int nrblocks, int idxblocks); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 24d0f9d2b320..8dd3d5de5861 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2837,6 +2837,48 @@ out: return; } +/* + * Force all delayed allocation blocks to be allocated for a given inode. + */ +int ext4_alloc_da_blocks(struct inode *inode) +{ + if (!EXT4_I(inode)->i_reserved_data_blocks && + !EXT4_I(inode)->i_reserved_meta_blocks) + return 0; + + /* + * We do something simple for now. The filemap_flush() will + * also start triggering a write of the data blocks, which is + * not strictly speaking necessary (and for users of + * laptop_mode, not even desirable). However, to do otherwise + * would require replicating code paths in: + * + * ext4_da_writepages() -> + * write_cache_pages() ---> (via passed in callback function) + * __mpage_da_writepage() --> + * mpage_add_bh_to_extent() + * mpage_da_map_blocks() + * + * The problem is that write_cache_pages(), located in + * mm/page-writeback.c, marks pages clean in preparation for + * doing I/O, which is not desirable if we're not planning on + * doing I/O at all. + * + * We could call write_cache_pages(), and then redirty all of + * the pages by calling redirty_page_for_writeback() but that + * would be ugly in the extreme. So instead we would need to + * replicate parts of the code in the above functions, + * simplifying them becuase we wouldn't actually intend to + * write out the pages, but rather only collect contiguous + * logical block extents, call the multi-block allocator, and + * then update the buffer heads with the block allocations. + * + * For now, though, we'll cheat by calling filemap_flush(), + * which will map the blocks, and start the I/O, but not + * actually wait for the I/O to complete. + */ + return filemap_flush(inode->i_mapping); +} /* * bmap() is special. It gets used by applications such as lilo and by diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 22dd29f3ebc9..91e75f7a9e73 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -262,6 +262,20 @@ setversion_out: return err; } + case EXT4_IOC_ALLOC_DA_BLKS: + { + int err; + if (!is_owner_or_cap(inode)) + return -EACCES; + + err = mnt_want_write(filp->f_path.mnt); + if (err) + return err; + err = ext4_alloc_da_blocks(inode); + mnt_drop_write(filp->f_path.mnt); + return err; + } + default: return -ENOTTY; } From 7d8f9f7d150dded7b68e61ca6403a1f166fb4edf Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Tue, 24 Feb 2009 08:21:14 -0500 Subject: [PATCH 017/478] ext4: Automatically allocate delay allocated blocks on close When closing a file that had been previously truncated, force any delay allocated blocks that to be allocated so that if the filesystem is mounted with data=ordered, the data blocks will be pushed out to disk along with the journal commit. Many application programs expect this, so we do this to avoid zero length files if the system crashes unexpectedly. Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 1 + fs/ext4/file.c | 4 ++++ fs/ext4/inode.c | 3 +++ 3 files changed, 8 insertions(+) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index b0ea70cc94db..1b0c17364631 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -270,6 +270,7 @@ static inline __u32 ext4_mask_flags(umode_t mode, __u32 flags) #define EXT4_STATE_NEW 0x00000002 /* inode is newly created */ #define EXT4_STATE_XATTR 0x00000004 /* has in-inode xattrs */ #define EXT4_STATE_NO_EXPAND 0x00000008 /* No space for expansion */ +#define EXT4_STATE_DA_ALLOC_CLOSE 0x00000010 /* Alloc DA blks on close */ /* Used to pass group descriptor data when online resize is done */ struct ext4_new_group_input { diff --git a/fs/ext4/file.c b/fs/ext4/file.c index f731cb545a03..06df8272c639 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -33,6 +33,10 @@ */ static int ext4_release_file(struct inode *inode, struct file *filp) { + if (EXT4_I(inode)->i_state & EXT4_STATE_DA_ALLOC_CLOSE) { + ext4_alloc_da_blocks(inode); + EXT4_I(inode)->i_state &= ~EXT4_STATE_DA_ALLOC_CLOSE; + } /* if we are the last writer on the inode, drop the block reservation */ if ((filp->f_mode & FMODE_WRITE) && (atomic_read(&inode->i_writecount) == 1)) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 8dd3d5de5861..80ed6dc9c9d2 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3901,6 +3901,9 @@ void ext4_truncate(struct inode *inode) if (!ext4_can_truncate(inode)) return; + if (inode->i_size == 0) + ei->i_state |= EXT4_STATE_DA_ALLOC_CLOSE; + if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) { ext4_ext_truncate(inode); return; From 8750c6d5fcbd3342b3d908d157f81d345c5325a7 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 23 Feb 2009 23:05:27 -0500 Subject: [PATCH 018/478] ext4: Automatically allocate delay allocated blocks on rename When renaming a file such that a link to another inode is overwritten, force any delay allocated blocks that to be allocated so that if the filesystem is mounted with data=ordered, the data blocks will be pushed out to disk along with the journal commit. Many application programs expect this, so we do this to avoid zero length files if the system crashes unexpectedly. Signed-off-by: "Theodore Ts'o" --- fs/ext4/namei.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 6e1ad68cdc7a..eb20246c8965 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -2357,7 +2357,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *old_inode, *new_inode; struct buffer_head *old_bh, *new_bh, *dir_bh; struct ext4_dir_entry_2 *old_de, *new_de; - int retval; + int retval, force_da_alloc = 0; old_bh = new_bh = dir_bh = NULL; @@ -2497,6 +2497,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, ext4_mark_inode_dirty(handle, new_inode); if (!new_inode->i_nlink) ext4_orphan_add(handle, new_inode); + force_da_alloc = 1; } retval = 0; @@ -2505,6 +2506,8 @@ end_rename: brelse(old_bh); brelse(new_bh); ext4_journal_stop(handle); + if (retval == 0 && force_da_alloc) + ext4_alloc_da_blocks(old_inode); return retval; } From d6014301b5599fba395c42a1e96a7fe86f7d0b2d Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Fri, 27 Mar 2009 22:36:43 -0400 Subject: [PATCH 019/478] ext4: Fix discard of inode prealloc space with delayed allocation. With delayed allocation we should not/cannot discard inode prealloc space during file close. We would still have dirty pages for which we haven't allocated blocks yet. With this fix after each get_blocks request we check whether we have zero reserved blocks and if yes and we don't have any writers on the file we discard inode prealloc space. Signed-off-by: Aneesh Kumar K.V Signed-off-by: "Theodore Ts'o" --- fs/ext4/file.c | 3 ++- fs/ext4/inode.c | 9 ++++++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 06df8272c639..588af8c77246 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -39,7 +39,8 @@ static int ext4_release_file(struct inode *inode, struct file *filp) } /* if we are the last writer on the inode, drop the block reservation */ if ((filp->f_mode & FMODE_WRITE) && - (atomic_read(&inode->i_writecount) == 1)) + (atomic_read(&inode->i_writecount) == 1) && + !EXT4_I(inode)->i_reserved_data_blocks) { down_write(&EXT4_I(inode)->i_data_sem); ext4_discard_preallocations(inode); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 80ed6dc9c9d2..7dcac9d7e491 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1067,9 +1067,16 @@ static void ext4_da_update_reserve_space(struct inode *inode, int used) /* * free those over-booking quota for metadata blocks */ - if (mdb_free) vfs_dq_release_reservation_block(inode, mdb_free); + + /* + * If we have done all the pending block allocations and if + * there aren't any writers on the inode, we can discard the + * inode's preallocations. + */ + if (!total && (atomic_read(&inode->i_writecount) == 0)) + ext4_discard_preallocations(inode); } /* From afc32f7ee9febc020c73da61402351d4c90437f3 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sat, 28 Feb 2009 19:39:58 -0500 Subject: [PATCH 020/478] ext4: Track lifetime disk writes Add a new superblock value which tracks the lifetime amount of writes to the filesystem. This is useful in estimating the amount of wear on solid state drives (SSD's) caused by writes to the filesystem. Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 3 ++- fs/ext4/ext4_sb.h | 4 ++++ fs/ext4/super.c | 7 +++++++ 3 files changed, 13 insertions(+), 1 deletion(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 1b0c17364631..0bd39188531c 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -683,7 +683,8 @@ struct ext4_super_block { __u8 s_log_groups_per_flex; /* FLEX_BG group size */ __u8 s_reserved_char_pad2; __le16 s_reserved_pad; - __u32 s_reserved[162]; /* Padding to the end of the block */ + __le64 s_kbytes_written; /* nr of lifetime kilobytes written */ + __u32 s_reserved[160]; /* Padding to the end of the block */ }; #ifdef __KERNEL__ diff --git a/fs/ext4/ext4_sb.h b/fs/ext4/ext4_sb.h index 4e4d9cc3f40d..50ab1169c378 100644 --- a/fs/ext4/ext4_sb.h +++ b/fs/ext4/ext4_sb.h @@ -142,6 +142,10 @@ struct ext4_sb_info { /* locality groups */ struct ext4_locality_group *s_locality_groups; + /* for write statistics */ + unsigned long s_sectors_written_start; + u64 s_kbytes_written; + unsigned int s_log_groups_per_flex; struct flex_groups *s_flex_groups; }; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index a3768709ce05..30fc27cdf8fc 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -2035,6 +2035,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sbi->s_resgid = EXT4_DEF_RESGID; sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS; sbi->s_sb_block = sb_block; + sbi->s_sectors_written_start = part_stat_read(sb->s_bdev->bd_part, + sectors[1]); unlock_kernel(); @@ -2072,6 +2074,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sb->s_magic = le16_to_cpu(es->s_magic); if (sb->s_magic != EXT4_SUPER_MAGIC) goto cantfind_ext4; + sbi->s_kbytes_written = le64_to_cpu(es->s_kbytes_written); /* Set defaults before we parse the mount options */ def_mount_opts = le32_to_cpu(es->s_default_mount_opts); @@ -2921,6 +2924,10 @@ static int ext4_commit_super(struct super_block *sb, set_buffer_uptodate(sbh); } es->s_wtime = cpu_to_le32(get_seconds()); + es->s_kbytes_written = + cpu_to_le64(EXT4_SB(sb)->s_kbytes_written + + ((part_stat_read(sb->s_bdev->bd_part, sectors[1]) - + EXT4_SB(sb)->s_sectors_written_start) >> 1)); ext4_free_blocks_count_set(es, percpu_counter_sum_positive( &EXT4_SB(sb)->s_freeblocks_counter)); es->s_free_inodes_count = cpu_to_le32(percpu_counter_sum_positive( From 3197ebdb130473a92760100cbfe0d7e671838f48 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Tue, 31 Mar 2009 09:10:09 -0400 Subject: [PATCH 021/478] ext4: Add sysfs support Add basic sysfs support so that information about the mounted filesystem and various tuning parameters can be accessed via /sys/fs/ext4//*. Signed-off-by: "Theodore Ts'o" --- Documentation/ABI/testing/sysfs-fs-ext4 | 81 ++++++++++ fs/ext4/ext4_sb.h | 2 + fs/ext4/super.c | 207 ++++++++++++++++++++++++ 3 files changed, 290 insertions(+) create mode 100644 Documentation/ABI/testing/sysfs-fs-ext4 diff --git a/Documentation/ABI/testing/sysfs-fs-ext4 b/Documentation/ABI/testing/sysfs-fs-ext4 new file mode 100644 index 000000000000..4e79074de282 --- /dev/null +++ b/Documentation/ABI/testing/sysfs-fs-ext4 @@ -0,0 +1,81 @@ +What: /sys/fs/ext4//mb_stats +Date: March 2008 +Contact: "Theodore Ts'o" +Description: + Controls whether the multiblock allocator should + collect statistics, which are shown during the unmount. + 1 means to collect statistics, 0 means not to collect + statistics + +What: /sys/fs/ext4//mb_group_prealloc +Date: March 2008 +Contact: "Theodore Ts'o" +Description: + The multiblock allocator will round up allocation + requests to a multiple of this tuning parameter if the + stripe size is not set in the ext4 superblock + +What: /sys/fs/ext4//mb_max_to_scan +Date: March 2008 +Contact: "Theodore Ts'o" +Description: + The maximum number of extents the multiblock allocator + will search to find the best extent + +What: /sys/fs/ext4//mb_min_to_scan +Date: March 2008 +Contact: "Theodore Ts'o" +Description: + The minimum number of extents the multiblock allocator + will search to find the best extent + +What: /sys/fs/ext4//mb_order2_req +Date: March 2008 +Contact: "Theodore Ts'o" +Description: + Tuning parameter which controls the minimum size for + requests (as a power of 2) where the buddy cache is + used + +What: /sys/fs/ext4//mb_stream_req +Date: March 2008 +Contact: "Theodore Ts'o" +Description: + Files which have fewer blocks than this tunable + parameter will have their blocks allocated out of a + block group specific preallocation pool, so that small + files are packed closely together. Each large file + will have its blocks allocated out of its own unique + preallocation pool. + +What: /sys/fs/ext4//inode_readahead +Date: March 2008 +Contact: "Theodore Ts'o" +Description: + Tuning parameter which controls the maximum number of + inode table blocks that ext4's inode table readahead + algorithm will pre-read into the buffer cache + +What: /sys/fs/ext4//delayed_allocation_blocks +Date: March 2008 +Contact: "Theodore Ts'o" +Description: + This file is read-only and shows the number of blocks + that are dirty in the page cache, but which do not + have their location in the filesystem allocated yet. + +What: /sys/fs/ext4//lifetime_write_kbytes +Date: March 2008 +Contact: "Theodore Ts'o" +Description: + This file is read-only and shows the number of kilobytes + of data that have been written to this filesystem since it was + created. + +What: /sys/fs/ext4//session_write_kbytes +Date: March 2008 +Contact: "Theodore Ts'o" +Description: + This file is read-only and shows the number of + kilobytes of data that have been written to this + filesystem since it was mounted. diff --git a/fs/ext4/ext4_sb.h b/fs/ext4/ext4_sb.h index 50ab1169c378..57b71fefbccf 100644 --- a/fs/ext4/ext4_sb.h +++ b/fs/ext4/ext4_sb.h @@ -64,6 +64,8 @@ struct ext4_sb_info { struct percpu_counter s_dirtyblocks_counter; struct blockgroup_lock *s_blockgroup_lock; struct proc_dir_entry *s_proc; + struct kobject s_kobj; + struct completion s_kobj_unregister; /* Journaling */ struct inode *s_journal_inode; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 30fc27cdf8fc..2883d4318c22 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -35,6 +35,7 @@ #include #include #include +#include #include #include #include @@ -48,6 +49,7 @@ #include "group.h" struct proc_dir_entry *ext4_proc_root; +static struct kset *ext4_kset; static int ext4_load_journal(struct super_block *, struct ext4_super_block *, unsigned long journal_devnum); @@ -580,6 +582,7 @@ static void ext4_put_super(struct super_block *sb) remove_proc_entry("inode_readahead_blks", sbi->s_proc); remove_proc_entry(sb->s_id, ext4_proc_root); } + kobject_del(&sbi->s_kobj); for (i = 0; i < sbi->s_gdb_count; i++) brelse(sbi->s_group_desc[i]); @@ -615,6 +618,16 @@ static void ext4_put_super(struct super_block *sb) ext4_blkdev_remove(sbi); } sb->s_fs_info = NULL; + /* + * Now that we are completely done shutting down the + * superblock, we need to actually destroy the kobject. + */ + unlock_kernel(); + unlock_super(sb); + kobject_put(&sbi->s_kobj); + wait_for_completion(&sbi->s_kobj_unregister); + lock_super(sb); + lock_kernel(); kfree(sbi->s_blockgroup_lock); kfree(sbi); return; @@ -1464,6 +1477,11 @@ set_qf_format: return 0; if (option < 0 || option > (1 << 30)) return 0; + if (option & (option - 1)) { + printk(KERN_ERR "EXT4-fs: inode_readahead_blks" + " must be a power of 2\n"); + return 0; + } sbi->s_inode_readahead_blks = option; break; case Opt_journal_ioprio: @@ -1992,6 +2010,181 @@ static unsigned long ext4_get_stripe_size(struct ext4_sb_info *sbi) return 0; } +/* sysfs supprt */ + +struct ext4_attr { + struct attribute attr; + ssize_t (*show)(struct ext4_attr *, struct ext4_sb_info *, char *); + ssize_t (*store)(struct ext4_attr *, struct ext4_sb_info *, + const char *, size_t); + int offset; +}; + +static int parse_strtoul(const char *buf, + unsigned long max, unsigned long *value) +{ + char *endp; + + while (*buf && isspace(*buf)) + buf++; + *value = simple_strtoul(buf, &endp, 0); + while (*endp && isspace(*endp)) + endp++; + if (*endp || *value > max) + return -EINVAL; + + return 0; +} + +static ssize_t delayed_allocation_blocks_show(struct ext4_attr *a, + struct ext4_sb_info *sbi, + char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%llu\n", + (s64) percpu_counter_sum(&sbi->s_dirtyblocks_counter)); +} + +static ssize_t session_write_kbytes_show(struct ext4_attr *a, + struct ext4_sb_info *sbi, char *buf) +{ + struct super_block *sb = sbi->s_buddy_cache->i_sb; + + return snprintf(buf, PAGE_SIZE, "%lu\n", + (part_stat_read(sb->s_bdev->bd_part, sectors[1]) - + sbi->s_sectors_written_start) >> 1); +} + +static ssize_t lifetime_write_kbytes_show(struct ext4_attr *a, + struct ext4_sb_info *sbi, char *buf) +{ + struct super_block *sb = sbi->s_buddy_cache->i_sb; + + return snprintf(buf, PAGE_SIZE, "%llu\n", + sbi->s_kbytes_written + + ((part_stat_read(sb->s_bdev->bd_part, sectors[1]) - + EXT4_SB(sb)->s_sectors_written_start) >> 1)); +} + +static ssize_t inode_readahead_blks_store(struct ext4_attr *a, + struct ext4_sb_info *sbi, + const char *buf, size_t count) +{ + unsigned long t; + + if (parse_strtoul(buf, 0x40000000, &t)) + return -EINVAL; + + /* inode_readahead_blks must be a power of 2 */ + if (t & (t-1)) + return -EINVAL; + + sbi->s_inode_readahead_blks = t; + return count; +} + +static ssize_t sbi_ui_show(struct ext4_attr *a, + struct ext4_sb_info *sbi, char *buf) +{ + unsigned int *ui = (unsigned int *) (((char *) sbi) + a->offset); + + return snprintf(buf, PAGE_SIZE, "%u\n", *ui); +} + +static ssize_t sbi_ui_store(struct ext4_attr *a, + struct ext4_sb_info *sbi, + const char *buf, size_t count) +{ + unsigned int *ui = (unsigned int *) (((char *) sbi) + a->offset); + unsigned long t; + + if (parse_strtoul(buf, 0xffffffff, &t)) + return -EINVAL; + *ui = t; + return count; +} + +#define EXT4_ATTR_OFFSET(_name,_mode,_show,_store,_elname) \ +static struct ext4_attr ext4_attr_##_name = { \ + .attr = {.name = __stringify(_name), .mode = _mode }, \ + .show = _show, \ + .store = _store, \ + .offset = offsetof(struct ext4_sb_info, _elname), \ +} +#define EXT4_ATTR(name, mode, show, store) \ +static struct ext4_attr ext4_attr_##name = __ATTR(name, mode, show, store) + +#define EXT4_RO_ATTR(name) EXT4_ATTR(name, 0444, name##_show, NULL) +#define EXT4_RW_ATTR(name) EXT4_ATTR(name, 0644, name##_show, name##_store) +#define EXT4_RW_ATTR_SBI_UI(name, elname) \ + EXT4_ATTR_OFFSET(name, 0644, sbi_ui_show, sbi_ui_store, elname) +#define ATTR_LIST(name) &ext4_attr_##name.attr + +EXT4_RO_ATTR(delayed_allocation_blocks); +EXT4_RO_ATTR(session_write_kbytes); +EXT4_RO_ATTR(lifetime_write_kbytes); +EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, sbi_ui_show, + inode_readahead_blks_store, s_inode_readahead_blks); +EXT4_RW_ATTR_SBI_UI(mb_stats, s_mb_stats); +EXT4_RW_ATTR_SBI_UI(mb_max_to_scan, s_mb_max_to_scan); +EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan); +EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs); +EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request); +EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc); + +static struct attribute *ext4_attrs[] = { + ATTR_LIST(delayed_allocation_blocks), + ATTR_LIST(session_write_kbytes), + ATTR_LIST(lifetime_write_kbytes), + ATTR_LIST(inode_readahead_blks), + ATTR_LIST(mb_stats), + ATTR_LIST(mb_max_to_scan), + ATTR_LIST(mb_min_to_scan), + ATTR_LIST(mb_order2_req), + ATTR_LIST(mb_stream_req), + ATTR_LIST(mb_group_prealloc), + NULL, +}; + +static ssize_t ext4_attr_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info, + s_kobj); + struct ext4_attr *a = container_of(attr, struct ext4_attr, attr); + + return a->show ? a->show(a, sbi, buf) : 0; +} + +static ssize_t ext4_attr_store(struct kobject *kobj, + struct attribute *attr, + const char *buf, size_t len) +{ + struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info, + s_kobj); + struct ext4_attr *a = container_of(attr, struct ext4_attr, attr); + + return a->store ? a->store(a, sbi, buf, len) : 0; +} + +static void ext4_sb_release(struct kobject *kobj) +{ + struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info, + s_kobj); + complete(&sbi->s_kobj_unregister); +} + + +static struct sysfs_ops ext4_attr_ops = { + .show = ext4_attr_show, + .store = ext4_attr_store, +}; + +static struct kobj_type ext4_ktype = { + .default_attrs = ext4_attrs, + .sysfs_ops = &ext4_attr_ops, + .release = ext4_sb_release, +}; + static int ext4_fill_super(struct super_block *sb, void *data, int silent) __releases(kernel_lock) __acquires(kernel_lock) @@ -2575,6 +2768,16 @@ no_journal: goto failed_mount4; } + sbi->s_kobj.kset = ext4_kset; + init_completion(&sbi->s_kobj_unregister); + err = kobject_init_and_add(&sbi->s_kobj, &ext4_ktype, NULL, + "%s", sb->s_id); + if (err) { + ext4_mb_release(sb); + ext4_ext_release(sb); + goto failed_mount4; + }; + /* * akpm: core read_super() calls in here with the superblock locked. * That deadlocks, because orphan cleanup needs to lock the superblock @@ -3734,6 +3937,9 @@ static int __init init_ext4_fs(void) { int err; + ext4_kset = kset_create_and_add("ext4", NULL, fs_kobj); + if (!ext4_kset) + return -ENOMEM; ext4_proc_root = proc_mkdir("fs/ext4", NULL); err = init_ext4_mballoc(); if (err) @@ -3775,6 +3981,7 @@ static void __exit exit_ext4_fs(void) exit_ext4_xattr(); exit_ext4_mballoc(); remove_proc_entry("fs/ext4", NULL); + kset_unregister(ext4_kset); } MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others"); From b713a5ec55bf73c833f9883cdd761b20ee61a1ab Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Tue, 31 Mar 2009 09:11:14 -0400 Subject: [PATCH 022/478] ext4: remove /proc tuning knobs Remove tuning knobs in /proc/fs/ext4//*. Signed-off-by: "Theodore Ts'o" --- Documentation/filesystems/proc.txt | 21 ------ fs/ext4/ext4.h | 16 ---- fs/ext4/inode.c | 7 +- fs/ext4/mballoc.c | 117 +++++++---------------------- fs/ext4/super.c | 46 ------------ 5 files changed, 30 insertions(+), 177 deletions(-) diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index 830bad7cce0f..efc4fd9f40ce 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -940,27 +940,6 @@ Table 1-10: Files in /proc/fs/ext4/ File Content mb_groups details of multiblock allocator buddy cache of free blocks mb_history multiblock allocation history - stats controls whether the multiblock allocator should start - collecting statistics, which are shown during the unmount - group_prealloc the multiblock allocator will round up allocation - requests to a multiple of this tuning parameter if the - stripe size is not set in the ext4 superblock - max_to_scan The maximum number of extents the multiblock allocator - will search to find the best extent - min_to_scan The minimum number of extents the multiblock allocator - will search to find the best extent - order2_req Tuning parameter which controls the minimum size for - requests (as a power of 2) where the buddy cache is - used - stream_req Files which have fewer blocks than this tunable - parameter will have their blocks allocated out of a - block group specific preallocation pool, so that small - files are packed closely together. Each large file - will have its blocks allocated out of its own unique - preallocation pool. -inode_readahead Tuning parameter which controls the maximum number of - inode table blocks that ext4's inode table readahead - algorithm will pre-read into the buffer cache .............................................................................. diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 0bd39188531c..e5c273ff928b 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -976,22 +976,6 @@ void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr, extern struct proc_dir_entry *ext4_proc_root; -#ifdef CONFIG_PROC_FS -extern const struct file_operations ext4_ui_proc_fops; - -#define EXT4_PROC_HANDLER(name, var) \ -do { \ - proc = proc_create_data(name, mode, sbi->s_proc, \ - &ext4_ui_proc_fops, &sbi->s_##var); \ - if (proc == NULL) { \ - printk(KERN_ERR "EXT4-fs: can't create %s\n", name); \ - goto err_out; \ - } \ -} while (0) -#else -#define EXT4_PROC_HANDLER(name, var) -#endif - /* * Function prototypes */ diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 7dcac9d7e491..d3118d1acc39 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4153,12 +4153,7 @@ make_io: unsigned num; table = ext4_inode_table(sb, gdp); - /* Make sure s_inode_readahead_blks is a power of 2 */ - while (EXT4_SB(sb)->s_inode_readahead_blks & - (EXT4_SB(sb)->s_inode_readahead_blks-1)) - EXT4_SB(sb)->s_inode_readahead_blks = - (EXT4_SB(sb)->s_inode_readahead_blks & - (EXT4_SB(sb)->s_inode_readahead_blks-1)); + /* s_inode_readahead_blks is always a power of 2 */ b = block & ~(EXT4_SB(sb)->s_inode_readahead_blks-1); if (table > b) b = table; diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index b0d6022eaa67..c4c430977622 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -46,22 +46,23 @@ * The allocation request involve request for multiple number of blocks * near to the goal(block) value specified. * - * During initialization phase of the allocator we decide to use the group - * preallocation or inode preallocation depending on the size file. The - * size of the file could be the resulting file size we would have after - * allocation or the current file size which ever is larger. If the size is - * less that sbi->s_mb_stream_request we select the group - * preallocation. The default value of s_mb_stream_request is 16 - * blocks. This can also be tuned via - * /proc/fs/ext4//stream_req. The value is represented in terms - * of number of blocks. + * During initialization phase of the allocator we decide to use the + * group preallocation or inode preallocation depending on the size of + * the file. The size of the file could be the resulting file size we + * would have after allocation, or the current file size, which ever + * is larger. If the size is less than sbi->s_mb_stream_request we + * select to use the group preallocation. The default value of + * s_mb_stream_request is 16 blocks. This can also be tuned via + * /sys/fs/ext4//mb_stream_req. The value is represented in + * terms of number of blocks. * * The main motivation for having small file use group preallocation is to - * ensure that we have small file closer in the disk. + * ensure that we have small files closer together on the disk. * - * First stage the allocator looks at the inode prealloc list - * ext4_inode_info->i_prealloc_list contain list of prealloc spaces for - * this particular inode. The inode prealloc space is represented as: + * First stage the allocator looks at the inode prealloc list, + * ext4_inode_info->i_prealloc_list, which contains list of prealloc + * spaces for this particular inode. The inode prealloc space is + * represented as: * * pa_lstart -> the logical start block for this prealloc space * pa_pstart -> the physical start block for this prealloc space @@ -121,29 +122,29 @@ * list. In case of inode preallocation we follow a list of heuristics * based on file size. This can be found in ext4_mb_normalize_request. If * we are doing a group prealloc we try to normalize the request to - * sbi->s_mb_group_prealloc. Default value of s_mb_group_prealloc is set to + * sbi->s_mb_group_prealloc. Default value of s_mb_group_prealloc is * 512 blocks. This can be tuned via - * /proc/fs/ext4/ option the group prealloc request is normalized to the * stripe value (sbi->s_stripe) * - * The regular allocator(using the buddy cache) support few tunables. + * The regular allocator(using the buddy cache) supports few tunables. * - * /proc/fs/ext4//min_to_scan - * /proc/fs/ext4//max_to_scan - * /proc/fs/ext4//order2_req + * /sys/fs/ext4//mb_min_to_scan + * /sys/fs/ext4//mb_max_to_scan + * /sys/fs/ext4//mb_order2_req * - * The regular allocator use buddy scan only if the request len is power of + * The regular allocator uses buddy scan only if the request len is power of * 2 blocks and the order of allocation is >= sbi->s_mb_order2_reqs. The * value of s_mb_order2_reqs can be tuned via - * /proc/fs/ext4//order2_req. If the request len is equal to + * /sys/fs/ext4//mb_order2_req. If the request len is equal to * stripe size (sbi->s_stripe), we try to search for contigous block in - * stripe size. This should result in better allocation on RAID setup. If - * not we search in the specific group using bitmap for best extents. The - * tunable min_to_scan and max_to_scan controll the behaviour here. + * stripe size. This should result in better allocation on RAID setups. If + * not, we search in the specific group using bitmap for best extents. The + * tunable min_to_scan and max_to_scan control the behaviour here. * min_to_scan indicate how long the mballoc __must__ look for a best - * extent and max_to_scanindicate how long the mballoc __can__ look for a + * extent and max_to_scan indicates how long the mballoc __can__ look for a * best extent in the found extents. Searching for the blocks starts with * the group specified as the goal value in allocation context via * ac_g_ex. Each group is first checked based on the criteria whether it @@ -337,8 +338,6 @@ static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, ext4_group_t group); static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, ext4_group_t group); -static int ext4_mb_init_per_dev_proc(struct super_block *sb); -static int ext4_mb_destroy_per_dev_proc(struct super_block *sb); static void release_blocks_on_commit(journal_t *journal, transaction_t *txn); @@ -1978,7 +1977,7 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac) /* * We search using buddy data only if the order of the request * is greater than equal to the sbi_s_mb_order2_reqs - * You can tune it via /proc/fs/ext4//order2_req + * You can tune it via /sys/fs/ext4//mb_order2_req */ if (i >= sbi->s_mb_order2_reqs) { /* @@ -2753,7 +2752,6 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery) spin_lock_init(&lg->lg_prealloc_lock); } - ext4_mb_init_per_dev_proc(sb); ext4_mb_history_init(sb); if (sbi->s_journal) @@ -2836,7 +2834,6 @@ int ext4_mb_release(struct super_block *sb) free_percpu(sbi->s_locality_groups); ext4_mb_history_release(sb); - ext4_mb_destroy_per_dev_proc(sb); return 0; } @@ -2897,62 +2894,6 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn) mb_debug("freed %u blocks in %u structures\n", count, count2); } -#define EXT4_MB_STATS_NAME "stats" -#define EXT4_MB_MAX_TO_SCAN_NAME "max_to_scan" -#define EXT4_MB_MIN_TO_SCAN_NAME "min_to_scan" -#define EXT4_MB_ORDER2_REQ "order2_req" -#define EXT4_MB_STREAM_REQ "stream_req" -#define EXT4_MB_GROUP_PREALLOC "group_prealloc" - -static int ext4_mb_init_per_dev_proc(struct super_block *sb) -{ -#ifdef CONFIG_PROC_FS - mode_t mode = S_IFREG | S_IRUGO | S_IWUSR; - struct ext4_sb_info *sbi = EXT4_SB(sb); - struct proc_dir_entry *proc; - - if (sbi->s_proc == NULL) - return -EINVAL; - - EXT4_PROC_HANDLER(EXT4_MB_STATS_NAME, mb_stats); - EXT4_PROC_HANDLER(EXT4_MB_MAX_TO_SCAN_NAME, mb_max_to_scan); - EXT4_PROC_HANDLER(EXT4_MB_MIN_TO_SCAN_NAME, mb_min_to_scan); - EXT4_PROC_HANDLER(EXT4_MB_ORDER2_REQ, mb_order2_reqs); - EXT4_PROC_HANDLER(EXT4_MB_STREAM_REQ, mb_stream_request); - EXT4_PROC_HANDLER(EXT4_MB_GROUP_PREALLOC, mb_group_prealloc); - return 0; - -err_out: - remove_proc_entry(EXT4_MB_GROUP_PREALLOC, sbi->s_proc); - remove_proc_entry(EXT4_MB_STREAM_REQ, sbi->s_proc); - remove_proc_entry(EXT4_MB_ORDER2_REQ, sbi->s_proc); - remove_proc_entry(EXT4_MB_MIN_TO_SCAN_NAME, sbi->s_proc); - remove_proc_entry(EXT4_MB_MAX_TO_SCAN_NAME, sbi->s_proc); - remove_proc_entry(EXT4_MB_STATS_NAME, sbi->s_proc); - return -ENOMEM; -#else - return 0; -#endif -} - -static int ext4_mb_destroy_per_dev_proc(struct super_block *sb) -{ -#ifdef CONFIG_PROC_FS - struct ext4_sb_info *sbi = EXT4_SB(sb); - - if (sbi->s_proc == NULL) - return -EINVAL; - - remove_proc_entry(EXT4_MB_GROUP_PREALLOC, sbi->s_proc); - remove_proc_entry(EXT4_MB_STREAM_REQ, sbi->s_proc); - remove_proc_entry(EXT4_MB_ORDER2_REQ, sbi->s_proc); - remove_proc_entry(EXT4_MB_MIN_TO_SCAN_NAME, sbi->s_proc); - remove_proc_entry(EXT4_MB_MAX_TO_SCAN_NAME, sbi->s_proc); - remove_proc_entry(EXT4_MB_STATS_NAME, sbi->s_proc); -#endif - return 0; -} - int __init init_ext4_mballoc(void) { ext4_pspace_cachep = @@ -3123,7 +3064,7 @@ out_err: * here we normalize request for locality group * Group request are normalized to s_strip size if we set the same via mount * option. If not we set it to s_mb_group_prealloc which can be configured via - * /proc/fs/ext4//group_prealloc + * /sys/fs/ext4//mb_group_prealloc * * XXX: should we try to preallocate more than the group has now? */ @@ -4239,7 +4180,7 @@ static inline void ext4_mb_show_ac(struct ext4_allocation_context *ac) * file is determined by the current size or the resulting size after * allocation which ever is larger * - * One can tune this size via /proc/fs/ext4//stream_req + * One can tune this size via /sys/fs/ext4//mb_stream_req */ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac) { diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 2883d4318c22..1ec554cc107a 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -579,7 +579,6 @@ static void ext4_put_super(struct super_block *sb) ext4_commit_super(sb, es, 1); } if (sbi->s_proc) { - remove_proc_entry("inode_readahead_blks", sbi->s_proc); remove_proc_entry(sb->s_id, ext4_proc_root); } kobject_del(&sbi->s_kobj); @@ -2529,11 +2528,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) #ifdef CONFIG_PROC_FS if (ext4_proc_root) sbi->s_proc = proc_mkdir(sb->s_id, ext4_proc_root); - - if (sbi->s_proc) - proc_create_data("inode_readahead_blks", 0644, sbi->s_proc, - &ext4_ui_proc_fops, - &sbi->s_inode_readahead_blks); #endif bgl_lock_init(sbi->s_blockgroup_lock); @@ -2832,7 +2826,6 @@ failed_mount2: kfree(sbi->s_group_desc); failed_mount: if (sbi->s_proc) { - remove_proc_entry("inode_readahead_blks", sbi->s_proc); remove_proc_entry(sb->s_id, ext4_proc_root); } #ifdef CONFIG_QUOTA @@ -3865,45 +3858,6 @@ static int ext4_get_sb(struct file_system_type *fs_type, return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super, mnt); } -#ifdef CONFIG_PROC_FS -static int ext4_ui_proc_show(struct seq_file *m, void *v) -{ - unsigned int *p = m->private; - - seq_printf(m, "%u\n", *p); - return 0; -} - -static int ext4_ui_proc_open(struct inode *inode, struct file *file) -{ - return single_open(file, ext4_ui_proc_show, PDE(inode)->data); -} - -static ssize_t ext4_ui_proc_write(struct file *file, const char __user *buf, - size_t cnt, loff_t *ppos) -{ - unsigned long *p = PDE(file->f_path.dentry->d_inode)->data; - char str[32]; - - if (cnt >= sizeof(str)) - return -EINVAL; - if (copy_from_user(str, buf, cnt)) - return -EFAULT; - - *p = simple_strtoul(str, NULL, 0); - return cnt; -} - -const struct file_operations ext4_ui_proc_fops = { - .owner = THIS_MODULE, - .open = ext4_ui_proc_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, - .write = ext4_ui_proc_write, -}; -#endif - static struct file_system_type ext4_fs_type = { .owner = THIS_MODULE, .name = "ext4", From 9f24e4208f7ee2748f157368b63287dc903fcf60 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Wed, 4 Mar 2009 19:09:10 -0500 Subject: [PATCH 023/478] ext4: Use atomic_t's in struct flex_groups Reduce pressure on the sb_bgl_lock family of locks by using atomic_t's to track the number of free blocks and inodes in each flex_group. Signed-off-by: "Theodore Ts'o" --- fs/ext4/balloc.c | 5 ++--- fs/ext4/ext4.h | 4 ++-- fs/ext4/ialloc.c | 33 +++++++++++++++------------------ fs/ext4/mballoc.c | 9 +++------ fs/ext4/resize.c | 8 ++++---- fs/ext4/super.c | 8 ++++---- 6 files changed, 30 insertions(+), 37 deletions(-) diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index b37b12875582..53c72ad85877 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -470,9 +470,8 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb, if (sbi->s_log_groups_per_flex) { ext4_group_t flex_group = ext4_flex_group(sbi, block_group); - spin_lock(sb_bgl_lock(sbi, flex_group)); - sbi->s_flex_groups[flex_group].free_blocks += blocks_freed; - spin_unlock(sb_bgl_lock(sbi, flex_group)); + atomic_add(blocks_freed, + &sbi->s_flex_groups[flex_group].free_blocks); } /* * request to reload the buddy with the diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index e5c273ff928b..e52b48f86ed4 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -170,8 +170,8 @@ struct ext4_group_desc */ struct flex_groups { - __u32 free_inodes; - __u32 free_blocks; + atomic_t free_inodes; + atomic_t free_blocks; }; #define EXT4_BG_INODE_UNINIT 0x0001 /* Inode table/bitmap not in use */ diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 617f5a2d800a..5f393927fd25 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -189,7 +189,6 @@ void ext4_free_inode(handle_t *handle, struct inode *inode) struct ext4_super_block *es; struct ext4_sb_info *sbi; int fatal = 0, err, count, cleared; - ext4_group_t flex_group; if (atomic_read(&inode->i_count) > 1) { printk(KERN_ERR "ext4_free_inode: inode has count=%d\n", @@ -277,10 +276,10 @@ void ext4_free_inode(handle_t *handle, struct inode *inode) percpu_counter_dec(&sbi->s_dirs_counter); if (sbi->s_log_groups_per_flex) { - flex_group = ext4_flex_group(sbi, block_group); - spin_lock(sb_bgl_lock(sbi, flex_group)); - sbi->s_flex_groups[flex_group].free_inodes++; - spin_unlock(sb_bgl_lock(sbi, flex_group)); + ext4_group_t f; + + f = ext4_flex_group(sbi, block_group); + atomic_inc(&sbi->s_flex_groups[f].free_inodes); } } BUFFER_TRACE(bh2, "call ext4_handle_dirty_metadata"); @@ -360,9 +359,9 @@ static int find_group_flex(struct super_block *sb, struct inode *parent, sbi->s_log_groups_per_flex; find_close_to_parent: - flexbg_free_blocks = flex_group[best_flex].free_blocks; + flexbg_free_blocks = atomic_read(&flex_group[best_flex].free_blocks); flex_freeb_ratio = flexbg_free_blocks * 100 / blocks_per_flex; - if (flex_group[best_flex].free_inodes && + if (atomic_read(&flex_group[best_flex].free_inodes) && flex_freeb_ratio > free_block_ratio) goto found_flexbg; @@ -375,24 +374,24 @@ find_close_to_parent: if (i == parent_fbg_group || i == parent_fbg_group - 1) continue; - flexbg_free_blocks = flex_group[i].free_blocks; + flexbg_free_blocks = atomic_read(&flex_group[i].free_blocks); flex_freeb_ratio = flexbg_free_blocks * 100 / blocks_per_flex; if (flex_freeb_ratio > free_block_ratio && - flex_group[i].free_inodes) { + (atomic_read(&flex_group[i].free_inodes))) { best_flex = i; goto found_flexbg; } - if (flex_group[best_flex].free_inodes == 0 || - (flex_group[i].free_blocks > - flex_group[best_flex].free_blocks && - flex_group[i].free_inodes)) + if ((atomic_read(&flex_group[best_flex].free_inodes) == 0) || + ((atomic_read(&flex_group[i].free_blocks) > + atomic_read(&flex_group[best_flex].free_blocks)) && + atomic_read(&flex_group[i].free_inodes))) best_flex = i; } - if (!flex_group[best_flex].free_inodes || - !flex_group[best_flex].free_blocks) + if (!atomic_read(&flex_group[best_flex].free_inodes) || + !atomic_read(&flex_group[best_flex].free_blocks)) return -1; found_flexbg: @@ -960,9 +959,7 @@ got: if (sbi->s_log_groups_per_flex) { flex_group = ext4_flex_group(sbi, group); - spin_lock(sb_bgl_lock(sbi, flex_group)); - sbi->s_flex_groups[flex_group].free_inodes--; - spin_unlock(sb_bgl_lock(sbi, flex_group)); + atomic_dec(&sbi->s_flex_groups[flex_group].free_inodes); } inode->i_uid = current_fsuid(); diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index c4c430977622..e72c72a0b807 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -3044,9 +3044,8 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, if (sbi->s_log_groups_per_flex) { ext4_group_t flex_group = ext4_flex_group(sbi, ac->ac_b_ex.fe_group); - spin_lock(sb_bgl_lock(sbi, flex_group)); - sbi->s_flex_groups[flex_group].free_blocks -= ac->ac_b_ex.fe_len; - spin_unlock(sb_bgl_lock(sbi, flex_group)); + atomic_sub(ac->ac_b_ex.fe_len, + &sbi->s_flex_groups[flex_group].free_blocks); } err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); @@ -4884,9 +4883,7 @@ do_more: if (sbi->s_log_groups_per_flex) { ext4_group_t flex_group = ext4_flex_group(sbi, block_group); - spin_lock(sb_bgl_lock(sbi, flex_group)); - sbi->s_flex_groups[flex_group].free_blocks += count; - spin_unlock(sb_bgl_lock(sbi, flex_group)); + atomic_add(count, &sbi->s_flex_groups[flex_group].free_blocks); } ext4_mb_release_desc(&e4b); diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index c06886abd658..546c7dd869e1 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -938,10 +938,10 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) { ext4_group_t flex_group; flex_group = ext4_flex_group(sbi, input->group); - sbi->s_flex_groups[flex_group].free_blocks += - input->free_blocks_count; - sbi->s_flex_groups[flex_group].free_inodes += - EXT4_INODES_PER_GROUP(sb); + atomic_add(input->free_blocks_count, + &sbi->s_flex_groups[flex_group].free_blocks); + atomic_add(EXT4_INODES_PER_GROUP(sb), + &sbi->s_flex_groups[flex_group].free_inodes); } ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 1ec554cc107a..6b5d5c6399fa 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1630,10 +1630,10 @@ static int ext4_fill_flex_info(struct super_block *sb) gdp = ext4_get_group_desc(sb, i, &bh); flex_group = ext4_flex_group(sbi, i); - sbi->s_flex_groups[flex_group].free_inodes += - ext4_free_inodes_count(sb, gdp); - sbi->s_flex_groups[flex_group].free_blocks += - ext4_free_blks_count(sb, gdp); + atomic_set(&sbi->s_flex_groups[flex_group].free_inodes, + ext4_free_inodes_count(sb, gdp)); + atomic_set(&sbi->s_flex_groups[flex_group].free_blocks, + ext4_free_blks_count(sb, gdp)); } return 1; From 7d39db14a42cbd719c7515b9da8f85a2eb6a0633 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Wed, 4 Mar 2009 19:31:53 -0500 Subject: [PATCH 024/478] ext4: Use struct flex_groups to calculate get_orlov_stats() Instead of looping over all of the block groups in a flex group summing their summary statistics, start tracking used_dirs in struct flex_groups, and use struct flex_groups instead. This should save a bit of CPU for mkdir-heavy workloads. Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 1 + fs/ext4/ialloc.c | 45 ++++++++++++++++++++++++++++----------------- fs/ext4/super.c | 2 ++ 3 files changed, 31 insertions(+), 17 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index e52b48f86ed4..46aaaa2ed4c5 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -172,6 +172,7 @@ struct ext4_group_desc struct flex_groups { atomic_t free_inodes; atomic_t free_blocks; + atomic_t used_dirs; }; #define EXT4_BG_INODE_UNINIT 0x0001 /* Inode table/bitmap not in use */ diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 5f393927fd25..47b84e8df568 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -267,6 +267,13 @@ void ext4_free_inode(handle_t *handle, struct inode *inode) if (is_directory) { count = ext4_used_dirs_count(sb, gdp) - 1; ext4_used_dirs_set(sb, gdp, count); + if (sbi->s_log_groups_per_flex) { + ext4_group_t f; + + f = ext4_flex_group(sbi, block_group); + atomic_dec(&sbi->s_flex_groups[f].free_inodes); + } + } gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp); @@ -424,25 +431,24 @@ void get_orlov_stats(struct super_block *sb, ext4_group_t g, int flex_size, struct orlov_stats *stats) { struct ext4_group_desc *desc; - ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count; - int i; + struct flex_groups *flex_group = EXT4_SB(sb)->s_flex_groups; - stats->free_inodes = 0; - stats->free_blocks = 0; - stats->used_dirs = 0; + if (flex_size > 1) { + stats->free_inodes = atomic_read(&flex_group[g].free_inodes); + stats->free_blocks = atomic_read(&flex_group[g].free_blocks); + stats->used_dirs = atomic_read(&flex_group[g].used_dirs); + return; + } - g *= flex_size; - - for (i = 0; i < flex_size; i++) { - if (g >= ngroups) - break; - desc = ext4_get_group_desc(sb, g++, NULL); - if (!desc) - continue; - - stats->free_inodes += ext4_free_inodes_count(sb, desc); - stats->free_blocks += ext4_free_blks_count(sb, desc); - stats->used_dirs += ext4_used_dirs_count(sb, desc); + desc = ext4_get_group_desc(sb, g, NULL); + if (desc) { + stats->free_inodes = ext4_free_inodes_count(sb, desc); + stats->free_blocks = ext4_free_blks_count(sb, desc); + stats->used_dirs = ext4_used_dirs_count(sb, desc); + } else { + stats->free_inodes = 0; + stats->free_blocks = 0; + stats->used_dirs = 0; } } @@ -765,6 +771,11 @@ static int ext4_claim_inode(struct super_block *sb, if (S_ISDIR(mode)) { count = ext4_used_dirs_count(sb, gdp) + 1; ext4_used_dirs_set(sb, gdp, count); + if (sbi->s_log_groups_per_flex) { + ext4_group_t f = ext4_flex_group(sbi, group); + + atomic_inc(&sbi->s_flex_groups[f].free_inodes); + } } gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp); err_ret: diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 6b5d5c6399fa..b9aefceb41e7 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1634,6 +1634,8 @@ static int ext4_fill_flex_info(struct super_block *sb) ext4_free_inodes_count(sb, gdp)); atomic_set(&sbi->s_flex_groups[flex_group].free_blocks, ext4_free_blks_count(sb, gdp)); + atomic_set(&sbi->s_flex_groups[flex_group].used_dirs, + ext4_used_dirs_count(sb, gdp)); } return 1; From fe315e76fc3a3f9f7e1581dc22fec7e7719f0896 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 11 Mar 2009 14:10:21 -0400 Subject: [PATCH 025/478] SUNRPC: Avoid spurious wake-up during UDP connect processing To clear out old state, the UDP connect workers unconditionally invoke xs_close() before proceeding with a new connect. Nowadays this causes a spurious wake-up of the task waiting for the connect to complete. This is a little racey, but usually harmless. The waiting task immediately retries the connect via a call_bind/call_connect sequence, which usually finds the transport already in the connected state because the connect worker has finished in the background. To avoid a spurious wake-up, factor the xs_close() logic that resets the underlying socket into a helper, and have the UDP connect workers call that helper instead of xs_close(). Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- net/sunrpc/xprtsock.c | 44 +++++++++++++++++++++++++------------------ 1 file changed, 26 insertions(+), 18 deletions(-) diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 29c71e645b27..1127eb934136 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -767,23 +767,13 @@ static void xs_restore_old_callbacks(struct sock_xprt *transport, struct sock *s sk->sk_error_report = transport->old_error_report; } -/** - * xs_close - close a socket - * @xprt: transport - * - * This is used when all requests are complete; ie, no DRC state remains - * on the server we want to save. - */ -static void xs_close(struct rpc_xprt *xprt) +static void xs_reset_transport(struct sock_xprt *transport) { - struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); struct socket *sock = transport->sock; struct sock *sk = transport->inet; - if (!sk) - goto clear_close_wait; - - dprintk("RPC: xs_close xprt %p\n", xprt); + if (sk == NULL) + return; write_lock_bh(&sk->sk_callback_lock); transport->inet = NULL; @@ -797,7 +787,23 @@ static void xs_close(struct rpc_xprt *xprt) sk->sk_no_check = 0; sock_release(sock); -clear_close_wait: +} + +/** + * xs_close - close a socket + * @xprt: transport + * + * This is used when all requests are complete; ie, no DRC state remains + * on the server we want to save. + */ +static void xs_close(struct rpc_xprt *xprt) +{ + struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); + + dprintk("RPC: xs_close xprt %p\n", xprt); + + xs_reset_transport(transport); + smp_mb__before_clear_bit(); clear_bit(XPRT_CLOSE_WAIT, &xprt->state); clear_bit(XPRT_CLOSING, &xprt->state); @@ -1537,9 +1543,10 @@ static void xs_udp_connect_worker4(struct work_struct *work) goto out; /* Start by resetting any existing state */ - xs_close(xprt); + xs_reset_transport(transport); - if ((err = sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock)) < 0) { + err = sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock); + if (err < 0) { dprintk("RPC: can't create UDP transport socket (%d).\n", -err); goto out; } @@ -1578,9 +1585,10 @@ static void xs_udp_connect_worker6(struct work_struct *work) goto out; /* Start by resetting any existing state */ - xs_close(xprt); + xs_reset_transport(transport); - if ((err = sock_create_kern(PF_INET6, SOCK_DGRAM, IPPROTO_UDP, &sock)) < 0) { + err = sock_create_kern(PF_INET6, SOCK_DGRAM, IPPROTO_UDP, &sock); + if (err < 0) { dprintk("RPC: can't create UDP transport socket (%d).\n", -err); goto out; } From 2b57dc6cf9bf31edc0df430ea18dd1dbd3028975 Mon Sep 17 00:00:00 2001 From: Suresh Jayaraman Date: Wed, 11 Mar 2009 14:10:22 -0400 Subject: [PATCH 026/478] NFS: Minor __nfs_revalidate_inode cleanup Remove redundant NFS_STALE() check, a leftover due to the commit 691beb13cdc88358334ef0ba867c080a247a760f Signed-off-by: Suresh Jayaraman Signed-off-by: Trond Myklebust --- fs/nfs/inode.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 0c381686171e..acaaa7c7efa4 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -670,9 +670,6 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode) if (NFS_STALE(inode)) goto out; - if (NFS_STALE(inode)) - goto out; - nfs_inc_stats(inode, NFSIOS_INODEREVALIDATE); status = NFS_PROTO(inode)->getattr(server, NFS_FH(inode), &fattr); if (status != 0) { From 37d9d76d8b3a2ac5817e1fa3263cfe0fdb439e51 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 11 Mar 2009 14:10:23 -0400 Subject: [PATCH 027/478] NFS: flush cached directory information slightly more readily. If cached directory contents becomes incorrect, there is no way to flush the contents. This contrasts with files where file locking is the recommended way to ensure cache consistency between multiple applications (a read-lock always flushes the cache). Also while changes to files often change the size of the file (thus triggering a cache flush), changes to directories often do not change the apparent size (as the size is often rounded to a block size). So it is particularly important with directories to avoid the possibility of an incorrect cache wherever possible. When the link count on a directory changes it implies a change in the number of child directories, and so a change in the contents of this directory. So use that as a trigger to flush cached contents. When the ctime changes but the mtime does not, there are two possible reasons. 1/ The owner/mode information has been changed. 2/ utimes has been used to set the mtime backwards. In the first case, a data-cache flush is not required. In the second case it is. So on the basis that correctness trumps performance, flush the directory contents cache in this case also. Signed-off-by: NeilBrown Signed-off-by: Trond Myklebust --- fs/nfs/inode.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index acaaa7c7efa4..268ce3a46220 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -1113,8 +1113,16 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) nfs_force_lookup_revalidate(inode); } /* If ctime has changed we should definitely clear access+acl caches */ - if (!timespec_equal(&inode->i_ctime, &fattr->ctime)) + if (!timespec_equal(&inode->i_ctime, &fattr->ctime)) { invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; + /* and probably clear data for a directory too as utimes can cause + * havoc with our cache. + */ + if (S_ISDIR(inode->i_mode)) { + invalid |= NFS_INO_INVALID_DATA; + nfs_force_lookup_revalidate(inode); + } + } } else if (nfsi->change_attr != fattr->change_attr) { dprintk("NFS: change_attr change on server for file %s/%ld\n", inode->i_sb->s_id, inode->i_ino); @@ -1148,8 +1156,11 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) inode->i_gid != fattr->gid) invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; - if (inode->i_nlink != fattr->nlink) + if (inode->i_nlink != fattr->nlink) { invalid |= NFS_INO_INVALID_ATTR; + if (S_ISDIR(inode->i_mode)) + invalid |= NFS_INO_INVALID_DATA; + } inode->i_mode = fattr->mode; inode->i_nlink = fattr->nlink; From 78f945f88ef83dcc7c962614a080e0a9a2db5889 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 11 Mar 2009 14:10:23 -0400 Subject: [PATCH 028/478] NFSv4: Ignore errors on the post-op attributes in SETATTR calls There is no need to fail or retry a SETATTR call just because the post-op GETATTR failed. Signed-off-by: Trond Myklebust --- fs/nfs/nfs4xdr.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index d1e4c8f8a0a9..5f0ee3e2bd84 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -4078,9 +4078,7 @@ static int nfs4_xdr_dec_setattr(struct rpc_rqst *rqstp, __be32 *p, struct nfs_se status = decode_setattr(&xdr, res); if (status) goto out; - status = decode_getfattr(&xdr, res->fattr, res->server); - if (status == NFS4ERR_DELAY) - status = 0; + decode_getfattr(&xdr, res->fattr, res->server); out: return status; } From 9e6e70f8d8b6698e0017c56b86525aabe9c7cd4c Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 11 Mar 2009 14:10:24 -0400 Subject: [PATCH 029/478] NFSv4: Support NFSv4 optional attributes in the struct nfs_fattr Currently, filling struct nfs_fattr is more or less an all or nothing operation, since NFSv2 and NFSv3 have only mandatory attributes. In NFSv4, some attributes are optional, and so we may simply not be able to fill in those fields. Furthermore, NFSv4 allows you to specify which attributes you are interested in retrieving, thus permitting you to optimise away retrieval of attributes that you know will no change... Signed-off-by: Trond Myklebust --- fs/nfs/inode.c | 243 +++++++++++++++++++++++++--------------- fs/nfs/nfs2xdr.c | 2 +- fs/nfs/nfs3xdr.c | 6 +- fs/nfs/nfs4xdr.c | 6 +- include/linux/nfs_xdr.h | 48 ++++++-- 5 files changed, 202 insertions(+), 103 deletions(-) diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 268ce3a46220..b7656bd3706f 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -249,13 +249,10 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) struct inode *inode = ERR_PTR(-ENOENT); unsigned long hash; - if ((fattr->valid & NFS_ATTR_FATTR) == 0) + if ((fattr->valid & NFS_ATTR_FATTR_FILEID) == 0) goto out_no_inode; - - if (!fattr->nlink) { - printk("NFS: Buggy server - nlink == 0!\n"); + if ((fattr->valid & NFS_ATTR_FATTR_TYPE) == 0) goto out_no_inode; - } hash = nfs_fattr_to_ino_t(fattr); @@ -291,7 +288,8 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) && fattr->size <= NFS_LIMIT_READDIRPLUS) set_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(inode)->flags); /* Deal with crossing mountpoints */ - if (!nfs_fsid_equal(&NFS_SB(sb)->fsid, &fattr->fsid)) { + if ((fattr->valid & NFS_ATTR_FATTR_FSID) + && !nfs_fsid_equal(&NFS_SB(sb)->fsid, &fattr->fsid)) { if (fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL) inode->i_op = &nfs_referral_inode_operations; else @@ -304,28 +302,45 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) else init_special_inode(inode, inode->i_mode, fattr->rdev); + memset(&inode->i_atime, 0, sizeof(inode->i_atime)); + memset(&inode->i_mtime, 0, sizeof(inode->i_mtime)); + memset(&inode->i_ctime, 0, sizeof(inode->i_ctime)); + nfsi->change_attr = 0; + inode->i_size = 0; + inode->i_nlink = 0; + inode->i_uid = -2; + inode->i_gid = -2; + inode->i_blocks = 0; + memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf)); + nfsi->read_cache_jiffies = fattr->time_start; nfsi->attr_gencount = fattr->gencount; - inode->i_atime = fattr->atime; - inode->i_mtime = fattr->mtime; - inode->i_ctime = fattr->ctime; - if (fattr->valid & NFS_ATTR_FATTR_V4) + if (fattr->valid & NFS_ATTR_FATTR_ATIME) + inode->i_atime = fattr->atime; + if (fattr->valid & NFS_ATTR_FATTR_MTIME) + inode->i_mtime = fattr->mtime; + if (fattr->valid & NFS_ATTR_FATTR_CTIME) + inode->i_ctime = fattr->ctime; + if (fattr->valid & NFS_ATTR_FATTR_CHANGE) nfsi->change_attr = fattr->change_attr; - inode->i_size = nfs_size_to_loff_t(fattr->size); - inode->i_nlink = fattr->nlink; - inode->i_uid = fattr->uid; - inode->i_gid = fattr->gid; - if (fattr->valid & (NFS_ATTR_FATTR_V3 | NFS_ATTR_FATTR_V4)) { + if (fattr->valid & NFS_ATTR_FATTR_SIZE) + inode->i_size = nfs_size_to_loff_t(fattr->size); + if (fattr->valid & NFS_ATTR_FATTR_NLINK) + inode->i_nlink = fattr->nlink; + if (fattr->valid & NFS_ATTR_FATTR_OWNER) + inode->i_uid = fattr->uid; + if (fattr->valid & NFS_ATTR_FATTR_GROUP) + inode->i_gid = fattr->gid; + if (fattr->valid & NFS_ATTR_FATTR_BLOCKS_USED) + inode->i_blocks = fattr->du.nfs2.blocks; + if (fattr->valid & NFS_ATTR_FATTR_SPACE_USED) { /* * report the blocks in 512byte units */ inode->i_blocks = nfs_calc_block_size(fattr->du.nfs3.used); - } else { - inode->i_blocks = fattr->du.nfs2.blocks; } nfsi->attrtimeo = NFS_MINATTRTIMEO(inode); nfsi->attrtimeo_timestamp = now; - memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf)); nfsi->access_cache = RB_ROOT; unlock_new_inode(inode); @@ -812,25 +827,31 @@ static void nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr) { struct nfs_inode *nfsi = NFS_I(inode); - if ((fattr->valid & NFS_ATTR_WCC_V4) != 0 && - nfsi->change_attr == fattr->pre_change_attr) { + if ((fattr->valid & NFS_ATTR_FATTR_PRECHANGE) + && (fattr->valid & NFS_ATTR_FATTR_CHANGE) + && nfsi->change_attr == fattr->pre_change_attr) { nfsi->change_attr = fattr->change_attr; if (S_ISDIR(inode->i_mode)) nfsi->cache_validity |= NFS_INO_INVALID_DATA; } /* If we have atomic WCC data, we may update some attributes */ - if ((fattr->valid & NFS_ATTR_WCC) != 0) { - if (timespec_equal(&inode->i_ctime, &fattr->pre_ctime)) + if ((fattr->valid & NFS_ATTR_FATTR_PRECTIME) + && (fattr->valid & NFS_ATTR_FATTR_CTIME) + && timespec_equal(&inode->i_ctime, &fattr->pre_ctime)) memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime)); - if (timespec_equal(&inode->i_mtime, &fattr->pre_mtime)) { + + if ((fattr->valid & NFS_ATTR_FATTR_PREMTIME) + && (fattr->valid & NFS_ATTR_FATTR_MTIME) + && timespec_equal(&inode->i_mtime, &fattr->pre_mtime)) { memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime)); if (S_ISDIR(inode->i_mode)) nfsi->cache_validity |= NFS_INO_INVALID_DATA; - } - if (i_size_read(inode) == nfs_size_to_loff_t(fattr->pre_size) && - nfsi->npages == 0) - i_size_write(inode, nfs_size_to_loff_t(fattr->size)); } + if ((fattr->valid & NFS_ATTR_FATTR_PRESIZE) + && (fattr->valid & NFS_ATTR_FATTR_SIZE) + && i_size_read(inode) == nfs_size_to_loff_t(fattr->pre_size) + && nfsi->npages == 0) + i_size_write(inode, nfs_size_to_loff_t(fattr->size)); } /** @@ -850,35 +871,39 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat /* Has the inode gone and changed behind our back? */ - if (nfsi->fileid != fattr->fileid - || (inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT)) { + if ((fattr->valid & NFS_ATTR_FATTR_FILEID) && nfsi->fileid != fattr->fileid) + return -EIO; + if ((fattr->valid & NFS_ATTR_FATTR_TYPE) && (inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT)) return -EIO; - } - if ((fattr->valid & NFS_ATTR_FATTR_V4) != 0 && + if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) != 0 && nfsi->change_attr != fattr->change_attr) invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; /* Verify a few of the more important attributes */ - if (!timespec_equal(&inode->i_mtime, &fattr->mtime)) + if ((fattr->valid & NFS_ATTR_FATTR_MTIME) && !timespec_equal(&inode->i_mtime, &fattr->mtime)) invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; - cur_size = i_size_read(inode); - new_isize = nfs_size_to_loff_t(fattr->size); - if (cur_size != new_isize && nfsi->npages == 0) - invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; + if (fattr->valid & NFS_ATTR_FATTR_SIZE) { + cur_size = i_size_read(inode); + new_isize = nfs_size_to_loff_t(fattr->size); + if (cur_size != new_isize && nfsi->npages == 0) + invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; + } /* Have any file permissions changed? */ - if ((inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO) - || inode->i_uid != fattr->uid - || inode->i_gid != fattr->gid) + if ((fattr->valid & NFS_ATTR_FATTR_MODE) && (inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO)) + invalid |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL; + if ((fattr->valid & NFS_ATTR_FATTR_OWNER) && inode->i_uid != fattr->uid) + invalid |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL; + if ((fattr->valid & NFS_ATTR_FATTR_GROUP) && inode->i_gid != fattr->gid) invalid |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL; /* Has the link count changed? */ - if (inode->i_nlink != fattr->nlink) + if ((fattr->valid & NFS_ATTR_FATTR_NLINK) && inode->i_nlink != fattr->nlink) invalid |= NFS_INO_INVALID_ATTR; - if (!timespec_equal(&inode->i_atime, &fattr->atime)) + if ((fattr->valid & NFS_ATTR_FATTR_ATIME) && !timespec_equal(&inode->i_atime, &fattr->atime)) invalid |= NFS_INO_INVALID_ATIME; if (invalid != 0) @@ -890,11 +915,15 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat static int nfs_ctime_need_update(const struct inode *inode, const struct nfs_fattr *fattr) { + if (!(fattr->valid & NFS_ATTR_FATTR_CTIME)) + return 0; return timespec_compare(&fattr->ctime, &inode->i_ctime) > 0; } static int nfs_size_need_update(const struct inode *inode, const struct nfs_fattr *fattr) { + if (!(fattr->valid & NFS_ATTR_FATTR_SIZE)) + return 0; return nfs_size_to_loff_t(fattr->size) > i_size_read(inode); } @@ -1030,20 +1059,31 @@ int nfs_post_op_update_inode_force_wcc(struct inode *inode, struct nfs_fattr *fa /* Don't do a WCC update if these attributes are already stale */ if ((fattr->valid & NFS_ATTR_FATTR) == 0 || !nfs_inode_attrs_need_update(inode, fattr)) { - fattr->valid &= ~(NFS_ATTR_WCC_V4|NFS_ATTR_WCC); + fattr->valid &= ~(NFS_ATTR_FATTR_PRECHANGE + | NFS_ATTR_FATTR_PRESIZE + | NFS_ATTR_FATTR_PREMTIME + | NFS_ATTR_FATTR_PRECTIME); goto out_noforce; } - if ((fattr->valid & NFS_ATTR_FATTR_V4) != 0 && - (fattr->valid & NFS_ATTR_WCC_V4) == 0) { + if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) != 0 && + (fattr->valid & NFS_ATTR_FATTR_PRECHANGE) == 0) { fattr->pre_change_attr = NFS_I(inode)->change_attr; - fattr->valid |= NFS_ATTR_WCC_V4; + fattr->valid |= NFS_ATTR_FATTR_PRECHANGE; } - if ((fattr->valid & NFS_ATTR_FATTR) != 0 && - (fattr->valid & NFS_ATTR_WCC) == 0) { + if ((fattr->valid & NFS_ATTR_FATTR_CTIME) != 0 && + (fattr->valid & NFS_ATTR_FATTR_PRECTIME) == 0) { memcpy(&fattr->pre_ctime, &inode->i_ctime, sizeof(fattr->pre_ctime)); + fattr->valid |= NFS_ATTR_FATTR_PRECTIME; + } + if ((fattr->valid & NFS_ATTR_FATTR_MTIME) != 0 && + (fattr->valid & NFS_ATTR_FATTR_PREMTIME) == 0) { memcpy(&fattr->pre_mtime, &inode->i_mtime, sizeof(fattr->pre_mtime)); + fattr->valid |= NFS_ATTR_FATTR_PREMTIME; + } + if ((fattr->valid & NFS_ATTR_FATTR_SIZE) != 0 && + (fattr->valid & NFS_ATTR_FATTR_PRESIZE) == 0) { fattr->pre_size = i_size_read(inode); - fattr->valid |= NFS_ATTR_WCC; + fattr->valid |= NFS_ATTR_FATTR_PRESIZE; } out_noforce: status = nfs_post_op_update_inode_locked(inode, fattr); @@ -1075,18 +1115,18 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) __func__, inode->i_sb->s_id, inode->i_ino, atomic_read(&inode->i_count), fattr->valid); - if (nfsi->fileid != fattr->fileid) + if ((fattr->valid & NFS_ATTR_FATTR_FILEID) && nfsi->fileid != fattr->fileid) goto out_fileid; /* * Make sure the inode's type hasn't changed. */ - if ((inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT)) + if ((fattr->valid & NFS_ATTR_FATTR_TYPE) && (inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT)) goto out_changed; server = NFS_SERVER(inode); /* Update the fsid? */ - if (S_ISDIR(inode->i_mode) && + if (S_ISDIR(inode->i_mode) && (fattr->valid & NFS_ATTR_FATTR_FSID) && !nfs_fsid_equal(&server->fsid, &fattr->fsid) && !test_bit(NFS_INO_MOUNTPOINT, &nfsi->flags)) server->fsid = fattr->fsid; @@ -1096,14 +1136,27 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) */ nfsi->read_cache_jiffies = fattr->time_start; - nfsi->cache_validity &= ~(NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ATIME - | NFS_INO_REVAL_PAGECACHE); + if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) || (fattr->valid & (NFS_ATTR_FATTR_MTIME|NFS_ATTR_FATTR_CTIME))) + nfsi->cache_validity &= ~(NFS_INO_INVALID_ATTR + | NFS_INO_INVALID_ATIME + | NFS_INO_REVAL_PAGECACHE); /* Do atomic weak cache consistency updates */ nfs_wcc_update_inode(inode, fattr); /* More cache consistency checks */ - if (!(fattr->valid & NFS_ATTR_FATTR_V4)) { + if (fattr->valid & NFS_ATTR_FATTR_CHANGE) { + if (nfsi->change_attr != fattr->change_attr) { + dprintk("NFS: change_attr change on server for file %s/%ld\n", + inode->i_sb->s_id, inode->i_ino); + invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; + if (S_ISDIR(inode->i_mode)) + nfs_force_lookup_revalidate(inode); + nfsi->change_attr = fattr->change_attr; + } + } + + if (fattr->valid & NFS_ATTR_FATTR_MTIME) { /* NFSv2/v3: Check if the mtime agrees */ if (!timespec_equal(&inode->i_mtime, &fattr->mtime)) { dprintk("NFS: mtime change on server for file %s/%ld\n", @@ -1111,7 +1164,10 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA; if (S_ISDIR(inode->i_mode)) nfs_force_lookup_revalidate(inode); + memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime)); } + } + if (fattr->valid & NFS_ATTR_FATTR_CTIME) { /* If ctime has changed we should definitely clear access+acl caches */ if (!timespec_equal(&inode->i_ctime, &fattr->ctime)) { invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; @@ -1122,59 +1178,66 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) invalid |= NFS_INO_INVALID_DATA; nfs_force_lookup_revalidate(inode); } + memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime)); } - } else if (nfsi->change_attr != fattr->change_attr) { - dprintk("NFS: change_attr change on server for file %s/%ld\n", - inode->i_sb->s_id, inode->i_ino); - invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; - if (S_ISDIR(inode->i_mode)) - nfs_force_lookup_revalidate(inode); } /* Check if our cached file size is stale */ - new_isize = nfs_size_to_loff_t(fattr->size); - cur_isize = i_size_read(inode); - if (new_isize != cur_isize) { - /* Do we perhaps have any outstanding writes, or has - * the file grown beyond our last write? */ - if (nfsi->npages == 0 || new_isize > cur_isize) { - i_size_write(inode, new_isize); - invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA; + if (fattr->valid & NFS_ATTR_FATTR_SIZE) { + new_isize = nfs_size_to_loff_t(fattr->size); + cur_isize = i_size_read(inode); + if (new_isize != cur_isize) { + /* Do we perhaps have any outstanding writes, or has + * the file grown beyond our last write? */ + if (nfsi->npages == 0 || new_isize > cur_isize) { + i_size_write(inode, new_isize); + invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA; + } + dprintk("NFS: isize change on server for file %s/%ld\n", + inode->i_sb->s_id, inode->i_ino); } - dprintk("NFS: isize change on server for file %s/%ld\n", - inode->i_sb->s_id, inode->i_ino); } - memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime)); - memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime)); - memcpy(&inode->i_atime, &fattr->atime, sizeof(inode->i_atime)); - nfsi->change_attr = fattr->change_attr; + if (fattr->valid & NFS_ATTR_FATTR_ATIME) + memcpy(&inode->i_atime, &fattr->atime, sizeof(inode->i_atime)); - if ((inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO) || - inode->i_uid != fattr->uid || - inode->i_gid != fattr->gid) - invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; - - if (inode->i_nlink != fattr->nlink) { - invalid |= NFS_INO_INVALID_ATTR; - if (S_ISDIR(inode->i_mode)) - invalid |= NFS_INO_INVALID_DATA; + if (fattr->valid & NFS_ATTR_FATTR_MODE) { + if ((inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO)) { + invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; + inode->i_mode = fattr->mode; + } + } + if (fattr->valid & NFS_ATTR_FATTR_OWNER) { + if (inode->i_uid != fattr->uid) { + invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; + inode->i_uid = fattr->uid; + } + } + if (fattr->valid & NFS_ATTR_FATTR_GROUP) { + if (inode->i_gid != fattr->gid) { + invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; + inode->i_gid = fattr->gid; + } } - inode->i_mode = fattr->mode; - inode->i_nlink = fattr->nlink; - inode->i_uid = fattr->uid; - inode->i_gid = fattr->gid; + if (fattr->valid & NFS_ATTR_FATTR_NLINK) { + if (inode->i_nlink != fattr->nlink) { + invalid |= NFS_INO_INVALID_ATTR; + if (S_ISDIR(inode->i_mode)) + invalid |= NFS_INO_INVALID_DATA; + inode->i_nlink = fattr->nlink; + } + } - if (fattr->valid & (NFS_ATTR_FATTR_V3 | NFS_ATTR_FATTR_V4)) { + if (fattr->valid & NFS_ATTR_FATTR_SPACE_USED) { /* * report the blocks in 512byte units */ inode->i_blocks = nfs_calc_block_size(fattr->du.nfs3.used); - } else { - inode->i_blocks = fattr->du.nfs2.blocks; } + if (fattr->valid & NFS_ATTR_FATTR_BLOCKS_USED) + inode->i_blocks = fattr->du.nfs2.blocks; /* Update attrtimeo value if we're out of the unstable period */ if (invalid & NFS_INO_INVALID_ATTR) { diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c index 28bab67d1519..bea99992c302 100644 --- a/fs/nfs/nfs2xdr.c +++ b/fs/nfs/nfs2xdr.c @@ -136,7 +136,7 @@ xdr_decode_fattr(__be32 *p, struct nfs_fattr *fattr) p = xdr_decode_time(p, &fattr->atime); p = xdr_decode_time(p, &fattr->mtime); p = xdr_decode_time(p, &fattr->ctime); - fattr->valid |= NFS_ATTR_FATTR; + fattr->valid |= NFS_ATTR_FATTR_V2; fattr->rdev = new_decode_dev(rdev); if (fattr->type == NFCHR && rdev == NFS2_FIFO_DEV) { fattr->type = NFFIFO; diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c index 6cdeacffde46..c0f7d02aced9 100644 --- a/fs/nfs/nfs3xdr.c +++ b/fs/nfs/nfs3xdr.c @@ -177,7 +177,7 @@ xdr_decode_fattr(__be32 *p, struct nfs_fattr *fattr) p = xdr_decode_time3(p, &fattr->ctime); /* Update the mode bits */ - fattr->valid |= (NFS_ATTR_FATTR | NFS_ATTR_FATTR_V3); + fattr->valid |= NFS_ATTR_FATTR_V3; return p; } @@ -233,7 +233,9 @@ xdr_decode_wcc_attr(__be32 *p, struct nfs_fattr *fattr) p = xdr_decode_hyper(p, &fattr->pre_size); p = xdr_decode_time3(p, &fattr->pre_mtime); p = xdr_decode_time3(p, &fattr->pre_ctime); - fattr->valid |= NFS_ATTR_WCC; + fattr->valid |= NFS_ATTR_FATTR_PRESIZE + | NFS_ATTR_FATTR_PREMTIME + | NFS_ATTR_FATTR_PRECTIME; return p; } diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 5f0ee3e2bd84..7d220da3db36 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -3012,7 +3012,7 @@ static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr, cons if ((status = decode_attr_type(xdr, bitmap, &type)) != 0) goto xdr_error; fattr->type = nfs_type2fmt[type].nfs2type; - fmode = nfs_type2fmt[type].mode; + fattr->mode = nfs_type2fmt[type].mode; if ((status = decode_attr_change(xdr, bitmap, &fattr->change_attr)) != 0) goto xdr_error; @@ -3026,7 +3026,7 @@ static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr, cons struct nfs4_fs_locations, fattr))) != 0) goto xdr_error; - if ((status = decode_attr_mode(xdr, bitmap, &fattr->mode)) != 0) + if ((status = decode_attr_mode(xdr, bitmap, &fmode)) != 0) goto xdr_error; fattr->mode |= fmode; if ((status = decode_attr_nlink(xdr, bitmap, &fattr->nlink)) != 0) @@ -3050,7 +3050,7 @@ static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr, cons if (fattr->fileid == 0 && fileid != 0) fattr->fileid = fileid; if ((status = verify_attr_len(xdr, savep, attrlen)) == 0) - fattr->valid = NFS_ATTR_FATTR | NFS_ATTR_FATTR_V3 | NFS_ATTR_FATTR_V4; + fattr->valid = NFS_ATTR_FATTR_V4; xdr_error: dprintk("%s: xdr returned %d\n", __func__, -status); return status; diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 2e5f00066afd..b99295e07cdf 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -27,7 +27,7 @@ static inline int nfs_fsid_equal(const struct nfs_fsid *a, const struct nfs_fsid } struct nfs_fattr { - unsigned short valid; /* which fields are valid */ + unsigned int valid; /* which fields are valid */ __u64 pre_size; /* pre_op_attr.size */ struct timespec pre_mtime; /* pre_op_attr.mtime */ struct timespec pre_ctime; /* pre_op_attr.ctime */ @@ -59,12 +59,46 @@ struct nfs_fattr { unsigned long gencount; }; -#define NFS_ATTR_WCC 0x0001 /* pre-op WCC data */ -#define NFS_ATTR_FATTR 0x0002 /* post-op attributes */ -#define NFS_ATTR_FATTR_V3 0x0004 /* NFSv3 attributes */ -#define NFS_ATTR_FATTR_V4 0x0008 /* NFSv4 change attribute */ -#define NFS_ATTR_WCC_V4 0x0010 /* pre-op change attribute */ -#define NFS_ATTR_FATTR_V4_REFERRAL 0x0020 /* NFSv4 referral */ +#define NFS_ATTR_FATTR_TYPE (1U << 0) +#define NFS_ATTR_FATTR_MODE (1U << 1) +#define NFS_ATTR_FATTR_NLINK (1U << 2) +#define NFS_ATTR_FATTR_OWNER (1U << 3) +#define NFS_ATTR_FATTR_GROUP (1U << 4) +#define NFS_ATTR_FATTR_RDEV (1U << 5) +#define NFS_ATTR_FATTR_SIZE (1U << 6) +#define NFS_ATTR_FATTR_PRESIZE (1U << 7) +#define NFS_ATTR_FATTR_BLOCKS_USED (1U << 8) +#define NFS_ATTR_FATTR_SPACE_USED (1U << 9) +#define NFS_ATTR_FATTR_FSID (1U << 10) +#define NFS_ATTR_FATTR_FILEID (1U << 11) +#define NFS_ATTR_FATTR_ATIME (1U << 12) +#define NFS_ATTR_FATTR_MTIME (1U << 13) +#define NFS_ATTR_FATTR_CTIME (1U << 14) +#define NFS_ATTR_FATTR_PREMTIME (1U << 15) +#define NFS_ATTR_FATTR_PRECTIME (1U << 16) +#define NFS_ATTR_FATTR_CHANGE (1U << 17) +#define NFS_ATTR_FATTR_PRECHANGE (1U << 18) +#define NFS_ATTR_FATTR_V4_REFERRAL (1U << 19) /* NFSv4 referral */ + +#define NFS_ATTR_FATTR (NFS_ATTR_FATTR_TYPE \ + | NFS_ATTR_FATTR_MODE \ + | NFS_ATTR_FATTR_NLINK \ + | NFS_ATTR_FATTR_OWNER \ + | NFS_ATTR_FATTR_GROUP \ + | NFS_ATTR_FATTR_RDEV \ + | NFS_ATTR_FATTR_SIZE \ + | NFS_ATTR_FATTR_FSID \ + | NFS_ATTR_FATTR_FILEID \ + | NFS_ATTR_FATTR_ATIME \ + | NFS_ATTR_FATTR_MTIME \ + | NFS_ATTR_FATTR_CTIME) +#define NFS_ATTR_FATTR_V2 (NFS_ATTR_FATTR \ + | NFS_ATTR_FATTR_BLOCKS_USED) +#define NFS_ATTR_FATTR_V3 (NFS_ATTR_FATTR \ + | NFS_ATTR_FATTR_SPACE_USED) +#define NFS_ATTR_FATTR_V4 (NFS_ATTR_FATTR \ + | NFS_ATTR_FATTR_SPACE_USED \ + | NFS_ATTR_FATTR_CHANGE) /* * Info on the file system From 1ca277d88dafdbc3c5a69d32590e7184b9af6371 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 11 Mar 2009 14:10:25 -0400 Subject: [PATCH 030/478] NFS: Shrink the struct nfs_fattr We don't need the bitmap[] field anymore, since the 'valid' field tells us all we need to know about which attributes were filled in... Also move the pre-op attributes in order to improve the structure packing. Signed-off-by: Trond Myklebust --- fs/nfs/nfs4xdr.c | 3 --- include/linux/nfs_xdr.h | 7 +++---- 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 7d220da3db36..9f1df8361974 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -3002,9 +3002,6 @@ static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr, cons if ((status = decode_attr_bitmap(xdr, bitmap)) != 0) goto xdr_error; - fattr->bitmap[0] = bitmap[0]; - fattr->bitmap[1] = bitmap[1]; - if ((status = decode_attr_length(xdr, &attrlen, &savep)) != 0) goto xdr_error; diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index b99295e07cdf..6013acb0131f 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -28,9 +28,6 @@ static inline int nfs_fsid_equal(const struct nfs_fsid *a, const struct nfs_fsid struct nfs_fattr { unsigned int valid; /* which fields are valid */ - __u64 pre_size; /* pre_op_attr.size */ - struct timespec pre_mtime; /* pre_op_attr.mtime */ - struct timespec pre_ctime; /* pre_op_attr.ctime */ enum nfs_ftype type; /* always use NFSv2 types */ __u32 mode; __u32 nlink; @@ -52,9 +49,11 @@ struct nfs_fattr { struct timespec atime; struct timespec mtime; struct timespec ctime; - __u32 bitmap[2]; /* NFSv4 returned attribute bitmap */ __u64 change_attr; /* NFSv4 change attribute */ __u64 pre_change_attr;/* pre-op NFSv4 change attribute */ + __u64 pre_size; /* pre_op_attr.size */ + struct timespec pre_mtime; /* pre_op_attr.mtime */ + struct timespec pre_ctime; /* pre_op_attr.ctime */ unsigned long time_start; unsigned long gencount; }; From bca794785c2c12ecddeb09e70165b8ff80baa6ae Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 11 Mar 2009 14:10:26 -0400 Subject: [PATCH 031/478] NFS: Fix the type of struct nfs_fattr->mode There is no point in using anything other than umode_t, since we copy the content pretty much directly into inode->i_mode. Signed-off-by: Trond Myklebust --- fs/nfs/getroot.c | 4 ++-- fs/nfs/nfs2xdr.c | 7 +++---- fs/nfs/nfs3xdr.c | 31 +++++++++++++------------------ fs/nfs/nfs4xdr.c | 40 +++++++++++++++++++--------------------- include/linux/nfs_xdr.h | 3 +-- 5 files changed, 38 insertions(+), 47 deletions(-) diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c index b7c9b2df1f29..46177cb87064 100644 --- a/fs/nfs/getroot.c +++ b/fs/nfs/getroot.c @@ -156,7 +156,7 @@ int nfs4_path_walk(struct nfs_server *server, return ret; } - if (fattr.type != NFDIR) { + if (!S_ISDIR(fattr.mode)) { printk(KERN_ERR "nfs4_get_root:" " getroot encountered non-directory\n"); return -ENOTDIR; @@ -213,7 +213,7 @@ eat_dot_dir: return ret; } - if (fattr.type != NFDIR) { + if (!S_ISDIR(fattr.mode)) { printk(KERN_ERR "nfs4_get_root:" " lookupfh encountered non-directory\n"); return -ENOTDIR; diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c index bea99992c302..c862c9340f9a 100644 --- a/fs/nfs/nfs2xdr.c +++ b/fs/nfs/nfs2xdr.c @@ -120,8 +120,8 @@ xdr_decode_time(__be32 *p, struct timespec *timep) static __be32 * xdr_decode_fattr(__be32 *p, struct nfs_fattr *fattr) { - u32 rdev; - fattr->type = (enum nfs_ftype) ntohl(*p++); + u32 rdev, type; + type = ntohl(*p++); fattr->mode = ntohl(*p++); fattr->nlink = ntohl(*p++); fattr->uid = ntohl(*p++); @@ -138,8 +138,7 @@ xdr_decode_fattr(__be32 *p, struct nfs_fattr *fattr) p = xdr_decode_time(p, &fattr->ctime); fattr->valid |= NFS_ATTR_FATTR_V2; fattr->rdev = new_decode_dev(rdev); - if (fattr->type == NFCHR && rdev == NFS2_FIFO_DEV) { - fattr->type = NFFIFO; + if (type == NFCHR && rdev == NFS2_FIFO_DEV) { fattr->mode = (fattr->mode & ~S_IFMT) | S_IFIFO; fattr->rdev = 0; } diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c index c0f7d02aced9..e6a1932c7110 100644 --- a/fs/nfs/nfs3xdr.c +++ b/fs/nfs/nfs3xdr.c @@ -91,19 +91,15 @@ /* * Map file type to S_IFMT bits */ -static struct { - unsigned int mode; - unsigned int nfs2type; -} nfs_type2fmt[] = { - { 0, NFNON }, - { S_IFREG, NFREG }, - { S_IFDIR, NFDIR }, - { S_IFBLK, NFBLK }, - { S_IFCHR, NFCHR }, - { S_IFLNK, NFLNK }, - { S_IFSOCK, NFSOCK }, - { S_IFIFO, NFFIFO }, - { 0, NFBAD } +static const umode_t nfs_type2fmt[] = { + [NF3BAD] = 0, + [NF3REG] = S_IFREG, + [NF3DIR] = S_IFDIR, + [NF3BLK] = S_IFBLK, + [NF3CHR] = S_IFCHR, + [NF3LNK] = S_IFLNK, + [NF3SOCK] = S_IFSOCK, + [NF3FIFO] = S_IFIFO, }; /* @@ -148,13 +144,12 @@ static __be32 * xdr_decode_fattr(__be32 *p, struct nfs_fattr *fattr) { unsigned int type, major, minor; - int fmode; + umode_t fmode; type = ntohl(*p++); - if (type >= NF3BAD) - type = NF3BAD; - fmode = nfs_type2fmt[type].mode; - fattr->type = nfs_type2fmt[type].nfs2type; + if (type > NF3FIFO) + type = NF3NON; + fmode = nfs_type2fmt[type]; fattr->mode = (ntohl(*p++) & ~S_IFMT) | fmode; fattr->nlink = ntohl(*p++); fattr->uid = ntohl(*p++); diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 9f1df8361974..c1906d2a226b 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -522,20 +522,17 @@ static int nfs4_stat_to_errno(int); decode_lookup_maxsz + \ decode_fs_locations_maxsz) -static struct { - unsigned int mode; - unsigned int nfs2type; -} nfs_type2fmt[] = { - { 0, NFNON }, - { S_IFREG, NFREG }, - { S_IFDIR, NFDIR }, - { S_IFBLK, NFBLK }, - { S_IFCHR, NFCHR }, - { S_IFLNK, NFLNK }, - { S_IFSOCK, NFSOCK }, - { S_IFIFO, NFFIFO }, - { 0, NFNON }, - { 0, NFNON }, +static const umode_t nfs_type2fmt[] = { + [NF4BAD] = 0, + [NF4REG] = S_IFREG, + [NF4DIR] = S_IFDIR, + [NF4BLK] = S_IFBLK, + [NF4CHR] = S_IFCHR, + [NF4LNK] = S_IFLNK, + [NF4SOCK] = S_IFSOCK, + [NF4FIFO] = S_IFIFO, + [NF4ATTRDIR] = 0, + [NF4NAMEDATTR] = 0, }; struct compound_hdr { @@ -2173,7 +2170,7 @@ static int decode_attr_type(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t * } bitmap[0] &= ~FATTR4_WORD0_TYPE; } - dprintk("%s: type=0%o\n", __func__, nfs_type2fmt[*type].nfs2type); + dprintk("%s: type=0%o\n", __func__, nfs_type2fmt[*type]); return 0; } @@ -2580,8 +2577,9 @@ static int decode_attr_maxwrite(struct xdr_stream *xdr, uint32_t *bitmap, uint32 return status; } -static int decode_attr_mode(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *mode) +static int decode_attr_mode(struct xdr_stream *xdr, uint32_t *bitmap, umode_t *mode) { + uint32_t tmp; __be32 *p; *mode = 0; @@ -2589,8 +2587,8 @@ static int decode_attr_mode(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t * return -EIO; if (likely(bitmap[1] & FATTR4_WORD1_MODE)) { READ_BUF(4); - READ32(*mode); - *mode &= ~S_IFMT; + READ32(tmp); + *mode = tmp & ~S_IFMT; bitmap[1] &= ~FATTR4_WORD1_MODE; } dprintk("%s: file mode=0%o\n", __func__, (unsigned int)*mode); @@ -2994,7 +2992,8 @@ static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr, cons uint32_t attrlen, bitmap[2] = {0}, type; - int status, fmode = 0; + int status; + umode_t fmode = 0; uint64_t fileid; if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0) @@ -3008,8 +3007,7 @@ static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr, cons if ((status = decode_attr_type(xdr, bitmap, &type)) != 0) goto xdr_error; - fattr->type = nfs_type2fmt[type].nfs2type; - fattr->mode = nfs_type2fmt[type].mode; + fattr->mode = nfs_type2fmt[type]; if ((status = decode_attr_change(xdr, bitmap, &fattr->change_attr)) != 0) goto xdr_error; diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 6013acb0131f..0691b9c188d9 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -28,8 +28,7 @@ static inline int nfs_fsid_equal(const struct nfs_fsid *a, const struct nfs_fsid struct nfs_fattr { unsigned int valid; /* which fields are valid */ - enum nfs_ftype type; /* always use NFSv2 types */ - __u32 mode; + umode_t mode; __u32 nlink; __u32 uid; __u32 gid; From f26c7a78876ccd6c9b477ab4ca127aa1a4ef68c7 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 11 Mar 2009 14:10:26 -0400 Subject: [PATCH 032/478] NFSv4: Clean up decode_getfattr() Signed-off-by: Trond Myklebust --- fs/nfs/nfs4xdr.c | 80 +++++++++++++++++++++++++++++++++++------------- 1 file changed, 58 insertions(+), 22 deletions(-) diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index c1906d2a226b..43c6e50ff173 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -2996,55 +2996,91 @@ static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr, cons umode_t fmode = 0; uint64_t fileid; - if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0) - goto xdr_error; - if ((status = decode_attr_bitmap(xdr, bitmap)) != 0) + status = decode_op_hdr(xdr, OP_GETATTR); + if (status < 0) goto xdr_error; - if ((status = decode_attr_length(xdr, &attrlen, &savep)) != 0) + status = decode_attr_bitmap(xdr, bitmap); + if (status < 0) + goto xdr_error; + + status = decode_attr_length(xdr, &attrlen, &savep); + if (status < 0) goto xdr_error; - if ((status = decode_attr_type(xdr, bitmap, &type)) != 0) + status = decode_attr_type(xdr, bitmap, &type); + if (status < 0) goto xdr_error; fattr->mode = nfs_type2fmt[type]; - if ((status = decode_attr_change(xdr, bitmap, &fattr->change_attr)) != 0) + status = decode_attr_change(xdr, bitmap, &fattr->change_attr); + if (status < 0) goto xdr_error; - if ((status = decode_attr_size(xdr, bitmap, &fattr->size)) != 0) + + status = decode_attr_size(xdr, bitmap, &fattr->size); + if (status < 0) goto xdr_error; - if ((status = decode_attr_fsid(xdr, bitmap, &fattr->fsid)) != 0) + + status = decode_attr_fsid(xdr, bitmap, &fattr->fsid); + if (status < 0) goto xdr_error; - if ((status = decode_attr_fileid(xdr, bitmap, &fattr->fileid)) != 0) + + status = decode_attr_fileid(xdr, bitmap, &fattr->fileid); + if (status < 0) goto xdr_error; - if ((status = decode_attr_fs_locations(xdr, bitmap, container_of(fattr, + + status = decode_attr_fs_locations(xdr, bitmap, container_of(fattr, struct nfs4_fs_locations, - fattr))) != 0) + fattr)); + if (status < 0) goto xdr_error; - if ((status = decode_attr_mode(xdr, bitmap, &fmode)) != 0) + + status = decode_attr_mode(xdr, bitmap, &fmode); + if (status < 0) goto xdr_error; fattr->mode |= fmode; - if ((status = decode_attr_nlink(xdr, bitmap, &fattr->nlink)) != 0) + + status = decode_attr_nlink(xdr, bitmap, &fattr->nlink); + if (status < 0) goto xdr_error; - if ((status = decode_attr_owner(xdr, bitmap, server->nfs_client, &fattr->uid)) != 0) + + status = decode_attr_owner(xdr, bitmap, server->nfs_client, &fattr->uid); + if (status < 0) goto xdr_error; - if ((status = decode_attr_group(xdr, bitmap, server->nfs_client, &fattr->gid)) != 0) + + status = decode_attr_group(xdr, bitmap, server->nfs_client, &fattr->gid); + if (status < 0) goto xdr_error; - if ((status = decode_attr_rdev(xdr, bitmap, &fattr->rdev)) != 0) + + status = decode_attr_rdev(xdr, bitmap, &fattr->rdev); + if (status < 0) goto xdr_error; - if ((status = decode_attr_space_used(xdr, bitmap, &fattr->du.nfs3.used)) != 0) + + status = decode_attr_space_used(xdr, bitmap, &fattr->du.nfs3.used); + if (status < 0) goto xdr_error; - if ((status = decode_attr_time_access(xdr, bitmap, &fattr->atime)) != 0) + + status = decode_attr_time_access(xdr, bitmap, &fattr->atime); + if (status < 0) goto xdr_error; - if ((status = decode_attr_time_metadata(xdr, bitmap, &fattr->ctime)) != 0) + + status = decode_attr_time_metadata(xdr, bitmap, &fattr->ctime); + if (status < 0) goto xdr_error; - if ((status = decode_attr_time_modify(xdr, bitmap, &fattr->mtime)) != 0) + + status = decode_attr_time_modify(xdr, bitmap, &fattr->mtime); + if (status < 0) goto xdr_error; - if ((status = decode_attr_mounted_on_fileid(xdr, bitmap, &fileid)) != 0) + + status = decode_attr_mounted_on_fileid(xdr, bitmap, &fileid); + if (status < 0) goto xdr_error; if (fattr->fileid == 0 && fileid != 0) fattr->fileid = fileid; - if ((status = verify_attr_len(xdr, savep, attrlen)) == 0) + + status = verify_attr_len(xdr, savep, attrlen); + if (status == 0) fattr->valid = NFS_ATTR_FATTR_V4; xdr_error: dprintk("%s: xdr returned %d\n", __func__, -status); From 409924e4c943072a63c43bb6b77576bf12f1896b Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 11 Mar 2009 14:10:27 -0400 Subject: [PATCH 033/478] NFSv4: Make decode_getfattr() set fattr->valid to reflect what was decoded Signed-off-by: Trond Myklebust --- fs/nfs/nfs4xdr.c | 92 ++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 73 insertions(+), 19 deletions(-) diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 43c6e50ff173..1690f0e44b91 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -2157,6 +2157,7 @@ static int decode_attr_supported(struct xdr_stream *xdr, uint32_t *bitmap, uint3 static int decode_attr_type(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *type) { __be32 *p; + int ret = 0; *type = 0; if (unlikely(bitmap[0] & (FATTR4_WORD0_TYPE - 1U))) @@ -2169,14 +2170,16 @@ static int decode_attr_type(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t * return -EIO; } bitmap[0] &= ~FATTR4_WORD0_TYPE; + ret = NFS_ATTR_FATTR_TYPE; } dprintk("%s: type=0%o\n", __func__, nfs_type2fmt[*type]); - return 0; + return ret; } static int decode_attr_change(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *change) { __be32 *p; + int ret = 0; *change = 0; if (unlikely(bitmap[0] & (FATTR4_WORD0_CHANGE - 1U))) @@ -2185,15 +2188,17 @@ static int decode_attr_change(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t READ_BUF(8); READ64(*change); bitmap[0] &= ~FATTR4_WORD0_CHANGE; + ret = NFS_ATTR_FATTR_CHANGE; } dprintk("%s: change attribute=%Lu\n", __func__, (unsigned long long)*change); - return 0; + return ret; } static int decode_attr_size(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *size) { __be32 *p; + int ret = 0; *size = 0; if (unlikely(bitmap[0] & (FATTR4_WORD0_SIZE - 1U))) @@ -2202,9 +2207,10 @@ static int decode_attr_size(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t * READ_BUF(8); READ64(*size); bitmap[0] &= ~FATTR4_WORD0_SIZE; + ret = NFS_ATTR_FATTR_SIZE; } dprintk("%s: file size=%Lu\n", __func__, (unsigned long long)*size); - return 0; + return ret; } static int decode_attr_link_support(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res) @@ -2242,6 +2248,7 @@ static int decode_attr_symlink_support(struct xdr_stream *xdr, uint32_t *bitmap, static int decode_attr_fsid(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_fsid *fsid) { __be32 *p; + int ret = 0; fsid->major = 0; fsid->minor = 0; @@ -2252,11 +2259,12 @@ static int decode_attr_fsid(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs READ64(fsid->major); READ64(fsid->minor); bitmap[0] &= ~FATTR4_WORD0_FSID; + ret = NFS_ATTR_FATTR_FSID; } dprintk("%s: fsid=(0x%Lx/0x%Lx)\n", __func__, (unsigned long long)fsid->major, (unsigned long long)fsid->minor); - return 0; + return ret; } static int decode_attr_lease_time(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res) @@ -2294,6 +2302,7 @@ static int decode_attr_aclsupport(struct xdr_stream *xdr, uint32_t *bitmap, uint static int decode_attr_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *fileid) { __be32 *p; + int ret = 0; *fileid = 0; if (unlikely(bitmap[0] & (FATTR4_WORD0_FILEID - 1U))) @@ -2302,14 +2311,16 @@ static int decode_attr_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t READ_BUF(8); READ64(*fileid); bitmap[0] &= ~FATTR4_WORD0_FILEID; + ret = NFS_ATTR_FATTR_FILEID; } dprintk("%s: fileid=%Lu\n", __func__, (unsigned long long)*fileid); - return 0; + return ret; } static int decode_attr_mounted_on_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *fileid) { __be32 *p; + int ret = 0; *fileid = 0; if (unlikely(bitmap[1] & (FATTR4_WORD1_MOUNTED_ON_FILEID - 1U))) @@ -2318,9 +2329,10 @@ static int decode_attr_mounted_on_fileid(struct xdr_stream *xdr, uint32_t *bitma READ_BUF(8); READ64(*fileid); bitmap[1] &= ~FATTR4_WORD1_MOUNTED_ON_FILEID; + ret = NFS_ATTR_FATTR_FILEID; } dprintk("%s: fileid=%Lu\n", __func__, (unsigned long long)*fileid); - return 0; + return ret; } static int decode_attr_files_avail(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res) @@ -2476,6 +2488,8 @@ static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, st if (res->nlocations < NFS4_FS_LOCATIONS_MAXENTRIES) res->nlocations++; } + if (res->nlocations != 0) + status = NFS_ATTR_FATTR_V4_REFERRAL; out: dprintk("%s: fs_locations done, error = %d\n", __func__, status); return status; @@ -2581,6 +2595,7 @@ static int decode_attr_mode(struct xdr_stream *xdr, uint32_t *bitmap, umode_t *m { uint32_t tmp; __be32 *p; + int ret = 0; *mode = 0; if (unlikely(bitmap[1] & (FATTR4_WORD1_MODE - 1U))) @@ -2590,14 +2605,16 @@ static int decode_attr_mode(struct xdr_stream *xdr, uint32_t *bitmap, umode_t *m READ32(tmp); *mode = tmp & ~S_IFMT; bitmap[1] &= ~FATTR4_WORD1_MODE; + ret = NFS_ATTR_FATTR_MODE; } dprintk("%s: file mode=0%o\n", __func__, (unsigned int)*mode); - return 0; + return ret; } static int decode_attr_nlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *nlink) { __be32 *p; + int ret = 0; *nlink = 1; if (unlikely(bitmap[1] & (FATTR4_WORD1_NUMLINKS - 1U))) @@ -2606,15 +2623,17 @@ static int decode_attr_nlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t READ_BUF(4); READ32(*nlink); bitmap[1] &= ~FATTR4_WORD1_NUMLINKS; + ret = NFS_ATTR_FATTR_NLINK; } dprintk("%s: nlink=%u\n", __func__, (unsigned int)*nlink); - return 0; + return ret; } static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_client *clp, uint32_t *uid) { uint32_t len; __be32 *p; + int ret = 0; *uid = -2; if (unlikely(bitmap[1] & (FATTR4_WORD1_OWNER - 1U))) @@ -2624,7 +2643,9 @@ static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, struct nf READ32(len); READ_BUF(len); if (len < XDR_MAX_NETOBJ) { - if (nfs_map_name_to_uid(clp, (char *)p, len, uid) != 0) + if (nfs_map_name_to_uid(clp, (char *)p, len, uid) == 0) + ret = NFS_ATTR_FATTR_OWNER; + else dprintk("%s: nfs_map_name_to_uid failed!\n", __func__); } else @@ -2633,13 +2654,14 @@ static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, struct nf bitmap[1] &= ~FATTR4_WORD1_OWNER; } dprintk("%s: uid=%d\n", __func__, (int)*uid); - return 0; + return ret; } static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_client *clp, uint32_t *gid) { uint32_t len; __be32 *p; + int ret = 0; *gid = -2; if (unlikely(bitmap[1] & (FATTR4_WORD1_OWNER_GROUP - 1U))) @@ -2649,7 +2671,9 @@ static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, struct nf READ32(len); READ_BUF(len); if (len < XDR_MAX_NETOBJ) { - if (nfs_map_group_to_gid(clp, (char *)p, len, gid) != 0) + if (nfs_map_group_to_gid(clp, (char *)p, len, gid) == 0) + ret = NFS_ATTR_FATTR_GROUP; + else dprintk("%s: nfs_map_group_to_gid failed!\n", __func__); } else @@ -2658,13 +2682,14 @@ static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, struct nf bitmap[1] &= ~FATTR4_WORD1_OWNER_GROUP; } dprintk("%s: gid=%d\n", __func__, (int)*gid); - return 0; + return ret; } static int decode_attr_rdev(struct xdr_stream *xdr, uint32_t *bitmap, dev_t *rdev) { uint32_t major = 0, minor = 0; __be32 *p; + int ret = 0; *rdev = MKDEV(0,0); if (unlikely(bitmap[1] & (FATTR4_WORD1_RAWDEV - 1U))) @@ -2679,9 +2704,10 @@ static int decode_attr_rdev(struct xdr_stream *xdr, uint32_t *bitmap, dev_t *rde if (MAJOR(tmp) == major && MINOR(tmp) == minor) *rdev = tmp; bitmap[1] &= ~ FATTR4_WORD1_RAWDEV; + ret = NFS_ATTR_FATTR_RDEV; } dprintk("%s: rdev=(0x%x:0x%x)\n", __func__, major, minor); - return 0; + return ret; } static int decode_attr_space_avail(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res) @@ -2738,6 +2764,7 @@ static int decode_attr_space_total(struct xdr_stream *xdr, uint32_t *bitmap, uin static int decode_attr_space_used(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *used) { __be32 *p; + int ret = 0; *used = 0; if (unlikely(bitmap[1] & (FATTR4_WORD1_SPACE_USED - 1U))) @@ -2746,10 +2773,11 @@ static int decode_attr_space_used(struct xdr_stream *xdr, uint32_t *bitmap, uint READ_BUF(8); READ64(*used); bitmap[1] &= ~FATTR4_WORD1_SPACE_USED; + ret = NFS_ATTR_FATTR_SPACE_USED; } dprintk("%s: space used=%Lu\n", __func__, (unsigned long long)*used); - return 0; + return ret; } static int decode_attr_time(struct xdr_stream *xdr, struct timespec *time) @@ -2776,6 +2804,8 @@ static int decode_attr_time_access(struct xdr_stream *xdr, uint32_t *bitmap, str return -EIO; if (likely(bitmap[1] & FATTR4_WORD1_TIME_ACCESS)) { status = decode_attr_time(xdr, time); + if (status == 0) + status = NFS_ATTR_FATTR_ATIME; bitmap[1] &= ~FATTR4_WORD1_TIME_ACCESS; } dprintk("%s: atime=%ld\n", __func__, (long)time->tv_sec); @@ -2792,6 +2822,8 @@ static int decode_attr_time_metadata(struct xdr_stream *xdr, uint32_t *bitmap, s return -EIO; if (likely(bitmap[1] & FATTR4_WORD1_TIME_METADATA)) { status = decode_attr_time(xdr, time); + if (status == 0) + status = NFS_ATTR_FATTR_CTIME; bitmap[1] &= ~FATTR4_WORD1_TIME_METADATA; } dprintk("%s: ctime=%ld\n", __func__, (long)time->tv_sec); @@ -2808,6 +2840,8 @@ static int decode_attr_time_modify(struct xdr_stream *xdr, uint32_t *bitmap, str return -EIO; if (likely(bitmap[1] & FATTR4_WORD1_TIME_MODIFY)) { status = decode_attr_time(xdr, time); + if (status == 0) + status = NFS_ATTR_FATTR_MTIME; bitmap[1] &= ~FATTR4_WORD1_TIME_MODIFY; } dprintk("%s: mtime=%ld\n", __func__, (long)time->tv_sec); @@ -3012,76 +3046,96 @@ static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr, cons status = decode_attr_type(xdr, bitmap, &type); if (status < 0) goto xdr_error; - fattr->mode = nfs_type2fmt[type]; + fattr->mode = 0; + if (status != 0) { + fattr->mode |= nfs_type2fmt[type]; + fattr->valid |= status; + } status = decode_attr_change(xdr, bitmap, &fattr->change_attr); if (status < 0) goto xdr_error; + fattr->valid |= status; status = decode_attr_size(xdr, bitmap, &fattr->size); if (status < 0) goto xdr_error; + fattr->valid |= status; status = decode_attr_fsid(xdr, bitmap, &fattr->fsid); if (status < 0) goto xdr_error; + fattr->valid |= status; status = decode_attr_fileid(xdr, bitmap, &fattr->fileid); if (status < 0) goto xdr_error; + fattr->valid |= status; status = decode_attr_fs_locations(xdr, bitmap, container_of(fattr, struct nfs4_fs_locations, fattr)); if (status < 0) goto xdr_error; + fattr->valid |= status; status = decode_attr_mode(xdr, bitmap, &fmode); if (status < 0) goto xdr_error; - fattr->mode |= fmode; + if (status != 0) { + fattr->mode |= fmode; + fattr->valid |= status; + } status = decode_attr_nlink(xdr, bitmap, &fattr->nlink); if (status < 0) goto xdr_error; + fattr->valid |= status; status = decode_attr_owner(xdr, bitmap, server->nfs_client, &fattr->uid); if (status < 0) goto xdr_error; + fattr->valid |= status; status = decode_attr_group(xdr, bitmap, server->nfs_client, &fattr->gid); if (status < 0) goto xdr_error; + fattr->valid |= status; status = decode_attr_rdev(xdr, bitmap, &fattr->rdev); if (status < 0) goto xdr_error; + fattr->valid |= status; status = decode_attr_space_used(xdr, bitmap, &fattr->du.nfs3.used); if (status < 0) goto xdr_error; + fattr->valid |= status; status = decode_attr_time_access(xdr, bitmap, &fattr->atime); if (status < 0) goto xdr_error; + fattr->valid |= status; status = decode_attr_time_metadata(xdr, bitmap, &fattr->ctime); if (status < 0) goto xdr_error; + fattr->valid |= status; status = decode_attr_time_modify(xdr, bitmap, &fattr->mtime); if (status < 0) goto xdr_error; + fattr->valid |= status; status = decode_attr_mounted_on_fileid(xdr, bitmap, &fileid); if (status < 0) goto xdr_error; - if (fattr->fileid == 0 && fileid != 0) + if (status != 0 && !(fattr->valid & status)) { fattr->fileid = fileid; + fattr->valid |= status; + } status = verify_attr_len(xdr, savep, attrlen); - if (status == 0) - fattr->valid = NFS_ATTR_FATTR_V4; xdr_error: dprintk("%s: xdr returned %d\n", __func__, -status); return status; From 69aaaae18f7027d9594bce100378f102926cc0be Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 11 Mar 2009 14:10:28 -0400 Subject: [PATCH 034/478] NFSv4: A referral is assumed to always point to a directory. Fix a bug whereby we would fail to create a mount point for a referral. Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 8dde84b988d9..aa433d077945 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -3678,6 +3678,19 @@ ssize_t nfs4_listxattr(struct dentry *dentry, char *buf, size_t buflen) return len; } +static void nfs_fixup_referral_attributes(struct nfs_fattr *fattr) +{ + if (!((fattr->valid & NFS_ATTR_FATTR_FILEID) && + (fattr->valid & NFS_ATTR_FATTR_FSID) && + (fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL))) + return; + + fattr->valid |= NFS_ATTR_FATTR_TYPE | NFS_ATTR_FATTR_MODE | + NFS_ATTR_FATTR_NLINK; + fattr->mode = S_IFDIR | S_IRUGO | S_IXUGO; + fattr->nlink = 2; +} + int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name, struct nfs4_fs_locations *fs_locations, struct page *page) { @@ -3704,6 +3717,7 @@ int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name, fs_locations->server = server; fs_locations->nlocations = 0; status = rpc_call_sync(server->client, &msg, 0); + nfs_fixup_referral_attributes(&fs_locations->fattr); dprintk("%s: returned status = %d\n", __func__, status); return status; } From a65318bf3afc93ce49227e849d213799b072c5fd Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 11 Mar 2009 14:10:28 -0400 Subject: [PATCH 035/478] NFSv4: Simplify some cache consistency post-op GETATTRs Certain asynchronous operations such as write() do not expect (or care) that other metadata such as the file owner, mode, acls, ... change. All they want to do is update and/or check the change attribute, ctime, and mtime. By skipping the file owner and group update, we also avoid having to do a potential idmapper upcall for these asynchronous RPC calls. Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 13 ++++++++----- include/linux/nfs_fs_sb.h | 5 +++++ 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index aa433d077945..101f5f4c304f 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -1439,7 +1439,7 @@ int nfs4_do_close(struct path *path, struct nfs4_state *state, int wait) if (calldata->arg.seqid == NULL) goto out_free_calldata; calldata->arg.fmode = 0; - calldata->arg.bitmask = server->attr_bitmask; + calldata->arg.bitmask = server->cache_consistency_bitmask; calldata->res.fattr = &calldata->fattr; calldata->res.seqid = calldata->arg.seqid; calldata->res.server = server; @@ -1600,6 +1600,9 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f server->caps |= NFS_CAP_HARDLINKS; if (res.has_symlinks != 0) server->caps |= NFS_CAP_SYMLINKS; + memcpy(server->cache_consistency_bitmask, res.attr_bitmask, sizeof(server->cache_consistency_bitmask)); + server->cache_consistency_bitmask[0] &= FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE; + server->cache_consistency_bitmask[1] &= FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY; server->acl_bitmask = res.acl_bitmask; } return status; @@ -2079,7 +2082,7 @@ static void nfs4_proc_unlink_setup(struct rpc_message *msg, struct inode *dir) struct nfs_removeargs *args = msg->rpc_argp; struct nfs_removeres *res = msg->rpc_resp; - args->bitmask = server->attr_bitmask; + args->bitmask = server->cache_consistency_bitmask; res->server = server; msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_REMOVE]; } @@ -2323,7 +2326,7 @@ static int _nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred, .pages = &page, .pgbase = 0, .count = count, - .bitmask = NFS_SERVER(dentry->d_inode)->attr_bitmask, + .bitmask = NFS_SERVER(dentry->d_inode)->cache_consistency_bitmask, }; struct nfs4_readdir_res res; struct rpc_message msg = { @@ -2552,7 +2555,7 @@ static void nfs4_proc_write_setup(struct nfs_write_data *data, struct rpc_messag { struct nfs_server *server = NFS_SERVER(data->inode); - data->args.bitmask = server->attr_bitmask; + data->args.bitmask = server->cache_consistency_bitmask; data->res.server = server; data->timestamp = jiffies; @@ -2575,7 +2578,7 @@ static void nfs4_proc_commit_setup(struct nfs_write_data *data, struct rpc_messa { struct nfs_server *server = NFS_SERVER(data->inode); - data->args.bitmask = server->attr_bitmask; + data->args.bitmask = server->cache_consistency_bitmask; data->res.server = server; msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_COMMIT]; } diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h index 9bb81aec91cf..29b1e40dce99 100644 --- a/include/linux/nfs_fs_sb.h +++ b/include/linux/nfs_fs_sb.h @@ -106,6 +106,11 @@ struct nfs_server { u32 attr_bitmask[2];/* V4 bitmask representing the set of attributes supported on this filesystem */ + u32 cache_consistency_bitmask[2]; + /* V4 bitmask representing the subset + of change attribute, size, ctime + and mtime attributes supported by + the server */ u32 acl_bitmask; /* V4 bitmask representing the ACEs that are supported on this filesystem */ From fb8a1f11b64e213d94dfa1cebb2a42a7b8c115c4 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 11 Mar 2009 14:10:29 -0400 Subject: [PATCH 036/478] NFS: cleanup - remove struct nfs_inode->ncommit Signed-off-by: Trond Myklebust --- fs/nfs/inode.c | 1 - fs/nfs/write.c | 25 ++++++++++++++++--------- include/linux/nfs_fs.h | 3 +-- 3 files changed, 17 insertions(+), 12 deletions(-) diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index b7656bd3706f..00f116cdadc6 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -1345,7 +1345,6 @@ static void init_once(void *foo) INIT_LIST_HEAD(&nfsi->access_cache_entry_lru); INIT_LIST_HEAD(&nfsi->access_cache_inode_lru); INIT_RADIX_TREE(&nfsi->nfs_page_tree, GFP_ATOMIC); - nfsi->ncommit = 0; nfsi->npages = 0; atomic_set(&nfsi->silly_count, 1); INIT_HLIST_HEAD(&nfsi->silly_list); diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 9f9845859fc1..1a999939fedf 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -404,7 +404,6 @@ nfs_mark_request_commit(struct nfs_page *req) struct nfs_inode *nfsi = NFS_I(inode); spin_lock(&inode->i_lock); - nfsi->ncommit++; set_bit(PG_CLEAN, &(req)->wb_flags); radix_tree_tag_set(&nfsi->nfs_page_tree, req->wb_index, @@ -523,6 +522,12 @@ static void nfs_cancel_commit_list(struct list_head *head) } } +static int +nfs_need_commit(struct nfs_inode *nfsi) +{ + return radix_tree_tagged(&nfsi->nfs_page_tree, NFS_PAGE_TAG_COMMIT); +} + #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) /* * nfs_scan_commit - Scan an inode for commit requests @@ -538,16 +543,18 @@ static int nfs_scan_commit(struct inode *inode, struct list_head *dst, pgoff_t idx_start, unsigned int npages) { struct nfs_inode *nfsi = NFS_I(inode); - int res = 0; - if (nfsi->ncommit != 0) { - res = nfs_scan_list(nfsi, dst, idx_start, npages, - NFS_PAGE_TAG_COMMIT); - nfsi->ncommit -= res; - } - return res; + if (!nfs_need_commit(nfsi)) + return 0; + + return nfs_scan_list(nfsi, dst, idx_start, npages, NFS_PAGE_TAG_COMMIT); } #else +static inline int nfs_need_commit(struct nfs_inode *nfsi) +{ + return 0; +} + static inline int nfs_scan_commit(struct inode *inode, struct list_head *dst, pgoff_t idx_start, unsigned int npages) { return 0; @@ -820,7 +827,7 @@ static int nfs_write_rpcsetup(struct nfs_page *req, data->args.stable = NFS_UNSTABLE; if (how & FLUSH_STABLE) { data->args.stable = NFS_DATA_SYNC; - if (!NFS_I(inode)->ncommit) + if (!nfs_need_commit(NFS_I(inode))) data->args.stable = NFS_FILE_SYNC; } diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index db867b04ac3c..c9fecd3e8f0f 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -166,8 +166,7 @@ struct nfs_inode { */ struct radix_tree_root nfs_page_tree; - unsigned long ncommit, - npages; + unsigned long npages; /* Open contexts for shared mmap writes */ struct list_head open_files; From 72cb77f4a5ace37b12dcb47a0e8637a2c28ad881 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 11 Mar 2009 14:10:30 -0400 Subject: [PATCH 037/478] NFS: Throttle page dirtying while we're flushing to disk The following patch is a combination of a patch by myself and Peter Staubach. Trond: If we allow other processes to dirty pages while a process is doing a consistency sync to disk, we can end up never making progress. Peter: Attached is a patch which addresses a continuing problem with the NFS client generating out of order WRITE requests. While this is compliant with all of the current protocol specifications, there are servers in the market which can not handle out of order WRITE requests very well. Also, this may lead to sub-optimal block allocations in the underlying file system on the server. This may cause the read throughputs to be reduced when reading the file from the server. Peter: There has been a lot of work recently done to address out of order issues on a systemic level. However, the NFS client is still susceptible to the problem. Out of order WRITE requests can occur when pdflush is in the middle of writing out pages while the process dirtying the pages calls generic_file_buffered_write which calls generic_perform_write which calls balance_dirty_pages_rate_limited which ends up calling writeback_inodes which ends up calling back into the NFS client to writes out dirty pages for the same file that pdflush happens to be working with. Signed-off-by: Peter Staubach [modification by Trond to merge the two similar patches] Signed-off-by: Trond Myklebust --- fs/nfs/file.c | 9 +++++++++ fs/nfs/inode.c | 12 ++++++++++++ fs/nfs/internal.h | 1 + fs/nfs/nfs4proc.c | 10 +--------- fs/nfs/pagelist.c | 11 ----------- fs/nfs/write.c | 28 +++++++++++++++++++--------- include/linux/nfs_fs.h | 1 + 7 files changed, 43 insertions(+), 29 deletions(-) diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 90f292b520d2..404c19c866a7 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -354,6 +354,15 @@ static int nfs_write_begin(struct file *file, struct address_space *mapping, file->f_path.dentry->d_name.name, mapping->host->i_ino, len, (long long) pos); + /* + * Prevent starvation issues if someone is doing a consistency + * sync-to-disk + */ + ret = wait_on_bit(&NFS_I(mapping->host)->flags, NFS_INO_FLUSHING, + nfs_wait_bit_killable, TASK_KILLABLE); + if (ret) + return ret; + page = grab_cache_page_write_begin(mapping, index, flags); if (!page) return -ENOMEM; diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 00f116cdadc6..c40adc5dd609 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -65,6 +65,18 @@ nfs_fattr_to_ino_t(struct nfs_fattr *fattr) return nfs_fileid_to_ino_t(fattr->fileid); } +/** + * nfs_wait_bit_killable - helper for functions that are sleeping on bit locks + * @word: long word containing the bit lock + */ +int nfs_wait_bit_killable(void *word) +{ + if (fatal_signal_pending(current)) + return -ERESTARTSYS; + schedule(); + return 0; +} + /** * nfs_compat_user_ino64 - returns the user-visible inode number * @fileid: 64-bit fileid diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 340ede8f608f..a55e69aa52e5 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -165,6 +165,7 @@ extern void nfs_clear_inode(struct inode *); extern void nfs4_clear_inode(struct inode *); #endif void nfs_zap_acl_cache(struct inode *inode); +extern int nfs_wait_bit_killable(void *word); /* super.c */ void nfs_parse_ip_address(char *, size_t, struct sockaddr *, size_t *); diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 101f5f4c304f..95f171e7e05a 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -193,14 +193,6 @@ static void nfs4_setup_readdir(u64 cookie, __be32 *verifier, struct dentry *dent kunmap_atomic(start, KM_USER0); } -static int nfs4_wait_bit_killable(void *word) -{ - if (fatal_signal_pending(current)) - return -ERESTARTSYS; - schedule(); - return 0; -} - static int nfs4_wait_clnt_recover(struct nfs_client *clp) { int res; @@ -208,7 +200,7 @@ static int nfs4_wait_clnt_recover(struct nfs_client *clp) might_sleep(); res = wait_on_bit(&clp->cl_state, NFS4CLNT_MANAGER_RUNNING, - nfs4_wait_bit_killable, TASK_KILLABLE); + nfs_wait_bit_killable, TASK_KILLABLE); return res; } diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index 7f079209d70a..e2975939126a 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -176,17 +176,6 @@ void nfs_release_request(struct nfs_page *req) kref_put(&req->wb_kref, nfs_free_request); } -static int nfs_wait_bit_killable(void *word) -{ - int ret = 0; - - if (fatal_signal_pending(current)) - ret = -ERESTARTSYS; - else - schedule(); - return ret; -} - /** * nfs_wait_on_request - Wait for a request to complete. * @req: request to wait upon. diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 1a999939fedf..36fd35e0de83 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -313,19 +313,34 @@ static int nfs_writepages_callback(struct page *page, struct writeback_control * int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc) { struct inode *inode = mapping->host; + unsigned long *bitlock = &NFS_I(inode)->flags; struct nfs_pageio_descriptor pgio; int err; + /* Stop dirtying of new pages while we sync */ + err = wait_on_bit_lock(bitlock, NFS_INO_FLUSHING, + nfs_wait_bit_killable, TASK_KILLABLE); + if (err) + goto out_err; + nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGES); nfs_pageio_init_write(&pgio, inode, wb_priority(wbc)); err = write_cache_pages(mapping, wbc, nfs_writepages_callback, &pgio); nfs_pageio_complete(&pgio); + + clear_bit_unlock(NFS_INO_FLUSHING, bitlock); + smp_mb__after_clear_bit(); + wake_up_bit(bitlock, NFS_INO_FLUSHING); + if (err < 0) - return err; - if (pgio.pg_error < 0) - return pgio.pg_error; + goto out_err; + err = pgio.pg_error; + if (err < 0) + goto out_err; return 0; +out_err: + return err; } /* @@ -1432,18 +1447,13 @@ static int nfs_write_mapping(struct address_space *mapping, int how) { struct writeback_control wbc = { .bdi = mapping->backing_dev_info, - .sync_mode = WB_SYNC_NONE, + .sync_mode = WB_SYNC_ALL, .nr_to_write = LONG_MAX, .range_start = 0, .range_end = LLONG_MAX, .for_writepages = 1, }; - int ret; - ret = __nfs_write_mapping(mapping, &wbc, how); - if (ret < 0) - return ret; - wbc.sync_mode = WB_SYNC_ALL; return __nfs_write_mapping(mapping, &wbc, how); } diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index c9fecd3e8f0f..933bc261c0df 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -206,6 +206,7 @@ struct nfs_inode { #define NFS_INO_STALE (1) /* possible stale inode */ #define NFS_INO_ACL_LRU_SET (2) /* Inode is on the LRU list */ #define NFS_INO_MOUNTPOINT (3) /* inode is remote mountpoint */ +#define NFS_INO_FLUSHING (4) /* inode is flushing out data */ static inline struct nfs_inode *NFS_I(const struct inode *inode) { From e1ebfd33be068ec933f8954060a499bd22ad6f69 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 11 Mar 2009 14:37:54 -0400 Subject: [PATCH 038/478] NFS: Kill the "defined but not used" compile error on nommu machines Bryan Wu reports that when compiling NFS on nommu machines he gets a "defined but not used" error on nfs_file_mmap(). The easiest fix is simply to get rid of the special casing in NFS, and just always call generic_file_mmap() to set up the file. Signed-off-by: Trond Myklebust --- fs/nfs/file.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 404c19c866a7..1eab9c9ad242 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -64,11 +64,7 @@ const struct file_operations nfs_file_operations = { .write = do_sync_write, .aio_read = nfs_file_read, .aio_write = nfs_file_write, -#ifdef CONFIG_MMU .mmap = nfs_file_mmap, -#else - .mmap = generic_file_mmap, -#endif .open = nfs_file_open, .flush = nfs_file_flush, .release = nfs_file_release, @@ -304,11 +300,13 @@ nfs_file_mmap(struct file * file, struct vm_area_struct * vma) dprintk("NFS: mmap(%s/%s)\n", dentry->d_parent->d_name.name, dentry->d_name.name); - status = nfs_revalidate_mapping(inode, file->f_mapping); + /* Note: generic_file_mmap() returns ENOSYS on nommu systems + * so we call that before revalidating the mapping + */ + status = generic_file_mmap(file, vma); if (!status) { vma->vm_ops = &nfs_file_vm_ops; - vma->vm_flags |= VM_CAN_NONLINEAR; - file_accessed(file); + status = nfs_revalidate_mapping(inode, file->f_mapping); } return status; } From b1e1e158779f1d99c2cc18e466f6bf9099fc0853 Mon Sep 17 00:00:00 2001 From: Tom Talpey Date: Wed, 11 Mar 2009 14:37:55 -0400 Subject: [PATCH 039/478] SVCRDMA: remove faulty assertions in rpc/rdma chunk validation. Certain client-provided RPCRDMA chunk alignments result in an additional scatter/gather entry, which triggered nfs/rdma server assertions incorrectly. OpenSolaris nfs/rdma client connectathon testing was blocked by these in the special/locking section. Signed-off-by: Tom Talpey Cc: Tom Tucker Signed-off-by: Trond Myklebust --- net/sunrpc/xprtrdma/svc_rdma_sendto.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c index a3334e3b73cc..d0bea987d80e 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -191,7 +191,6 @@ static int map_xdr(struct svcxprt_rdma *xprt, struct xdr_buf *xdr, struct svc_rdma_req_map *vec) { - int sge_max = (xdr->len+PAGE_SIZE-1) / PAGE_SIZE + 3; int sge_no; u32 sge_bytes; u32 page_bytes; @@ -235,7 +234,11 @@ static int map_xdr(struct svcxprt_rdma *xprt, sge_no++; } - BUG_ON(sge_no > sge_max); + dprintk("svcrdma: map_xdr: sge_no %d page_no %d " + "page_base %zd page_len %zd head_len %d tail_len %d\n", + sge_no, page_no, xdr->page_base, xdr->page_len, + xdr->head[0].iov_len, xdr->tail[0].iov_len); + vec->count = sge_no; return 0; } @@ -579,7 +582,6 @@ static int send_reply(struct svcxprt_rdma *rdma, ctxt->sge[page_no+1].length = 0; } BUG_ON(sge_no > rdma->sc_max_sge); - BUG_ON(sge_no > ctxt->count); memset(&send_wr, 0, sizeof send_wr); ctxt->wr_op = IB_WR_SEND; send_wr.wr_id = (unsigned long)ctxt; From b38ab40ad58c1fc43ea590d6342f6a6763ac8fb6 Mon Sep 17 00:00:00 2001 From: Tom Talpey Date: Wed, 11 Mar 2009 14:37:55 -0400 Subject: [PATCH 040/478] XPRTRDMA: correct an rpc/rdma inline send marshaling error Certain client rpc's which contain both lengthy page-contained metadata and a non-empty xdr_tail buffer require careful handling to avoid overlapped memory copying. Rearranging of existing rpcrdma marshaling code avoids it; this fixes an NFSv4 symlink creation error detected with connectathon basic/test8 to multiple servers. Signed-off-by: Tom Talpey Signed-off-by: Trond Myklebust --- net/sunrpc/xprtrdma/rpc_rdma.c | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index 14106d26bb95..e5e28d1946a4 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -310,6 +310,19 @@ rpcrdma_inline_pullup(struct rpc_rqst *rqst, int pad) __func__, pad, destp, rqst->rq_slen, curlen); copy_len = rqst->rq_snd_buf.page_len; + + if (rqst->rq_snd_buf.tail[0].iov_len) { + curlen = rqst->rq_snd_buf.tail[0].iov_len; + if (destp + copy_len != rqst->rq_snd_buf.tail[0].iov_base) { + memmove(destp + copy_len, + rqst->rq_snd_buf.tail[0].iov_base, curlen); + r_xprt->rx_stats.pullup_copy_count += curlen; + } + dprintk("RPC: %s: tail destp 0x%p len %d\n", + __func__, destp + copy_len, curlen); + rqst->rq_svec[0].iov_len += curlen; + } + r_xprt->rx_stats.pullup_copy_count += copy_len; npages = PAGE_ALIGN(rqst->rq_snd_buf.page_base+copy_len) >> PAGE_SHIFT; for (i = 0; copy_len && i < npages; i++) { @@ -332,17 +345,6 @@ rpcrdma_inline_pullup(struct rpc_rqst *rqst, int pad) destp += curlen; copy_len -= curlen; } - if (rqst->rq_snd_buf.tail[0].iov_len) { - curlen = rqst->rq_snd_buf.tail[0].iov_len; - if (destp != rqst->rq_snd_buf.tail[0].iov_base) { - memcpy(destp, - rqst->rq_snd_buf.tail[0].iov_base, curlen); - r_xprt->rx_stats.pullup_copy_count += curlen; - } - dprintk("RPC: %s: tail destp 0x%p len %d curlen %d\n", - __func__, destp, copy_len, curlen); - rqst->rq_svec[0].iov_len += curlen; - } /* header now contains entire send message */ return pad; } @@ -656,7 +658,7 @@ rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len, int pad) if (curlen > rqst->rq_rcv_buf.tail[0].iov_len) curlen = rqst->rq_rcv_buf.tail[0].iov_len; if (rqst->rq_rcv_buf.tail[0].iov_base != srcp) - memcpy(rqst->rq_rcv_buf.tail[0].iov_base, srcp, curlen); + memmove(rqst->rq_rcv_buf.tail[0].iov_base, srcp, curlen); dprintk("RPC: %s: tail srcp 0x%p len %d curlen %d\n", __func__, srcp, copy_len, curlen); rqst->rq_rcv_buf.tail[0].iov_len = curlen; From 441e3e242903f9b190d5764bed73edb58f977413 Mon Sep 17 00:00:00 2001 From: Tom Talpey Date: Wed, 11 Mar 2009 14:37:56 -0400 Subject: [PATCH 041/478] SUNRPC: dynamically load RPC transport modules on-demand Provide an api to attempt to load any necessary kernel RPC client transport module automatically. By convention, the desired module name is "xprt"+"transport name". For example, when NFS mounting with "-o proto=rdma", attempt to load the "xprtrdma" module. Signed-off-by: Tom Talpey Cc: Chuck Lever Signed-off-by: Trond Myklebust --- include/linux/sunrpc/xprt.h | 1 + net/sunrpc/xprt.c | 31 +++++++++++++++++++++++++++++++ 2 files changed, 32 insertions(+) diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index 11fc71d50c1e..2b0d960603b9 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -235,6 +235,7 @@ static inline __be32 *xprt_skip_transport_header(struct rpc_xprt *xprt, __be32 * */ int xprt_register_transport(struct xprt_class *type); int xprt_unregister_transport(struct xprt_class *type); +int xprt_load_transport(const char *); void xprt_set_retrans_timeout_def(struct rpc_task *task); void xprt_set_retrans_timeout_rtt(struct rpc_task *task); void xprt_wake_pending_tasks(struct rpc_xprt *xprt, int status); diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 62098d101a1f..d1afec640394 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -151,6 +151,37 @@ out: } EXPORT_SYMBOL_GPL(xprt_unregister_transport); +/** + * xprt_load_transport - load a transport implementation + * @transport_name: transport to load + * + * Returns: + * 0: transport successfully loaded + * -ENOENT: transport module not available + */ +int xprt_load_transport(const char *transport_name) +{ + struct xprt_class *t; + char module_name[sizeof t->name + 5]; + int result; + + result = 0; + spin_lock(&xprt_list_lock); + list_for_each_entry(t, &xprt_list, list) { + if (strcmp(t->name, transport_name) == 0) { + spin_unlock(&xprt_list_lock); + goto out; + } + } + spin_unlock(&xprt_list_lock); + strcpy(module_name, "xprt"); + strncat(module_name, transport_name, sizeof t->name); + result = request_module(module_name); +out: + return result; +} +EXPORT_SYMBOL_GPL(xprt_load_transport); + /** * xprt_reserve_xprt - serialize write access to transports * @task: task that is requesting access to the transport From a67d18f89f5782806135aad4ee012ff78d45aae7 Mon Sep 17 00:00:00 2001 From: Tom Talpey Date: Wed, 11 Mar 2009 14:37:56 -0400 Subject: [PATCH 042/478] NFS: load the rpc/rdma transport module automatically When mounting an NFS/RDMA server with the "-o proto=rdma" or "-o rdma" options, attempt to dynamically load the necessary "xprtrdma" client transport module. Doing so improves usability, while avoiding a static module dependency and any unnecesary resources. Signed-off-by: Tom Talpey Cc: Chuck Lever Signed-off-by: Trond Myklebust --- fs/nfs/super.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/nfs/super.c b/fs/nfs/super.c index d6686f4786dc..0942fcbbad3c 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -1018,6 +1018,7 @@ static int nfs_parse_mount_options(char *raw, case Opt_rdma: mnt->flags |= NFS_MOUNT_TCP; /* for side protocols */ mnt->nfs_server.protocol = XPRT_TRANSPORT_RDMA; + xprt_load_transport(p); break; case Opt_acl: mnt->flags &= ~NFS_MOUNT_NOACL; @@ -1205,12 +1206,14 @@ static int nfs_parse_mount_options(char *raw, /* vector side protocols to TCP */ mnt->flags |= NFS_MOUNT_TCP; mnt->nfs_server.protocol = XPRT_TRANSPORT_RDMA; + xprt_load_transport(string); break; default: errors++; dfprintk(MOUNT, "NFS: unrecognized " "transport protocol\n"); } + kfree(string); break; case Opt_mountproto: string = match_strdup(args); @@ -1218,7 +1221,6 @@ static int nfs_parse_mount_options(char *raw, goto out_nomem; token = match_token(string, nfs_xprt_protocol_tokens, args); - kfree(string); switch (token) { case Opt_xprt_udp: From 15f081ca8ddfe150fb639c591b18944a539da0fc Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 11 Mar 2009 14:37:57 -0400 Subject: [PATCH 043/478] SUNRPC: Avoid an unnecessary task reschedule on ENOTCONN If the socket is unconnected, and xprt_transmit() returns ENOTCONN, we currently give up the lock on the transport channel. Doing so means that the lock automatically gets assigned to the next task in the xprt->sending queue, and so that task needs to be woken up to do the actual connect. The following patch aims to avoid that unnecessary task switch. Signed-off-by: Trond Myklebust --- net/sunrpc/clnt.c | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 836f15c0c4a3..07e9b05321e6 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -1105,14 +1105,24 @@ static void call_transmit_status(struct rpc_task *task) { task->tk_action = call_status; - /* - * Special case: if we've been waiting on the socket's write_space() - * callback, then don't call xprt_end_transmit(). - */ - if (task->tk_status == -EAGAIN) - return; - xprt_end_transmit(task); - rpc_task_force_reencode(task); + switch (task->tk_status) { + case -EAGAIN: + break; + default: + xprt_end_transmit(task); + /* + * Special cases: if we've been waiting on the + * socket's write_space() callback, or if the + * socket just returned a connection error, + * then hold onto the transport lock. + */ + case -ECONNREFUSED: + case -ENOTCONN: + case -EHOSTDOWN: + case -EHOSTUNREACH: + case -ENETUNREACH: + rpc_task_force_reencode(task); + } } /* From 670f94573104b4a25525d3fcdcd6496c678df172 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 11 Mar 2009 14:37:58 -0400 Subject: [PATCH 044/478] SUNRPC: Ensure we set XPRT_CLOSING only after we've sent a tcp FIN... ...so that we can distinguish between when we need to shutdown and when we don't. Also remove the call to xs_tcp_shutdown() from xs_tcp_connect(), since xprt_connect() makes the same test. Signed-off-by: Trond Myklebust --- net/sunrpc/xprtsock.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 1127eb934136..cb4bd93b9211 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -1180,7 +1180,6 @@ static void xs_tcp_state_change(struct sock *sk) break; case TCP_CLOSE_WAIT: /* The server initiated a shutdown of the socket */ - set_bit(XPRT_CLOSING, &xprt->state); xprt_force_disconnect(xprt); case TCP_SYN_SENT: xprt->connect_cookie++; @@ -1193,6 +1192,7 @@ static void xs_tcp_state_change(struct sock *sk) xprt->reestablish_timeout = XS_TCP_INIT_REEST_TO; break; case TCP_LAST_ACK: + set_bit(XPRT_CLOSING, &xprt->state); smp_mb__before_clear_bit(); clear_bit(XPRT_CONNECTED, &xprt->state); smp_mb__after_clear_bit(); @@ -1836,9 +1836,6 @@ static void xs_tcp_connect(struct rpc_task *task) { struct rpc_xprt *xprt = task->tk_xprt; - /* Initiate graceful shutdown of the socket if not already done */ - if (test_bit(XPRT_CONNECTED, &xprt->state)) - xs_tcp_shutdown(xprt); /* Exit if we need to wait for socket shutdown to complete */ if (test_bit(XPRT_CLOSING, &xprt->state)) return; From 40d2549db5f515e415894def98b49db7d4c56714 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 11 Mar 2009 14:37:58 -0400 Subject: [PATCH 045/478] SUNRPC: Don't disconnect if a connection is still in progress. Signed-off-by: Trond Myklebust --- net/sunrpc/xprtsock.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index cb4bd93b9211..9d1898f6ee87 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -1613,10 +1613,9 @@ out: * We need to preserve the port number so the reply cache on the server can * find our cached RPC replies when we get around to reconnecting. */ -static void xs_tcp_reuse_connection(struct rpc_xprt *xprt) +static void xs_abort_connection(struct rpc_xprt *xprt, struct sock_xprt *transport) { int result; - struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); struct sockaddr any; dprintk("RPC: disconnecting xprt %p to reuse port\n", xprt); @@ -1633,6 +1632,17 @@ static void xs_tcp_reuse_connection(struct rpc_xprt *xprt) result); } +static void xs_tcp_reuse_connection(struct rpc_xprt *xprt, struct sock_xprt *transport) +{ + unsigned int state = transport->inet->sk_state; + + if (state == TCP_CLOSE && transport->sock->state == SS_UNCONNECTED) + return; + if ((1 << state) & (TCPF_ESTABLISHED|TCPF_SYN_SENT)) + return; + xs_abort_connection(xprt, transport); +} + static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock) { struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); @@ -1706,7 +1716,7 @@ static void xs_tcp_connect_worker4(struct work_struct *work) } } else /* "close" the socket, preserving the local port */ - xs_tcp_reuse_connection(xprt); + xs_tcp_reuse_connection(xprt, transport); dprintk("RPC: worker connecting xprt %p to address: %s\n", xprt, xprt->address_strings[RPC_DISPLAY_ALL]); @@ -1766,7 +1776,7 @@ static void xs_tcp_connect_worker6(struct work_struct *work) } } else /* "close" the socket, preserving the local port */ - xs_tcp_reuse_connection(xprt); + xs_tcp_reuse_connection(xprt, transport); dprintk("RPC: worker connecting xprt %p to address: %s\n", xprt, xprt->address_strings[RPC_DISPLAY_ALL]); From c8485e4d634f6df155040293928707f127f0d06d Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 11 Mar 2009 14:37:59 -0400 Subject: [PATCH 046/478] SUNRPC: Handle ECONNREFUSED correctly in xprt_transmit() If we get an ECONNREFUSED error, we currently go to sleep on the 'xprt->sending' wait queue. The problem is that no timeout is set there, and there is nothing else that will wake the task up later. We should deal with ECONNREFUSED in call_status, given that is where we also deal with -EHOSTDOWN, and friends. Signed-off-by: Trond Myklebust --- net/sunrpc/clnt.c | 7 ++++++- net/sunrpc/xprt.c | 40 +++++++++++++++++----------------------- net/sunrpc/xprtsock.c | 26 ++++++++++++-------------- 3 files changed, 35 insertions(+), 38 deletions(-) diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 07e9b05321e6..145715b53115 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -1117,10 +1117,12 @@ call_transmit_status(struct rpc_task *task) * then hold onto the transport lock. */ case -ECONNREFUSED: + case -ECONNRESET: case -ENOTCONN: case -EHOSTDOWN: case -EHOSTUNREACH: case -ENETUNREACH: + case -EPIPE: rpc_task_force_reencode(task); } } @@ -1162,9 +1164,12 @@ call_status(struct rpc_task *task) xprt_conditional_disconnect(task->tk_xprt, req->rq_connect_cookie); break; + case -ECONNRESET: case -ECONNREFUSED: - case -ENOTCONN: rpc_force_rebind(clnt); + rpc_delay(task, 3*HZ); + case -EPIPE: + case -ENOTCONN: task->tk_action = call_bind; break; case -EAGAIN: diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index d1afec640394..d588e755e107 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -901,32 +901,26 @@ void xprt_transmit(struct rpc_task *task) req->rq_connect_cookie = xprt->connect_cookie; req->rq_xtime = jiffies; status = xprt->ops->send_request(task); - if (status == 0) { - dprintk("RPC: %5u xmit complete\n", task->tk_pid); - spin_lock_bh(&xprt->transport_lock); - - xprt->ops->set_retrans_timeout(task); - - xprt->stat.sends++; - xprt->stat.req_u += xprt->stat.sends - xprt->stat.recvs; - xprt->stat.bklog_u += xprt->backlog.qlen; - - /* Don't race with disconnect */ - if (!xprt_connected(xprt)) - task->tk_status = -ENOTCONN; - else if (!req->rq_received) - rpc_sleep_on(&xprt->pending, task, xprt_timer); - spin_unlock_bh(&xprt->transport_lock); + if (status != 0) { + task->tk_status = status; return; } - /* Note: at this point, task->tk_sleeping has not yet been set, - * hence there is no danger of the waking up task being put on - * schedq, and being picked up by a parallel run of rpciod(). - */ - task->tk_status = status; - if (status == -ECONNREFUSED) - rpc_sleep_on(&xprt->sending, task, NULL); + dprintk("RPC: %5u xmit complete\n", task->tk_pid); + spin_lock_bh(&xprt->transport_lock); + + xprt->ops->set_retrans_timeout(task); + + xprt->stat.sends++; + xprt->stat.req_u += xprt->stat.sends - xprt->stat.recvs; + xprt->stat.bklog_u += xprt->backlog.qlen; + + /* Don't race with disconnect */ + if (!xprt_connected(xprt)) + task->tk_status = -ENOTCONN; + else if (!req->rq_received) + rpc_sleep_on(&xprt->pending, task, xprt_timer); + spin_unlock_bh(&xprt->transport_lock); } static inline void do_xprt_reserve(struct rpc_task *task) diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 9d1898f6ee87..5e8198bede81 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -594,6 +594,8 @@ static int xs_udp_send_request(struct rpc_task *task) /* Still some bytes left; set up for a retry later. */ status = -EAGAIN; } + if (!transport->sock) + goto out; switch (status) { case -ENOTSOCK: @@ -603,19 +605,17 @@ static int xs_udp_send_request(struct rpc_task *task) case -EAGAIN: xs_nospace(task); break; + default: + dprintk("RPC: sendmsg returned unrecognized error %d\n", + -status); case -ENETUNREACH: case -EPIPE: case -ECONNREFUSED: /* When the server has died, an ICMP port unreachable message * prompts ECONNREFUSED. */ clear_bit(SOCK_ASYNC_NOSPACE, &transport->sock->flags); - break; - default: - clear_bit(SOCK_ASYNC_NOSPACE, &transport->sock->flags); - dprintk("RPC: sendmsg returned unrecognized error %d\n", - -status); } - +out: return status; } @@ -697,6 +697,8 @@ static int xs_tcp_send_request(struct rpc_task *task) status = -EAGAIN; break; } + if (!transport->sock) + goto out; switch (status) { case -ENOTSOCK: @@ -706,21 +708,17 @@ static int xs_tcp_send_request(struct rpc_task *task) case -EAGAIN: xs_nospace(task); break; + default: + dprintk("RPC: sendmsg returned unrecognized error %d\n", + -status); case -ECONNRESET: xs_tcp_shutdown(xprt); case -ECONNREFUSED: case -ENOTCONN: case -EPIPE: - status = -ENOTCONN; clear_bit(SOCK_ASYNC_NOSPACE, &transport->sock->flags); - break; - default: - dprintk("RPC: sendmsg returned unrecognized error %d\n", - -status); - clear_bit(SOCK_ASYNC_NOSPACE, &transport->sock->flags); - xs_tcp_shutdown(xprt); } - +out: return status; } From 482f32e65d31cbf88d08306fa5d397cc945c3c26 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 11 Mar 2009 14:38:00 -0400 Subject: [PATCH 047/478] SUNRPC: Handle socket errors correctly Ensure that we pick up and handle socket errors as they occur. Signed-off-by: Trond Myklebust --- net/sunrpc/xprtsock.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 5e8198bede81..879af6f27b4c 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -1208,23 +1208,20 @@ static void xs_tcp_state_change(struct sock *sk) } /** - * xs_tcp_error_report - callback mainly for catching RST events + * xs_error_report - callback mainly for catching socket errors * @sk: socket */ -static void xs_tcp_error_report(struct sock *sk) +static void xs_error_report(struct sock *sk) { struct rpc_xprt *xprt; read_lock(&sk->sk_callback_lock); - if (sk->sk_err != ECONNRESET || sk->sk_state != TCP_ESTABLISHED) - goto out; if (!(xprt = xprt_from_sock(sk))) goto out; dprintk("RPC: %s client %p...\n" "RPC: error %d\n", __func__, xprt, sk->sk_err); - - xprt_force_disconnect(xprt); + xprt_wake_pending_tasks(xprt, -EAGAIN); out: read_unlock(&sk->sk_callback_lock); } @@ -1509,6 +1506,7 @@ static void xs_udp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock) sk->sk_user_data = xprt; sk->sk_data_ready = xs_udp_data_ready; sk->sk_write_space = xs_udp_write_space; + sk->sk_error_report = xs_error_report; sk->sk_no_check = UDP_CSUM_NORCV; sk->sk_allocation = GFP_ATOMIC; @@ -1656,7 +1654,7 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock) sk->sk_data_ready = xs_tcp_data_ready; sk->sk_state_change = xs_tcp_state_change; sk->sk_write_space = xs_tcp_write_space; - sk->sk_error_report = xs_tcp_error_report; + sk->sk_error_report = xs_error_report; sk->sk_allocation = GFP_ATOMIC; /* socket options */ From 2a4919919a97911b0aa4b9f5ac1eab90ba87652b Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 11 Mar 2009 14:38:00 -0400 Subject: [PATCH 048/478] SUNRPC: Return EAGAIN instead of ENOTCONN when waking up xprt->pending While we should definitely return socket errors to the task that is currently trying to send data, there is no need to propagate the same error to all the other tasks on xprt->pending. Doing so actually slows down recovery, since it causes more than one tasks to attempt socket recovery. Signed-off-by: Trond Myklebust --- net/sunrpc/clnt.c | 15 +++-------- net/sunrpc/xprt.c | 20 +++++---------- net/sunrpc/xprtsock.c | 58 +++++++++++++++++++++++-------------------- 3 files changed, 41 insertions(+), 52 deletions(-) diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 145715b53115..5abab094441f 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -1032,27 +1032,20 @@ call_connect_status(struct rpc_task *task) dprint_status(task); task->tk_status = 0; - if (status >= 0) { + if (status >= 0 || status == -EAGAIN) { clnt->cl_stats->netreconn++; task->tk_action = call_transmit; return; } - /* Something failed: remote service port may have changed */ - rpc_force_rebind(clnt); - switch (status) { - case -ENOTCONN: - case -EAGAIN: - task->tk_action = call_bind; - if (!RPC_IS_SOFT(task)) - return; /* if soft mounted, test if we've timed out */ case -ETIMEDOUT: task->tk_action = call_timeout; - return; + break; + default: + rpc_exit(task, -EIO); } - rpc_exit(task, -EIO); } /* diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index d588e755e107..a0bfe53f1621 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -611,7 +611,7 @@ void xprt_disconnect_done(struct rpc_xprt *xprt) dprintk("RPC: disconnected transport %p\n", xprt); spin_lock_bh(&xprt->transport_lock); xprt_clear_connected(xprt); - xprt_wake_pending_tasks(xprt, -ENOTCONN); + xprt_wake_pending_tasks(xprt, -EAGAIN); spin_unlock_bh(&xprt->transport_lock); } EXPORT_SYMBOL_GPL(xprt_disconnect_done); @@ -629,7 +629,7 @@ void xprt_force_disconnect(struct rpc_xprt *xprt) /* Try to schedule an autoclose RPC call */ if (test_and_set_bit(XPRT_LOCKED, &xprt->state) == 0) queue_work(rpciod_workqueue, &xprt->task_cleanup); - xprt_wake_pending_tasks(xprt, -ENOTCONN); + xprt_wake_pending_tasks(xprt, -EAGAIN); spin_unlock_bh(&xprt->transport_lock); } @@ -656,7 +656,7 @@ void xprt_conditional_disconnect(struct rpc_xprt *xprt, unsigned int cookie) /* Try to schedule an autoclose RPC call */ if (test_and_set_bit(XPRT_LOCKED, &xprt->state) == 0) queue_work(rpciod_workqueue, &xprt->task_cleanup); - xprt_wake_pending_tasks(xprt, -ENOTCONN); + xprt_wake_pending_tasks(xprt, -EAGAIN); out: spin_unlock_bh(&xprt->transport_lock); } @@ -726,9 +726,8 @@ static void xprt_connect_status(struct rpc_task *task) } switch (task->tk_status) { - case -ENOTCONN: - dprintk("RPC: %5u xprt_connect_status: connection broken\n", - task->tk_pid); + case -EAGAIN: + dprintk("RPC: %5u xprt_connect_status: retrying\n", task->tk_pid); break; case -ETIMEDOUT: dprintk("RPC: %5u xprt_connect_status: connect attempt timed " @@ -849,15 +848,8 @@ int xprt_prepare_transmit(struct rpc_task *task) err = req->rq_received; goto out_unlock; } - if (!xprt->ops->reserve_xprt(task)) { + if (!xprt->ops->reserve_xprt(task)) err = -EAGAIN; - goto out_unlock; - } - - if (!xprt_connected(xprt)) { - err = -ENOTCONN; - goto out_unlock; - } out_unlock: spin_unlock_bh(&xprt->transport_lock); return err; diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 879af6f27b4c..8e58b0b5460b 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -1162,7 +1162,7 @@ static void xs_tcp_state_change(struct sock *sk) transport->tcp_flags = TCP_RCV_COPY_FRAGHDR | TCP_RCV_COPY_XID; - xprt_wake_pending_tasks(xprt, 0); + xprt_wake_pending_tasks(xprt, -EAGAIN); } spin_unlock_bh(&xprt->transport_lock); break; @@ -1721,20 +1721,22 @@ static void xs_tcp_connect_worker4(struct work_struct *work) dprintk("RPC: %p connect status %d connected %d sock state %d\n", xprt, -status, xprt_connected(xprt), sock->sk->sk_state); - if (status < 0) { - switch (status) { - case -EINPROGRESS: - case -EALREADY: - goto out_clear; - case -ECONNREFUSED: - case -ECONNRESET: - /* retry with existing socket, after a delay */ - break; - default: - /* get rid of existing socket, and retry */ - xs_tcp_shutdown(xprt); - } + switch (status) { + case 0: + case -EINPROGRESS: + case -EALREADY: + goto out_clear; + case -ECONNREFUSED: + case -ECONNRESET: + /* retry with existing socket, after a delay */ + break; + default: + /* get rid of existing socket, and retry */ + xs_tcp_shutdown(xprt); + printk("%s: connect returned unhandled error %d\n", + __func__, status); } + status = -EAGAIN; out: xprt_wake_pending_tasks(xprt, status); out_clear: @@ -1780,20 +1782,22 @@ static void xs_tcp_connect_worker6(struct work_struct *work) status = xs_tcp_finish_connecting(xprt, sock); dprintk("RPC: %p connect status %d connected %d sock state %d\n", xprt, -status, xprt_connected(xprt), sock->sk->sk_state); - if (status < 0) { - switch (status) { - case -EINPROGRESS: - case -EALREADY: - goto out_clear; - case -ECONNREFUSED: - case -ECONNRESET: - /* retry with existing socket, after a delay */ - break; - default: - /* get rid of existing socket, and retry */ - xs_tcp_shutdown(xprt); - } + switch (status) { + case 0: + case -EINPROGRESS: + case -EALREADY: + goto out_clear; + case -ECONNREFUSED: + case -ECONNRESET: + /* retry with existing socket, after a delay */ + break; + default: + /* get rid of existing socket, and retry */ + xs_tcp_shutdown(xprt); + printk("%s: connect returned unhandled error %d\n", + __func__, status); } + status = -EAGAIN; out: xprt_wake_pending_tasks(xprt, status); out_clear: From 8a2cec295f4499cc9d4452e9b02d4ed071bb42d3 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 11 Mar 2009 14:38:01 -0400 Subject: [PATCH 049/478] SUNRPC: Delay, then retry on connection errors. Enforce the comment in xs_tcp_connect_worker4/xs_tcp_connect_worker6 that we should delay, then retry on certain connection errors. Signed-off-by: Trond Myklebust --- net/sunrpc/xprtsock.c | 34 ++++++++++++++++------------------ 1 file changed, 16 insertions(+), 18 deletions(-) diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 8e58b0b5460b..9f3e615d3e09 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -1722,20 +1722,19 @@ static void xs_tcp_connect_worker4(struct work_struct *work) xprt, -status, xprt_connected(xprt), sock->sk->sk_state); switch (status) { + case -ECONNREFUSED: + case -ECONNRESET: + case -ENETUNREACH: + /* retry with existing socket, after a delay */ case 0: case -EINPROGRESS: case -EALREADY: goto out_clear; - case -ECONNREFUSED: - case -ECONNRESET: - /* retry with existing socket, after a delay */ - break; - default: - /* get rid of existing socket, and retry */ - xs_tcp_shutdown(xprt); - printk("%s: connect returned unhandled error %d\n", - __func__, status); } + /* get rid of existing socket, and retry */ + xs_tcp_shutdown(xprt); + printk("%s: connect returned unhandled error %d\n", + __func__, status); status = -EAGAIN; out: xprt_wake_pending_tasks(xprt, status); @@ -1783,20 +1782,19 @@ static void xs_tcp_connect_worker6(struct work_struct *work) dprintk("RPC: %p connect status %d connected %d sock state %d\n", xprt, -status, xprt_connected(xprt), sock->sk->sk_state); switch (status) { + case -ECONNREFUSED: + case -ECONNRESET: + case -ENETUNREACH: + /* retry with existing socket, after a delay */ case 0: case -EINPROGRESS: case -EALREADY: goto out_clear; - case -ECONNREFUSED: - case -ECONNRESET: - /* retry with existing socket, after a delay */ - break; - default: - /* get rid of existing socket, and retry */ - xs_tcp_shutdown(xprt); - printk("%s: connect returned unhandled error %d\n", - __func__, status); } + /* get rid of existing socket, and retry */ + xs_tcp_shutdown(xprt); + printk("%s: connect returned unhandled error %d\n", + __func__, status); status = -EAGAIN; out: xprt_wake_pending_tasks(xprt, status); From 5e3771ce2d6a69e10fcc870cdf226d121d868491 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 11 Mar 2009 14:38:01 -0400 Subject: [PATCH 050/478] SUNRPC: Ensure that xs_nospace return values are propagated If xs_nospace() finds that the socket has disconnected, it attempts to return ENOTCONN, however that value is then squashed by the callers. Signed-off-by: Trond Myklebust --- net/sunrpc/xprtsock.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 9f3e615d3e09..2e070679ab4a 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -521,11 +521,12 @@ static void xs_nospace_callback(struct rpc_task *task) * @task: task to put to sleep * */ -static void xs_nospace(struct rpc_task *task) +static int xs_nospace(struct rpc_task *task) { struct rpc_rqst *req = task->tk_rqstp; struct rpc_xprt *xprt = req->rq_xprt; struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); + int ret = 0; dprintk("RPC: %5u xmit incomplete (%u left of %u)\n", task->tk_pid, req->rq_slen - req->rq_bytes_sent, @@ -537,6 +538,7 @@ static void xs_nospace(struct rpc_task *task) /* Don't race with disconnect */ if (xprt_connected(xprt)) { if (test_bit(SOCK_ASYNC_NOSPACE, &transport->sock->flags)) { + ret = -EAGAIN; /* * Notify TCP that we're limited by the application * window size @@ -548,10 +550,11 @@ static void xs_nospace(struct rpc_task *task) } } else { clear_bit(SOCK_ASYNC_NOSPACE, &transport->sock->flags); - task->tk_status = -ENOTCONN; + ret = -ENOTCONN; } spin_unlock_bh(&xprt->transport_lock); + return ret; } /** @@ -603,7 +606,7 @@ static int xs_udp_send_request(struct rpc_task *task) /* Should we call xs_close() here? */ break; case -EAGAIN: - xs_nospace(task); + status = xs_nospace(task); break; default: dprintk("RPC: sendmsg returned unrecognized error %d\n", @@ -706,7 +709,7 @@ static int xs_tcp_send_request(struct rpc_task *task) /* Should we call xs_close() here? */ break; case -EAGAIN: - xs_nospace(task); + status = xs_nospace(task); break; default: dprintk("RPC: sendmsg returned unrecognized error %d\n", From d4e3676dba299e24acb66de6da2a0bb44d0d2414 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Mon, 16 Mar 2009 14:12:40 +1030 Subject: [PATCH 051/478] cpumask: remove the now-obsoleted pcibus_to_cpumask(): ia64 Impact: reduce stack usage for large NR_CPUS cpumask_of_pcibus() is the new version. Signed-off-by: Rusty Russell --- arch/ia64/include/asm/topology.h | 5 ----- 1 file changed, 5 deletions(-) diff --git a/arch/ia64/include/asm/topology.h b/arch/ia64/include/asm/topology.h index 32f3af1641c5..6fbbf8709e1f 100644 --- a/arch/ia64/include/asm/topology.h +++ b/arch/ia64/include/asm/topology.h @@ -117,11 +117,6 @@ void build_cpu_to_node_map(void); extern void arch_fix_phys_package_id(int num, u32 slot); -#define pcibus_to_cpumask(bus) (pcibus_to_node(bus) == -1 ? \ - CPU_MASK_ALL : \ - node_to_cpumask(pcibus_to_node(bus)) \ - ) - #define cpumask_of_pcibus(bus) (pcibus_to_node(bus) == -1 ? \ cpu_all_mask : \ cpumask_of_node(pcibus_to_node(bus))) From 40fe697a1759b85f5e06c490599f4f7b03de3be7 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Mon, 16 Mar 2009 14:12:41 +1030 Subject: [PATCH 052/478] cpumask: arch_send_call_function_ipi_mask: ia64 We're weaning the core code off handing cpumask's around on-stack. This introduces arch_send_call_function_ipi_mask(). We also take the chance to wean send_IPI_mask off the obsolescent for_each_cpu_mask(): making it take the pointer seemed the most natural way. Signed-off-by: Rusty Russell --- arch/ia64/include/asm/smp.h | 3 ++- arch/ia64/kernel/smp.c | 6 +++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/arch/ia64/include/asm/smp.h b/arch/ia64/include/asm/smp.h index 21c402365d0e..598408336251 100644 --- a/arch/ia64/include/asm/smp.h +++ b/arch/ia64/include/asm/smp.h @@ -126,7 +126,8 @@ extern void identify_siblings (struct cpuinfo_ia64 *); extern int is_multithreading_enabled(void); extern void arch_send_call_function_single_ipi(int cpu); -extern void arch_send_call_function_ipi(cpumask_t mask); +extern void arch_send_call_function_ipi_mask(const struct cpumask *mask); +#define arch_send_call_function_ipi_mask arch_send_call_function_ipi_mask #else /* CONFIG_SMP */ diff --git a/arch/ia64/kernel/smp.c b/arch/ia64/kernel/smp.c index da8f020d82c1..2ea4199d9c57 100644 --- a/arch/ia64/kernel/smp.c +++ b/arch/ia64/kernel/smp.c @@ -166,11 +166,11 @@ send_IPI_allbutself (int op) * Called with preemption disabled. */ static inline void -send_IPI_mask(cpumask_t mask, int op) +send_IPI_mask(const struct cpumask *mask, int op) { unsigned int cpu; - for_each_cpu_mask(cpu, mask) { + for_each_cpu(cpu, mask) { send_IPI_single(cpu, op); } } @@ -316,7 +316,7 @@ void arch_send_call_function_single_ipi(int cpu) send_IPI_single(cpu, IPI_CALL_FUNC_SINGLE); } -void arch_send_call_function_ipi(cpumask_t mask) +void arch_send_call_function_ipi_mask(const struct cpumask *mask) { send_IPI_mask(mask, IPI_CALL_FUNC); } From 5dd3c9949a3e92ea7fd8c75d888031f7aff1f1d0 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Mon, 16 Mar 2009 14:12:42 +1030 Subject: [PATCH 053/478] cpumask: prepare for iterators to only go to nr_cpu_ids/nr_cpumask_bits.: ia64 Impact: cleanup, futureproof In fact, all cpumask ops will only be valid (in general) for bit numbers < nr_cpu_ids. So use that instead of NR_CPUS in various places. This is always safe: no cpu number can be >= nr_cpu_ids, and nr_cpu_ids is initialized to NR_CPUS at boot. Signed-off-by: Rusty Russell Signed-off-by: Mike Travis Acked-by: Ingo Molnar --- arch/ia64/kernel/acpi.c | 6 +++--- arch/ia64/kernel/mca.c | 6 +++--- arch/ia64/kernel/perfmon.c | 4 ++-- arch/ia64/kernel/salinfo.c | 6 +++--- arch/ia64/kernel/setup.c | 4 ++-- arch/ia64/sn/kernel/setup.c | 2 +- arch/ia64/sn/kernel/sn2/sn2_smp.c | 6 +++--- arch/ia64/sn/kernel/sn2/sn_hwperf.c | 2 +- 8 files changed, 18 insertions(+), 18 deletions(-) diff --git a/arch/ia64/kernel/acpi.c b/arch/ia64/kernel/acpi.c index d541671caf4a..c4f41aca107f 100644 --- a/arch/ia64/kernel/acpi.c +++ b/arch/ia64/kernel/acpi.c @@ -924,9 +924,9 @@ int acpi_map_lsapic(acpi_handle handle, int *pcpu) buffer.length = ACPI_ALLOCATE_BUFFER; buffer.pointer = NULL; - cpus_complement(tmp_map, cpu_present_map); - cpu = first_cpu(tmp_map); - if (cpu >= NR_CPUS) + cpumask_complement(&tmp_map, cpu_present_mask); + cpu = cpumask_first(&tmp_map); + if (cpu >= nr_cpu_ids) return -EINVAL; acpi_map_cpu2node(handle, cpu, physid); diff --git a/arch/ia64/kernel/mca.c b/arch/ia64/kernel/mca.c index bab1de2d2f6a..8f33a8840422 100644 --- a/arch/ia64/kernel/mca.c +++ b/arch/ia64/kernel/mca.c @@ -1456,9 +1456,9 @@ ia64_mca_cmc_int_caller(int cmc_irq, void *arg) ia64_mca_cmc_int_handler(cmc_irq, arg); - for (++cpuid ; cpuid < NR_CPUS && !cpu_online(cpuid) ; cpuid++); + cpuid = cpumask_next(cpuid+1, cpu_online_mask); - if (cpuid < NR_CPUS) { + if (cpuid < nr_cpu_ids) { platform_send_ipi(cpuid, IA64_CMCP_VECTOR, IA64_IPI_DM_INT, 0); } else { /* If no log record, switch out of polling mode */ @@ -1525,7 +1525,7 @@ ia64_mca_cpe_int_caller(int cpe_irq, void *arg) ia64_mca_cpe_int_handler(cpe_irq, arg); - for (++cpuid ; cpuid < NR_CPUS && !cpu_online(cpuid) ; cpuid++); + cpuid = cpumask_next(cpuid+1, cpu_online_mask); if (cpuid < NR_CPUS) { platform_send_ipi(cpuid, IA64_CPEP_VECTOR, IA64_IPI_DM_INT, 0); diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c index 0e499757309b..6fc1e638f0eb 100644 --- a/arch/ia64/kernel/perfmon.c +++ b/arch/ia64/kernel/perfmon.c @@ -5603,7 +5603,7 @@ pfm_interrupt_handler(int irq, void *arg) * /proc/perfmon interface, for debug only */ -#define PFM_PROC_SHOW_HEADER ((void *)NR_CPUS+1) +#define PFM_PROC_SHOW_HEADER ((void *)nr_cpu_ids+1) static void * pfm_proc_start(struct seq_file *m, loff_t *pos) @@ -5612,7 +5612,7 @@ pfm_proc_start(struct seq_file *m, loff_t *pos) return PFM_PROC_SHOW_HEADER; } - while (*pos <= NR_CPUS) { + while (*pos <= nr_cpu_ids) { if (cpu_online(*pos - 1)) { return (void *)*pos; } diff --git a/arch/ia64/kernel/salinfo.c b/arch/ia64/kernel/salinfo.c index ecb9eb78d687..7053c55b7649 100644 --- a/arch/ia64/kernel/salinfo.c +++ b/arch/ia64/kernel/salinfo.c @@ -317,7 +317,7 @@ retry: } n = data->cpu_check; - for (i = 0; i < NR_CPUS; i++) { + for (i = 0; i < nr_cpu_ids; i++) { if (cpu_isset(n, data->cpu_event)) { if (!cpu_online(n)) { cpu_clear(n, data->cpu_event); @@ -326,7 +326,7 @@ retry: cpu = n; break; } - if (++n == NR_CPUS) + if (++n == nr_cpu_ids) n = 0; } @@ -337,7 +337,7 @@ retry: /* for next read, start checking at next CPU */ data->cpu_check = cpu; - if (++data->cpu_check == NR_CPUS) + if (++data->cpu_check == nr_cpu_ids) data->cpu_check = 0; snprintf(cmd, sizeof(cmd), "read %d\n", cpu); diff --git a/arch/ia64/kernel/setup.c b/arch/ia64/kernel/setup.c index 865af27c7737..ae9ec3dc76b8 100644 --- a/arch/ia64/kernel/setup.c +++ b/arch/ia64/kernel/setup.c @@ -730,10 +730,10 @@ static void * c_start (struct seq_file *m, loff_t *pos) { #ifdef CONFIG_SMP - while (*pos < NR_CPUS && !cpu_isset(*pos, cpu_online_map)) + while (*pos < nr_cpu_ids && !cpu_online(*pos)) ++*pos; #endif - return *pos < NR_CPUS ? cpu_data(*pos) : NULL; + return *pos < nr_cpu_ids ? cpu_data(*pos) : NULL; } static void * diff --git a/arch/ia64/sn/kernel/setup.c b/arch/ia64/sn/kernel/setup.c index 02c5b8a9fb60..12097776afc0 100644 --- a/arch/ia64/sn/kernel/setup.c +++ b/arch/ia64/sn/kernel/setup.c @@ -750,7 +750,7 @@ nasid_slice_to_cpuid(int nasid, int slice) { long cpu; - for (cpu = 0; cpu < NR_CPUS; cpu++) + for (cpu = 0; cpu < nr_cpu_ids; cpu++) if (cpuid_to_nasid(cpu) == nasid && cpuid_to_slice(cpu) == slice) return cpu; diff --git a/arch/ia64/sn/kernel/sn2/sn2_smp.c b/arch/ia64/sn/kernel/sn2/sn2_smp.c index e585f9a2afb9..209e1eb467da 100644 --- a/arch/ia64/sn/kernel/sn2/sn2_smp.c +++ b/arch/ia64/sn/kernel/sn2/sn2_smp.c @@ -461,7 +461,7 @@ bool sn_cpu_disable_allowed(int cpu) static void *sn2_ptc_seq_start(struct seq_file *file, loff_t * offset) { - if (*offset < NR_CPUS) + if (*offset < nr_cpu_ids) return offset; return NULL; } @@ -469,7 +469,7 @@ static void *sn2_ptc_seq_start(struct seq_file *file, loff_t * offset) static void *sn2_ptc_seq_next(struct seq_file *file, void *data, loff_t * offset) { (*offset)++; - if (*offset < NR_CPUS) + if (*offset < nr_cpu_ids) return offset; return NULL; } @@ -491,7 +491,7 @@ static int sn2_ptc_seq_show(struct seq_file *file, void *data) seq_printf(file, "# ptctest %d, flushopt %d\n", sn2_ptctest, sn2_flush_opt); } - if (cpu < NR_CPUS && cpu_online(cpu)) { + if (cpu < nr_cpu_ids && cpu_online(cpu)) { stat = &per_cpu(ptcstats, cpu); seq_printf(file, "cpu %d %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld\n", cpu, stat->ptc_l, stat->change_rid, stat->shub_ptc_flushes, stat->nodes_flushed, diff --git a/arch/ia64/sn/kernel/sn2/sn_hwperf.c b/arch/ia64/sn/kernel/sn2/sn_hwperf.c index be339477f906..45f3c2390428 100644 --- a/arch/ia64/sn/kernel/sn2/sn_hwperf.c +++ b/arch/ia64/sn/kernel/sn2/sn_hwperf.c @@ -612,7 +612,7 @@ static int sn_hwperf_op_cpu(struct sn_hwperf_op_info *op_info) op_info->a->arg &= SN_HWPERF_ARG_OBJID_MASK; if (cpu != SN_HWPERF_ARG_ANY_CPU) { - if (cpu >= NR_CPUS || !cpu_online(cpu)) { + if (cpu >= nr_cpu_ids || !cpu_online(cpu)) { r = -EINVAL; goto out; } From 2af51a3f817a22661fcb52da7c96d078a699f40f Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Mon, 16 Mar 2009 14:12:43 +1030 Subject: [PATCH 054/478] cpumask: Use accessors code.: ia64 Impact: use new API Use the accessors rather than frobbing bits directly. Most of this is in arch code I haven't even compiled, but is straightforward. Signed-off-by: Rusty Russell Signed-off-by: Mike Travis --- arch/ia64/kernel/acpi.c | 2 +- arch/ia64/kernel/smpboot.c | 17 +++++++---------- 2 files changed, 8 insertions(+), 11 deletions(-) diff --git a/arch/ia64/kernel/acpi.c b/arch/ia64/kernel/acpi.c index c4f41aca107f..2f19d91b0b88 100644 --- a/arch/ia64/kernel/acpi.c +++ b/arch/ia64/kernel/acpi.c @@ -886,7 +886,7 @@ __init void prefill_possible_map(void) possible, max((possible - available_cpus), 0)); for (i = 0; i < possible; i++) - cpu_set(i, cpu_possible_map); + set_cpu_possible(i, true); } int acpi_map_lsapic(acpi_handle handle, int *pcpu) diff --git a/arch/ia64/kernel/smpboot.c b/arch/ia64/kernel/smpboot.c index 52290547c85b..7700e23034bb 100644 --- a/arch/ia64/kernel/smpboot.c +++ b/arch/ia64/kernel/smpboot.c @@ -581,14 +581,14 @@ smp_build_cpu_map (void) ia64_cpu_to_sapicid[0] = boot_cpu_id; cpus_clear(cpu_present_map); - cpu_set(0, cpu_present_map); - cpu_set(0, cpu_possible_map); + set_cpu_present(0, true); + set_cpu_possible(0, true); for (cpu = 1, i = 0; i < smp_boot_data.cpu_count; i++) { sapicid = smp_boot_data.cpu_phys_id[i]; if (sapicid == boot_cpu_id) continue; - cpu_set(cpu, cpu_present_map); - cpu_set(cpu, cpu_possible_map); + set_cpu_present(cpu, true); + set_cpu_possible(cpu, true); ia64_cpu_to_sapicid[cpu] = sapicid; cpu++; } @@ -626,12 +626,9 @@ smp_prepare_cpus (unsigned int max_cpus) */ if (!max_cpus) { printk(KERN_INFO "SMP mode deactivated.\n"); - cpus_clear(cpu_online_map); - cpus_clear(cpu_present_map); - cpus_clear(cpu_possible_map); - cpu_set(0, cpu_online_map); - cpu_set(0, cpu_present_map); - cpu_set(0, cpu_possible_map); + init_cpu_online(cpumask_of(0)); + init_cpu_present(cpumask_of(0)); + init_cpu_possible(cpumask_of(0)); return; } } From 5d8c39f68e1dc78c1a958e28bc685a5bac125b21 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Mon, 16 Mar 2009 14:12:48 +1030 Subject: [PATCH 055/478] cpumask: use mm_cpumask() wrapper: ia64 Makes code futureproof against the impending change to mm->cpu_vm_mask. It's also a chance to use the new cpumask_ ops which take a pointer (the older ones are deprecated, but there's no hurry for arch code). Signed-off-by: Rusty Russell --- arch/ia64/include/asm/mmu_context.h | 6 +++--- arch/ia64/mm/tlb.c | 2 +- arch/ia64/sn/kernel/sn2/sn2_smp.c | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/ia64/include/asm/mmu_context.h b/arch/ia64/include/asm/mmu_context.h index 040bc87db930..7f2a456603cb 100644 --- a/arch/ia64/include/asm/mmu_context.h +++ b/arch/ia64/include/asm/mmu_context.h @@ -87,7 +87,7 @@ get_mmu_context (struct mm_struct *mm) /* re-check, now that we've got the lock: */ context = mm->context; if (context == 0) { - cpus_clear(mm->cpu_vm_mask); + cpumask_clear(mm_cpumask(mm)); if (ia64_ctx.next >= ia64_ctx.limit) { ia64_ctx.next = find_next_zero_bit(ia64_ctx.bitmap, ia64_ctx.max_ctx, ia64_ctx.next); @@ -166,8 +166,8 @@ activate_context (struct mm_struct *mm) do { context = get_mmu_context(mm); - if (!cpu_isset(smp_processor_id(), mm->cpu_vm_mask)) - cpu_set(smp_processor_id(), mm->cpu_vm_mask); + if (!cpumask_test_cpu(smp_processor_id(), mm_cpumask(mm))) + cpumask_set_cpu(smp_processor_id(), mm_cpumask(mm)); reload_context(context); /* * in the unlikely event of a TLB-flush by another thread, diff --git a/arch/ia64/mm/tlb.c b/arch/ia64/mm/tlb.c index bd9818a36b47..b9f3d7bbb338 100644 --- a/arch/ia64/mm/tlb.c +++ b/arch/ia64/mm/tlb.c @@ -309,7 +309,7 @@ flush_tlb_range (struct vm_area_struct *vma, unsigned long start, preempt_disable(); #ifdef CONFIG_SMP - if (mm != current->active_mm || cpus_weight(mm->cpu_vm_mask) != 1) { + if (mm != current->active_mm || cpumask_weight(mm_cpumask(mm)) != 1) { platform_global_tlb_purge(mm, start, end, nbits); preempt_enable(); return; diff --git a/arch/ia64/sn/kernel/sn2/sn2_smp.c b/arch/ia64/sn/kernel/sn2/sn2_smp.c index 209e1eb467da..3c2f242d90cb 100644 --- a/arch/ia64/sn/kernel/sn2/sn2_smp.c +++ b/arch/ia64/sn/kernel/sn2/sn2_smp.c @@ -133,7 +133,7 @@ sn2_ipi_flush_all_tlb(struct mm_struct *mm) unsigned long itc; itc = ia64_get_itc(); - smp_flush_tlb_cpumask(mm->cpu_vm_mask); + smp_flush_tlb_cpumask(*mm_cpumask(mm)); itc = ia64_get_itc() - itc; __get_cpu_var(ptcstats).shub_ipi_flushes_itc_clocks += itc; __get_cpu_var(ptcstats).shub_ipi_flushes++; @@ -182,7 +182,7 @@ sn2_global_tlb_purge(struct mm_struct *mm, unsigned long start, nodes_clear(nodes_flushed); i = 0; - for_each_cpu_mask(cpu, mm->cpu_vm_mask) { + for_each_cpu(cpu, mm_cpumask(mm)) { cnode = cpu_to_node(cpu); node_set(cnode, nodes_flushed); lcpu = cpu; From afd4672dc7610b7feef5190168aa917cc2e417e4 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 16 Mar 2009 23:12:23 -0400 Subject: [PATCH 056/478] ext4: Add auto_da_alloc mount option Add a mount option which allows the user to disable automatic allocation of blocks whose allocation by delayed allocation when the file was originally truncated or when the file is renamed over an existing file. This feature is intended to save users from the effects of naive application writers, but it reduces the effectiveness of the delayed allocation code. This mount option disables this safety feature, which may be desirable for prodcutions systems where the risk of unclean shutdowns or unexpected system crashes is low. Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 2 +- fs/ext4/inode.c | 2 +- fs/ext4/namei.c | 3 ++- fs/ext4/super.c | 25 +++++++++++++------------ 4 files changed, 17 insertions(+), 15 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 46aaaa2ed4c5..a004699e7296 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -549,7 +549,7 @@ do { \ #define EXT4_MOUNT_NO_UID32 0x02000 /* Disable 32-bit UIDs */ #define EXT4_MOUNT_XATTR_USER 0x04000 /* Extended user attributes */ #define EXT4_MOUNT_POSIX_ACL 0x08000 /* POSIX Access Control Lists */ -#define EXT4_MOUNT_RESERVATION 0x10000 /* Preallocation */ +#define EXT4_MOUNT_NO_AUTO_DA_ALLOC 0x10000 /* No auto delalloc mapping */ #define EXT4_MOUNT_BARRIER 0x20000 /* Use block barriers */ #define EXT4_MOUNT_NOBH 0x40000 /* No bufferheads */ #define EXT4_MOUNT_QUOTA 0x80000 /* Some quota option set */ diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index d3118d1acc39..bed4a0abd0d1 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3908,7 +3908,7 @@ void ext4_truncate(struct inode *inode) if (!ext4_can_truncate(inode)) return; - if (inode->i_size == 0) + if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC)) ei->i_state |= EXT4_STATE_DA_ALLOC_CLOSE; if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) { diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index eb20246c8965..22098e1cd085 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -2497,7 +2497,8 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, ext4_mark_inode_dirty(handle, new_inode); if (!new_inode->i_nlink) ext4_orphan_add(handle, new_inode); - force_da_alloc = 1; + if (!test_opt(new_dir->i_sb, NO_AUTO_DA_ALLOC)) + force_da_alloc = 1; } retval = 0; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index b9aefceb41e7..45d07cf9df34 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -816,8 +816,6 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs) if (!test_opt(sb, POSIX_ACL) && (def_mount_opts & EXT4_DEFM_ACL)) seq_puts(seq, ",noacl"); #endif - if (!test_opt(sb, RESERVATION)) - seq_puts(seq, ",noreservation"); if (sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ) { seq_printf(seq, ",commit=%u", (unsigned) (sbi->s_commit_interval / HZ)); @@ -868,6 +866,9 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs) if (test_opt(sb, DATA_ERR_ABORT)) seq_puts(seq, ",data_err=abort"); + if (test_opt(sb, NO_AUTO_DA_ALLOC)) + seq_puts(seq, ",auto_da_alloc=0"); + ext4_show_quota_options(seq, sb); return 0; } @@ -1017,7 +1018,7 @@ enum { Opt_resgid, Opt_resuid, Opt_sb, Opt_err_cont, Opt_err_panic, Opt_err_ro, Opt_nouid32, Opt_debug, Opt_oldalloc, Opt_orlov, Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl, - Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh, Opt_bh, + Opt_auto_da_alloc, Opt_noload, Opt_nobh, Opt_bh, Opt_commit, Opt_min_batch_time, Opt_max_batch_time, Opt_journal_update, Opt_journal_dev, Opt_journal_checksum, Opt_journal_async_commit, @@ -1052,8 +1053,6 @@ static const match_table_t tokens = { {Opt_nouser_xattr, "nouser_xattr"}, {Opt_acl, "acl"}, {Opt_noacl, "noacl"}, - {Opt_reservation, "reservation"}, - {Opt_noreservation, "noreservation"}, {Opt_noload, "noload"}, {Opt_nobh, "nobh"}, {Opt_bh, "bh"}, @@ -1088,6 +1087,7 @@ static const match_table_t tokens = { {Opt_nodelalloc, "nodelalloc"}, {Opt_inode_readahead_blks, "inode_readahead_blks=%u"}, {Opt_journal_ioprio, "journal_ioprio=%u"}, + {Opt_auto_da_alloc, "auto_da_alloc=%u"}, {Opt_err, NULL}, }; @@ -1220,12 +1220,6 @@ static int parse_options(char *options, struct super_block *sb, "not supported\n"); break; #endif - case Opt_reservation: - set_opt(sbi->s_mount_opt, RESERVATION); - break; - case Opt_noreservation: - clear_opt(sbi->s_mount_opt, RESERVATION); - break; case Opt_journal_update: /* @@@ FIXME */ /* Eventually we will want to be able to create @@ -1491,6 +1485,14 @@ set_qf_format: *journal_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, option); break; + case Opt_auto_da_alloc: + if (match_int(&args[0], &option)) + return 0; + if (option) + clear_opt(sbi->s_mount_opt, NO_AUTO_DA_ALLOC); + else + set_opt(sbi->s_mount_opt,NO_AUTO_DA_ALLOC); + break; default: printk(KERN_ERR "EXT4-fs: Unrecognized mount option \"%s\" " @@ -2306,7 +2308,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME; sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME; - set_opt(sbi->s_mount_opt, RESERVATION); set_opt(sbi->s_mount_opt, BARRIER); /* From 7d1e8255cf959fba7ee2317550dfde39f0b936ae Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 11 Mar 2009 14:38:03 -0400 Subject: [PATCH 057/478] SUNRPC: Add the equivalent of the linger and linger2 timeouts to RPC sockets This fixes a regression against FreeBSD servers as reported by Tomas Kasparek. Apparently when using RPC over a TCP socket, the FreeBSD servers don't ever react to the client closing the socket, and so commit e06799f958bf7f9f8fae15f0c6f519953fb0257c (SUNRPC: Use shutdown() instead of close() when disconnecting a TCP socket) causes the setup to hang forever whenever the client attempts to close and then reconnect. We break the deadlock by adding a 'linger2' style timeout to the socket, after which, the client will abort the connection using a TCP 'RST'. The default timeout is set to 15 seconds. A subsequent patch will put it under user control by means of a systctl. Signed-off-by: Trond Myklebust --- include/linux/sunrpc/xprt.h | 1 + net/sunrpc/xprtsock.c | 98 ++++++++++++++++++++++++++++++------- 2 files changed, 82 insertions(+), 17 deletions(-) diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index 2b0d960603b9..1758d9f5b5c3 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -260,6 +260,7 @@ void xprt_conditional_disconnect(struct rpc_xprt *xprt, unsigned int cookie); #define XPRT_BOUND (4) #define XPRT_BINDING (5) #define XPRT_CLOSING (6) +#define XPRT_CONNECTION_ABORT (7) static inline void xprt_set_connected(struct rpc_xprt *xprt) { diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 2e070679ab4a..b51f58b95c39 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -49,6 +49,8 @@ unsigned int xprt_tcp_slot_table_entries = RPC_DEF_SLOT_TABLE; unsigned int xprt_min_resvport = RPC_DEF_MIN_RESVPORT; unsigned int xprt_max_resvport = RPC_DEF_MAX_RESVPORT; +#define XS_TCP_LINGER_TO (15U * HZ) + /* * We can register our own files under /proc/sys/sunrpc by * calling register_sysctl_table() again. The files in that @@ -806,6 +808,7 @@ static void xs_close(struct rpc_xprt *xprt) xs_reset_transport(transport); smp_mb__before_clear_bit(); + clear_bit(XPRT_CONNECTION_ABORT, &xprt->state); clear_bit(XPRT_CLOSE_WAIT, &xprt->state); clear_bit(XPRT_CLOSING, &xprt->state); smp_mb__after_clear_bit(); @@ -1133,6 +1136,47 @@ out: read_unlock(&sk->sk_callback_lock); } +/* + * Do the equivalent of linger/linger2 handling for dealing with + * broken servers that don't close the socket in a timely + * fashion + */ +static void xs_tcp_schedule_linger_timeout(struct rpc_xprt *xprt, + unsigned long timeout) +{ + struct sock_xprt *transport; + + if (xprt_test_and_set_connecting(xprt)) + return; + set_bit(XPRT_CONNECTION_ABORT, &xprt->state); + transport = container_of(xprt, struct sock_xprt, xprt); + queue_delayed_work(rpciod_workqueue, &transport->connect_worker, + timeout); +} + +static void xs_tcp_cancel_linger_timeout(struct rpc_xprt *xprt) +{ + struct sock_xprt *transport; + + transport = container_of(xprt, struct sock_xprt, xprt); + + if (!test_bit(XPRT_CONNECTION_ABORT, &xprt->state) || + !cancel_delayed_work(&transport->connect_worker)) + return; + clear_bit(XPRT_CONNECTION_ABORT, &xprt->state); + xprt_clear_connecting(xprt); +} + +static void xs_sock_mark_closed(struct rpc_xprt *xprt) +{ + smp_mb__before_clear_bit(); + clear_bit(XPRT_CLOSE_WAIT, &xprt->state); + clear_bit(XPRT_CLOSING, &xprt->state); + smp_mb__after_clear_bit(); + /* Mark transport as closed and wake up all pending tasks */ + xprt_disconnect_done(xprt); +} + /** * xs_tcp_state_change - callback to handle TCP socket state changes * @sk: socket whose state has changed @@ -1178,6 +1222,7 @@ static void xs_tcp_state_change(struct sock *sk) clear_bit(XPRT_CONNECTED, &xprt->state); clear_bit(XPRT_CLOSE_WAIT, &xprt->state); smp_mb__after_clear_bit(); + xs_tcp_schedule_linger_timeout(xprt, XS_TCP_LINGER_TO); break; case TCP_CLOSE_WAIT: /* The server initiated a shutdown of the socket */ @@ -1194,17 +1239,14 @@ static void xs_tcp_state_change(struct sock *sk) break; case TCP_LAST_ACK: set_bit(XPRT_CLOSING, &xprt->state); + xs_tcp_schedule_linger_timeout(xprt, XS_TCP_LINGER_TO); smp_mb__before_clear_bit(); clear_bit(XPRT_CONNECTED, &xprt->state); smp_mb__after_clear_bit(); break; case TCP_CLOSE: - smp_mb__before_clear_bit(); - clear_bit(XPRT_CLOSE_WAIT, &xprt->state); - clear_bit(XPRT_CLOSING, &xprt->state); - smp_mb__after_clear_bit(); - /* Mark transport as closed and wake up all pending tasks */ - xprt_disconnect_done(xprt); + xs_tcp_cancel_linger_timeout(xprt); + xs_sock_mark_closed(xprt); } out: read_unlock(&sk->sk_callback_lock); @@ -1562,8 +1604,8 @@ static void xs_udp_connect_worker4(struct work_struct *work) xs_udp_finish_connecting(xprt, sock); status = 0; out: - xprt_wake_pending_tasks(xprt, status); xprt_clear_connecting(xprt); + xprt_wake_pending_tasks(xprt, status); } /** @@ -1604,8 +1646,8 @@ static void xs_udp_connect_worker6(struct work_struct *work) xs_udp_finish_connecting(xprt, sock); status = 0; out: - xprt_wake_pending_tasks(xprt, status); xprt_clear_connecting(xprt); + xprt_wake_pending_tasks(xprt, status); } /* @@ -1626,7 +1668,9 @@ static void xs_abort_connection(struct rpc_xprt *xprt, struct sock_xprt *transpo memset(&any, 0, sizeof(any)); any.sa_family = AF_UNSPEC; result = kernel_connect(transport->sock, &any, sizeof(any), 0); - if (result) + if (!result) + xs_sock_mark_closed(xprt); + else dprintk("RPC: AF_UNSPEC connect return code %d\n", result); } @@ -1702,6 +1746,7 @@ static void xs_tcp_connect_worker4(struct work_struct *work) goto out; if (!sock) { + clear_bit(XPRT_CONNECTION_ABORT, &xprt->state); /* start from scratch */ if ((err = sock_create_kern(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock)) < 0) { dprintk("RPC: can't create TCP transport socket (%d).\n", -err); @@ -1713,10 +1758,18 @@ static void xs_tcp_connect_worker4(struct work_struct *work) sock_release(sock); goto out; } - } else + } else { + int abort_and_exit; + + abort_and_exit = test_and_clear_bit(XPRT_CONNECTION_ABORT, + &xprt->state); /* "close" the socket, preserving the local port */ xs_tcp_reuse_connection(xprt, transport); + if (abort_and_exit) + goto out_eagain; + } + dprintk("RPC: worker connecting xprt %p to address: %s\n", xprt, xprt->address_strings[RPC_DISPLAY_ALL]); @@ -1732,17 +1785,18 @@ static void xs_tcp_connect_worker4(struct work_struct *work) case 0: case -EINPROGRESS: case -EALREADY: - goto out_clear; + xprt_clear_connecting(xprt); + return; } /* get rid of existing socket, and retry */ xs_tcp_shutdown(xprt); printk("%s: connect returned unhandled error %d\n", __func__, status); +out_eagain: status = -EAGAIN; out: - xprt_wake_pending_tasks(xprt, status); -out_clear: xprt_clear_connecting(xprt); + xprt_wake_pending_tasks(xprt, status); } /** @@ -1763,6 +1817,7 @@ static void xs_tcp_connect_worker6(struct work_struct *work) goto out; if (!sock) { + clear_bit(XPRT_CONNECTION_ABORT, &xprt->state); /* start from scratch */ if ((err = sock_create_kern(PF_INET6, SOCK_STREAM, IPPROTO_TCP, &sock)) < 0) { dprintk("RPC: can't create TCP transport socket (%d).\n", -err); @@ -1774,10 +1829,18 @@ static void xs_tcp_connect_worker6(struct work_struct *work) sock_release(sock); goto out; } - } else + } else { + int abort_and_exit; + + abort_and_exit = test_and_clear_bit(XPRT_CONNECTION_ABORT, + &xprt->state); /* "close" the socket, preserving the local port */ xs_tcp_reuse_connection(xprt, transport); + if (abort_and_exit) + goto out_eagain; + } + dprintk("RPC: worker connecting xprt %p to address: %s\n", xprt, xprt->address_strings[RPC_DISPLAY_ALL]); @@ -1792,17 +1855,18 @@ static void xs_tcp_connect_worker6(struct work_struct *work) case 0: case -EINPROGRESS: case -EALREADY: - goto out_clear; + xprt_clear_connecting(xprt); + return; } /* get rid of existing socket, and retry */ xs_tcp_shutdown(xprt); printk("%s: connect returned unhandled error %d\n", __func__, status); +out_eagain: status = -EAGAIN; out: - xprt_wake_pending_tasks(xprt, status); -out_clear: xprt_clear_connecting(xprt); + xprt_wake_pending_tasks(xprt, status); } /** From 25fe6142a57c720452c5e9ddbc1f32309c1e5c19 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 11 Mar 2009 14:38:03 -0400 Subject: [PATCH 058/478] SUNRPC: Add a sysctl to control the duration of the socket linger timeout Signed-off-by: Trond Myklebust --- net/sunrpc/xprtsock.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index b51f58b95c39..42222b4dd76d 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -50,6 +50,7 @@ unsigned int xprt_min_resvport = RPC_DEF_MIN_RESVPORT; unsigned int xprt_max_resvport = RPC_DEF_MAX_RESVPORT; #define XS_TCP_LINGER_TO (15U * HZ) +static unsigned int xs_tcp_fin_timeout __read_mostly = XS_TCP_LINGER_TO; /* * We can register our own files under /proc/sys/sunrpc by @@ -118,6 +119,14 @@ static ctl_table xs_tunables_table[] = { .extra1 = &xprt_min_resvport_limit, .extra2 = &xprt_max_resvport_limit }, + { + .procname = "tcp_fin_timeout", + .data = &xs_tcp_fin_timeout, + .maxlen = sizeof(xs_tcp_fin_timeout), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + .strategy = sysctl_jiffies + }, { .ctl_name = 0, }, @@ -1222,7 +1231,7 @@ static void xs_tcp_state_change(struct sock *sk) clear_bit(XPRT_CONNECTED, &xprt->state); clear_bit(XPRT_CLOSE_WAIT, &xprt->state); smp_mb__after_clear_bit(); - xs_tcp_schedule_linger_timeout(xprt, XS_TCP_LINGER_TO); + xs_tcp_schedule_linger_timeout(xprt, xs_tcp_fin_timeout); break; case TCP_CLOSE_WAIT: /* The server initiated a shutdown of the socket */ @@ -1239,7 +1248,7 @@ static void xs_tcp_state_change(struct sock *sk) break; case TCP_LAST_ACK: set_bit(XPRT_CLOSING, &xprt->state); - xs_tcp_schedule_linger_timeout(xprt, XS_TCP_LINGER_TO); + xs_tcp_schedule_linger_timeout(xprt, xs_tcp_fin_timeout); smp_mb__before_clear_bit(); clear_bit(XPRT_CONNECTED, &xprt->state); smp_mb__after_clear_bit(); From b61d59fffd3e5b6037c92b4c840605831de8a251 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 11 Mar 2009 14:38:04 -0400 Subject: [PATCH 059/478] =?UTF-8?q?SUNRPC:=C2=A0xs=5Ftcp=5Fconnect=5Fworke?= =?UTF-8?q?r{4,6}:=20merge=20common=20code?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Trond Myklebust --- net/sunrpc/xprtsock.c | 148 ++++++++++++++++++++++-------------------- 1 file changed, 76 insertions(+), 72 deletions(-) diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 42222b4dd76d..f05a56e597ef 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -1738,33 +1738,29 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock) } /** - * xs_tcp_connect_worker4 - connect a TCP socket to a remote endpoint - * @work: RPC transport to connect + * xs_tcp_setup_socket - create a TCP socket and connect to a remote endpoint + * @xprt: RPC transport to connect + * @transport: socket transport to connect + * @create_sock: function to create a socket of the correct type * * Invoked by a work queue tasklet. */ -static void xs_tcp_connect_worker4(struct work_struct *work) +static void xs_tcp_setup_socket(struct rpc_xprt *xprt, + struct sock_xprt *transport, + struct socket *(*create_sock)(struct rpc_xprt *, + struct sock_xprt *)) { - struct sock_xprt *transport = - container_of(work, struct sock_xprt, connect_worker.work); - struct rpc_xprt *xprt = &transport->xprt; struct socket *sock = transport->sock; - int err, status = -EIO; + int status = -EIO; if (xprt->shutdown) goto out; if (!sock) { clear_bit(XPRT_CONNECTION_ABORT, &xprt->state); - /* start from scratch */ - if ((err = sock_create_kern(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock)) < 0) { - dprintk("RPC: can't create TCP transport socket (%d).\n", -err); - goto out; - } - xs_reclassify_socket4(sock); - - if (xs_bind4(transport, sock) < 0) { - sock_release(sock); + sock = create_sock(xprt, transport); + if (IS_ERR(sock)) { + status = PTR_ERR(sock); goto out; } } else { @@ -1808,6 +1804,69 @@ out: xprt_wake_pending_tasks(xprt, status); } +static struct socket *xs_create_tcp_sock4(struct rpc_xprt *xprt, + struct sock_xprt *transport) +{ + struct socket *sock; + int err; + + /* start from scratch */ + err = sock_create_kern(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock); + if (err < 0) { + dprintk("RPC: can't create TCP transport socket (%d).\n", + -err); + goto out_err; + } + xs_reclassify_socket4(sock); + + if (xs_bind4(transport, sock) < 0) { + sock_release(sock); + goto out_err; + } + return sock; +out_err: + return ERR_PTR(-EIO); +} + +/** + * xs_tcp_connect_worker4 - connect a TCP socket to a remote endpoint + * @work: RPC transport to connect + * + * Invoked by a work queue tasklet. + */ +static void xs_tcp_connect_worker4(struct work_struct *work) +{ + struct sock_xprt *transport = + container_of(work, struct sock_xprt, connect_worker.work); + struct rpc_xprt *xprt = &transport->xprt; + + xs_tcp_setup_socket(xprt, transport, xs_create_tcp_sock4); +} + +static struct socket *xs_create_tcp_sock6(struct rpc_xprt *xprt, + struct sock_xprt *transport) +{ + struct socket *sock; + int err; + + /* start from scratch */ + err = sock_create_kern(PF_INET6, SOCK_STREAM, IPPROTO_TCP, &sock); + if (err < 0) { + dprintk("RPC: can't create TCP transport socket (%d).\n", + -err); + goto out_err; + } + xs_reclassify_socket6(sock); + + if (xs_bind6(transport, sock) < 0) { + sock_release(sock); + goto out_err; + } + return sock; +out_err: + return ERR_PTR(-EIO); +} + /** * xs_tcp_connect_worker6 - connect a TCP socket to a remote endpoint * @work: RPC transport to connect @@ -1819,63 +1878,8 @@ static void xs_tcp_connect_worker6(struct work_struct *work) struct sock_xprt *transport = container_of(work, struct sock_xprt, connect_worker.work); struct rpc_xprt *xprt = &transport->xprt; - struct socket *sock = transport->sock; - int err, status = -EIO; - if (xprt->shutdown) - goto out; - - if (!sock) { - clear_bit(XPRT_CONNECTION_ABORT, &xprt->state); - /* start from scratch */ - if ((err = sock_create_kern(PF_INET6, SOCK_STREAM, IPPROTO_TCP, &sock)) < 0) { - dprintk("RPC: can't create TCP transport socket (%d).\n", -err); - goto out; - } - xs_reclassify_socket6(sock); - - if (xs_bind6(transport, sock) < 0) { - sock_release(sock); - goto out; - } - } else { - int abort_and_exit; - - abort_and_exit = test_and_clear_bit(XPRT_CONNECTION_ABORT, - &xprt->state); - /* "close" the socket, preserving the local port */ - xs_tcp_reuse_connection(xprt, transport); - - if (abort_and_exit) - goto out_eagain; - } - - dprintk("RPC: worker connecting xprt %p to address: %s\n", - xprt, xprt->address_strings[RPC_DISPLAY_ALL]); - - status = xs_tcp_finish_connecting(xprt, sock); - dprintk("RPC: %p connect status %d connected %d sock state %d\n", - xprt, -status, xprt_connected(xprt), sock->sk->sk_state); - switch (status) { - case -ECONNREFUSED: - case -ECONNRESET: - case -ENETUNREACH: - /* retry with existing socket, after a delay */ - case 0: - case -EINPROGRESS: - case -EALREADY: - xprt_clear_connecting(xprt); - return; - } - /* get rid of existing socket, and retry */ - xs_tcp_shutdown(xprt); - printk("%s: connect returned unhandled error %d\n", - __func__, status); -out_eagain: - status = -EAGAIN; -out: - xprt_clear_connecting(xprt); - xprt_wake_pending_tasks(xprt, status); + xs_tcp_setup_socket(xprt, transport, xs_create_tcp_sock6); } /** From 55420c24a0d4d1fce70ca713f84aa00b6b74a70e Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 11 Mar 2009 15:29:24 -0400 Subject: [PATCH 060/478] SUNRPC: Ensure we close the socket on EPIPE errors too... As long as one task is holding the socket lock, then calls to xprt_force_disconnect(xprt) will not succeed in shutting down the socket. In particular, this would mean that a server initiated shutdown will not succeed until the lock is relinquished. In order to avoid the deadlock, we should ensure that xs_tcp_send_request() closes the socket on EPIPE errors too. Signed-off-by: Trond Myklebust --- net/sunrpc/xprtsock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index f05a56e597ef..fbc8725c20cb 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -726,10 +726,10 @@ static int xs_tcp_send_request(struct rpc_task *task) dprintk("RPC: sendmsg returned unrecognized error %d\n", -status); case -ECONNRESET: + case -EPIPE: xs_tcp_shutdown(xprt); case -ECONNREFUSED: case -ENOTCONN: - case -EPIPE: clear_bit(SOCK_ASYNC_NOSPACE, &transport->sock->flags); } out: From 2e3c230bc7149a6af65d26a0c312e230e0c33cc3 Mon Sep 17 00:00:00 2001 From: Tom Talpey Date: Thu, 12 Mar 2009 22:21:21 -0400 Subject: [PATCH 061/478] SVCRDMA: fix recent printk format warnings. printk formats in prior commit were reversed/incorrect. Compiled without warning on x86 and x86_64, but detected on ppc. Signed-off-by: Tom Talpey Signed-off-by: Trond Myklebust --- net/sunrpc/xprtrdma/svc_rdma_sendto.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c index d0bea987d80e..6c26a675435a 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -235,7 +235,7 @@ static int map_xdr(struct svcxprt_rdma *xprt, } dprintk("svcrdma: map_xdr: sge_no %d page_no %d " - "page_base %zd page_len %zd head_len %d tail_len %d\n", + "page_base %u page_len %u head_len %zu tail_len %zu\n", sge_no, page_no, xdr->page_base, xdr->page_len, xdr->head[0].iov_len, xdr->tail[0].iov_len); From 47c62564200609b6de60f535f61f0c73dd10c7c9 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 16 Mar 2009 08:13:41 -0400 Subject: [PATCH 062/478] NFS: Fix up a mismerged patch Move the definition of nfs_need_commit() into the #ifdef CONFIG_NFS_V3 section as originally intended in the patch "NFS: cleanup - remove struct nfs_inode->ncommit" Signed-off-by: Trond Myklebust --- fs/nfs/write.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 36fd35e0de83..e560a78995a3 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -537,13 +537,13 @@ static void nfs_cancel_commit_list(struct list_head *head) } } +#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) static int nfs_need_commit(struct nfs_inode *nfsi) { return radix_tree_tagged(&nfsi->nfs_page_tree, NFS_PAGE_TAG_COMMIT); } -#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) /* * nfs_scan_commit - Scan an inode for commit requests * @inode: NFS inode to scan From b1e4adf4ea41bb8b5a7bfc1a7001f137e65495df Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 19 Mar 2009 15:35:49 -0400 Subject: [PATCH 063/478] NFS: Fix the notifications when renaming onto an existing file NFS appears to be returning an unnecessary "delete" notification when we're doing an atomic rename. See http://bugzilla.gnome.org/show_bug.cgi?id=575684 The fix is to get rid of the redundant call to d_delete(). Signed-off-by: Trond Myklebust --- fs/nfs/dir.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 672368f865ca..3b2f6973e7c5 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1624,8 +1624,7 @@ static int nfs_rename(struct inode *old_dir, struct dentry *old_dentry, } else if (atomic_read(&new_dentry->d_count) > 1) /* dentry still busy? */ goto out; - } else - nfs_drop_nlink(new_inode); + } go_ahead: /* @@ -1638,10 +1637,8 @@ go_ahead: } nfs_inode_return_delegation(old_inode); - if (new_inode != NULL) { + if (new_inode != NULL) nfs_inode_return_delegation(new_inode); - d_delete(new_dentry); - } error = NFS_PROTO(old_dir)->rename(old_dir, &old_dentry->d_name, new_dir, &new_dentry->d_name); @@ -1650,6 +1647,8 @@ out: if (rehash) d_rehash(rehash); if (!error) { + if (new_inode != NULL) + nfs_drop_nlink(new_inode); d_move(old_dentry, new_dentry); nfs_set_verifier(new_dentry, nfs_save_change_attribute(new_dir)); From 7fe5c398fc2186ed586db11106a6692d871d0d58 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 19 Mar 2009 15:35:50 -0400 Subject: [PATCH 064/478] NFS: Optimise NFS close() Close-to-open cache consistency rules really only require us to flush out writes on calls to close(), and require us to revalidate attributes on the very last close of the file. Currently we appear to be doing a lot of extra attribute revalidation and cache flushes. Signed-off-by: Trond Myklebust --- fs/nfs/file.c | 11 ++--------- fs/nfs/inode.c | 41 +++++++++++++++++++++++++++++------------ fs/nfs/internal.h | 3 +++ fs/nfs/nfs3proc.c | 1 + fs/nfs/nfs4proc.c | 10 ++++++++++ fs/nfs/proc.c | 1 + include/linux/nfs_xdr.h | 1 + 7 files changed, 47 insertions(+), 21 deletions(-) diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 1eab9c9ad242..d451073c4947 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -137,9 +137,6 @@ nfs_file_release(struct inode *inode, struct file *filp) dentry->d_parent->d_name.name, dentry->d_name.name); - /* Ensure that dirty pages are flushed out with the right creds */ - if (filp->f_mode & FMODE_WRITE) - nfs_wb_all(dentry->d_inode); nfs_inc_stats(inode, NFSIOS_VFSRELEASE); return nfs_release(inode, filp); } @@ -231,7 +228,6 @@ nfs_file_flush(struct file *file, fl_owner_t id) struct nfs_open_context *ctx = nfs_file_open_context(file); struct dentry *dentry = file->f_path.dentry; struct inode *inode = dentry->d_inode; - int status; dprintk("NFS: flush(%s/%s)\n", dentry->d_parent->d_name.name, @@ -241,11 +237,8 @@ nfs_file_flush(struct file *file, fl_owner_t id) return 0; nfs_inc_stats(inode, NFSIOS_VFSFLUSH); - /* Ensure that data+attribute caches are up to date after close() */ - status = nfs_do_fsync(ctx, inode); - if (!status) - nfs_revalidate_inode(NFS_SERVER(inode), inode); - return status; + /* Flush writes to the server and return any errors */ + return nfs_do_fsync(ctx, inode); } static ssize_t diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index c40adc5dd609..a834d1d850b7 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -541,6 +541,32 @@ int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) return err; } +/** + * nfs_close_context - Common close_context() routine NFSv2/v3 + * @ctx: pointer to context + * @is_sync: is this a synchronous close + * + * always ensure that the attributes are up to date if we're mounted + * with close-to-open semantics + */ +void nfs_close_context(struct nfs_open_context *ctx, int is_sync) +{ + struct inode *inode; + struct nfs_server *server; + + if (!(ctx->mode & FMODE_WRITE)) + return; + if (!is_sync) + return; + inode = ctx->path.dentry->d_inode; + if (!list_empty(&NFS_I(inode)->open_files)) + return; + server = NFS_SERVER(inode); + if (server->flags & NFS_MOUNT_NOCTO) + return; + nfs_revalidate_inode(server, inode); +} + static struct nfs_open_context *alloc_nfs_open_context(struct vfsmount *mnt, struct dentry *dentry, struct rpc_cred *cred) { struct nfs_open_context *ctx; @@ -567,24 +593,15 @@ struct nfs_open_context *get_nfs_open_context(struct nfs_open_context *ctx) return ctx; } -static void __put_nfs_open_context(struct nfs_open_context *ctx, int wait) +static void __put_nfs_open_context(struct nfs_open_context *ctx, int is_sync) { - struct inode *inode; + struct inode *inode = ctx->path.dentry->d_inode; - if (ctx == NULL) - return; - - inode = ctx->path.dentry->d_inode; if (!atomic_dec_and_lock(&ctx->count, &inode->i_lock)) return; list_del(&ctx->list); spin_unlock(&inode->i_lock); - if (ctx->state != NULL) { - if (wait) - nfs4_close_sync(&ctx->path, ctx->state, ctx->mode); - else - nfs4_close_state(&ctx->path, ctx->state, ctx->mode); - } + NFS_PROTO(inode)->close_context(ctx, is_sync); if (ctx->cred != NULL) put_rpccred(ctx->cred); path_put(&ctx->path); diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index a55e69aa52e5..2041f68ff1cc 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -152,6 +152,9 @@ extern __be32 *nfs4_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus); extern struct rpc_procinfo nfs4_procedures[]; #endif +/* proc.c */ +void nfs_close_context(struct nfs_open_context *ctx, int is_sync); + /* dir.c */ extern int nfs_access_cache_shrinker(int nr_to_scan, gfp_t gfp_mask); diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index c55be7a7679e..b82fe6847f14 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -834,4 +834,5 @@ const struct nfs_rpc_ops nfs_v3_clientops = { .commit_done = nfs3_commit_done, .lock = nfs3_proc_lock, .clear_acl_cache = nfs3_forget_cached_acls, + .close_context = nfs_close_context, }; diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 95f171e7e05a..97bacccff579 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -1572,6 +1572,15 @@ out_drop: return 0; } +void nfs4_close_context(struct nfs_open_context *ctx, int is_sync) +{ + if (ctx->state == NULL) + return; + if (is_sync) + nfs4_close_sync(&ctx->path, ctx->state, ctx->mode); + else + nfs4_close_state(&ctx->path, ctx->state, ctx->mode); +} static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle) { @@ -3776,6 +3785,7 @@ const struct nfs_rpc_ops nfs_v4_clientops = { .commit_done = nfs4_commit_done, .lock = nfs4_proc_lock, .clear_acl_cache = nfs4_zap_acl_attr, + .close_context = nfs4_close_context, }; /* diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c index 193465210d7c..7be72d90d49d 100644 --- a/fs/nfs/proc.c +++ b/fs/nfs/proc.c @@ -663,4 +663,5 @@ const struct nfs_rpc_ops nfs_v2_clientops = { .commit_setup = nfs_proc_commit_setup, .lock = nfs_proc_lock, .lock_check_bounds = nfs_lock_check_bounds, + .close_context = nfs_close_context, }; diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 0691b9c188d9..9708e78a4d49 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -868,6 +868,7 @@ struct nfs_rpc_ops { int (*lock)(struct file *, int, struct file_lock *); int (*lock_check_bounds)(const struct file_lock *); void (*clear_acl_cache)(struct inode *); + void (*close_context)(struct nfs_open_context *ctx, int); }; /* From 600914ba524130583fa5acdd00df4aa7aa44b173 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Wed, 14 Jan 2009 10:04:25 -0700 Subject: [PATCH 065/478] PCI/x86: make early dump handle multi-function devices The early "dump PCI config space" code skips many multi-function devices. This patch fixes that, so it dumps all devices in PCI domain 0. We should not skip the rest of the functions if CLASS_REVISION is 0xffffffff. Often multi-function devices have gaps in the function ID space, e.g., 1c.0 and 1c.2 exist but 1c.1 doesn't. The CLASS_REVISION of the non-existent 1c.1 function will appear to be 0xffffffff. We should only look at the HEADER_TYPE of function zero. Often the "multi-function" is set in function zero, but not in other functions. Signed-off-by: Bjorn Helgaas Signed-off-by: Jesse Barnes --- arch/x86/pci/early.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/arch/x86/pci/early.c b/arch/x86/pci/early.c index f6adf2c6d751..c1a2cd541724 100644 --- a/arch/x86/pci/early.c +++ b/arch/x86/pci/early.c @@ -96,18 +96,21 @@ void early_dump_pci_devices(void) for (func = 0; func < 8; func++) { u32 class; u8 type; + class = read_pci_config(bus, slot, func, PCI_CLASS_REVISION); if (class == 0xffffffff) - break; + continue; early_dump_pci_device(bus, slot, func); - /* No multi-function device? */ - type = read_pci_config_byte(bus, slot, func, + if (func == 0) { + type = read_pci_config_byte(bus, slot, + func, PCI_HEADER_TYPE); - if (!(type & 0x80)) - break; + if (!(type & 0x80)) + break; + } } } } From 7bc9e77dcc6edf6ce0b3e4677b1e7f4a05b95b85 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Wed, 14 Jan 2009 10:04:30 -0700 Subject: [PATCH 066/478] PCI/x86: format early dump like other PCI output Use %02x:%02x.%d rather than %02x:%02x:%02x so PCI addresses look the same as in other parts of the kernel. Signed-off-by: Bjorn Helgaas Signed-off-by: Jesse Barnes --- arch/x86/pci/early.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/pci/early.c b/arch/x86/pci/early.c index c1a2cd541724..aaf26ae58cd5 100644 --- a/arch/x86/pci/early.c +++ b/arch/x86/pci/early.c @@ -69,11 +69,12 @@ void early_dump_pci_device(u8 bus, u8 slot, u8 func) int j; u32 val; - printk(KERN_INFO "PCI: %02x:%02x:%02x", bus, slot, func); + printk(KERN_INFO "pci 0000:%02x:%02x.%d config space:", + bus, slot, func); for (i = 0; i < 256; i += 4) { if (!(i & 0x0f)) - printk("\n%04x:",i); + printk("\n %02x:",i); val = read_pci_config(bus, slot, func, i); for (j = 0; j < 4; j++) { @@ -115,4 +116,3 @@ void early_dump_pci_devices(void) } } } - From 1cc0ca26c57221f5aa0f49a311a8fc83413dfe97 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Wed, 14 Jan 2009 10:04:36 -0700 Subject: [PATCH 067/478] PCI/x86: document pci=earlydump argument Document the "pci=earlydump" argument. This currently only works on x86. Signed-off-by: Bjorn Helgaas Signed-off-by: Jesse Barnes --- Documentation/kernel-parameters.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 54f21a5c262b..c7c441e7930e 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -1659,6 +1659,8 @@ and is between 256 and 4096 characters. It is defined in the file See also Documentation/blockdev/paride.txt. pci=option[,option...] [PCI] various PCI subsystem options: + earlydump [X86] dump PCI config space before the kernel + changes anything off [X86] don't probe for the PCI bus bios [X86-32] force use of PCI BIOS, don't access the hardware directly. Use this if your machine From e496b617b40f2abf6d49803f56aa1344ce1b9177 Mon Sep 17 00:00:00 2001 From: Harvey Harrison Date: Wed, 7 Jan 2009 16:22:37 -0800 Subject: [PATCH 068/478] PCI: __FUNCTION__ is gcc-specific, use __func__ Signed-off-by: Harvey Harrison Signed-off-by: Jesse Barnes --- drivers/pci/pci.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index 6d6120007af4..5737b8a9a732 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -657,7 +657,7 @@ static int pci_save_pcie_state(struct pci_dev *dev) save_state = pci_find_saved_cap(dev, PCI_CAP_ID_EXP); if (!save_state) { - dev_err(&dev->dev, "buffer not found in %s\n", __FUNCTION__); + dev_err(&dev->dev, "buffer not found in %s\n", __func__); return -ENOMEM; } cap = (u16 *)&save_state->data[0]; @@ -700,7 +700,7 @@ static int pci_save_pcix_state(struct pci_dev *dev) save_state = pci_find_saved_cap(dev, PCI_CAP_ID_PCIX); if (!save_state) { - dev_err(&dev->dev, "buffer not found in %s\n", __FUNCTION__); + dev_err(&dev->dev, "buffer not found in %s\n", __func__); return -ENOMEM; } From 1bf83e558cb29d163f4bc6decbc3800ecf4db195 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 13 Jan 2009 14:38:34 +0100 Subject: [PATCH 069/478] PCI: PCIe portdrv: Use driver data to simplify code PCI Express port driver extension, as defined by struct pcie_port_device_ext in portdrv.h, is allocated and initialized, but never used (it also is never freed). Extend it to hold the PCI Express port type as well as the port interrupt mode, change its name and use it to simplify the code in portdrv_core.c . Additionally, remove the redundant interrupt_mode member of struct pcie_device defined in include/linux/pcieport_if.h . Signed-off-by: Rafael J. Wysocki Signed-off-by: Jesse Barnes --- drivers/pci/pcie/portdrv.h | 5 +- drivers/pci/pcie/portdrv_core.c | 95 +++++++++++++-------------------- include/linux/pcieport_if.h | 1 - 3 files changed, 39 insertions(+), 62 deletions(-) diff --git a/drivers/pci/pcie/portdrv.h b/drivers/pci/pcie/portdrv.h index 2529f3f2ea5a..b0dcbc73415e 100644 --- a/drivers/pci/pcie/portdrv.h +++ b/drivers/pci/pcie/portdrv.h @@ -28,8 +28,9 @@ #define get_descriptor_id(type, service) (((type - 4) << 4) | service) -struct pcie_port_device_ext { - int interrupt_mode; /* [0:INTx | 1:MSI | 2:MSI-X] */ +struct pcie_port_data { + int port_type; /* Type of the port */ + int port_irq_mode; /* [0:INTx | 1:MSI | 2:MSI-X] */ }; extern struct bus_type pcie_port_bus_type; diff --git a/drivers/pci/pcie/portdrv_core.c b/drivers/pci/pcie/portdrv_core.c index 8b3f8c18032f..273e97619bce 100644 --- a/drivers/pci/pcie/portdrv_core.c +++ b/drivers/pci/pcie/portdrv_core.c @@ -15,10 +15,9 @@ #include #include +#include "../pci.h" #include "portdrv.h" -extern int pcie_mch_quirk; /* MSI-quirk Indicator */ - /** * release_pcie_device - free PCI Express port service device structure * @dev: Port service device to release @@ -31,28 +30,6 @@ static void release_pcie_device(struct device *dev) kfree(to_pcie_device(dev)); } -static int is_msi_quirked(struct pci_dev *dev) -{ - int port_type, quirk = 0; - u16 reg16; - - pci_read_config_word(dev, - pci_find_capability(dev, PCI_CAP_ID_EXP) + - PCIE_CAPABILITIES_REG, ®16); - port_type = (reg16 >> 4) & PORT_TYPE_MASK; - switch(port_type) { - case PCIE_RC_PORT: - if (pcie_mch_quirk == 1) - quirk = 1; - break; - case PCIE_SW_UPSTREAM_PORT: - case PCIE_SW_DOWNSTREAM_PORT: - default: - break; - } - return quirk; -} - /** * assign_interrupt_mode - choose interrupt mode for PCI Express port services * (INTx, MSI-X, MSI) and set up vectors @@ -64,6 +41,7 @@ static int is_msi_quirked(struct pci_dev *dev) */ static int assign_interrupt_mode(struct pci_dev *dev, int *vectors, int mask) { + struct pcie_port_data *port_data = pci_get_drvdata(dev); int i, pos, nvec, status = -EINVAL; int interrupt_mode = PCIE_PORT_INTx_MODE; @@ -75,7 +53,7 @@ static int assign_interrupt_mode(struct pci_dev *dev, int *vectors, int mask) } /* Check MSI quirk */ - if (is_msi_quirked(dev)) + if (port_data->port_type == PCIE_RC_PORT && pcie_mch_quirk) return interrupt_mode; /* Select MSI-X over MSI if supported */ @@ -132,13 +110,11 @@ static int get_port_device_capability(struct pci_dev *dev) pos + PCIE_SLOT_CAPABILITIES_REG, ®32); if (reg32 & SLOT_HP_CAPABLE_MASK) services |= PCIE_PORT_SERVICE_HP; - } - /* PME Capable - root port capability */ - if (((reg16 >> 4) & PORT_TYPE_MASK) == PCIE_RC_PORT) - services |= PCIE_PORT_SERVICE_PME; - + } + /* AER capable */ if (pci_find_ext_capability(dev, PCI_EXT_CAP_ID_ERR)) services |= PCIE_PORT_SERVICE_AER; + /* VC support */ if (pci_find_ext_capability(dev, PCI_EXT_CAP_ID_VC)) services |= PCIE_PORT_SERVICE_VC; @@ -152,15 +128,15 @@ static int get_port_device_capability(struct pci_dev *dev) * @port_type: Type of the port * @service_type: Type of service to associate with the service device * @irq: Interrupt vector to associate with the service device - * @irq_mode: Interrupt mode of the service (INTx, MSI-X, MSI) */ static void pcie_device_init(struct pci_dev *parent, struct pcie_device *dev, - int port_type, int service_type, int irq, int irq_mode) + int service_type, int irq) { + struct pcie_port_data *port_data = pci_get_drvdata(parent); struct device *device; + int port_type = port_data->port_type; dev->port = parent; - dev->interrupt_mode = irq_mode; dev->irq = irq; dev->id.vendor = parent->vendor; dev->id.device = parent->device; @@ -185,10 +161,9 @@ static void pcie_device_init(struct pci_dev *parent, struct pcie_device *dev, * @port_type: Type of the port * @service_type: Type of service to associate with the service device * @irq: Interrupt vector to associate with the service device - * @irq_mode: Interrupt mode of the service (INTx, MSI-X, MSI) */ static struct pcie_device* alloc_pcie_device(struct pci_dev *parent, - int port_type, int service_type, int irq, int irq_mode) + int service_type, int irq) { struct pcie_device *device; @@ -196,7 +171,7 @@ static struct pcie_device* alloc_pcie_device(struct pci_dev *parent, if (!device) return NULL; - pcie_device_init(parent, device, port_type, service_type, irq,irq_mode); + pcie_device_init(parent, device, service_type, irq); return device; } @@ -230,39 +205,36 @@ int pcie_port_device_probe(struct pci_dev *dev) */ int pcie_port_device_register(struct pci_dev *dev) { - struct pcie_port_device_ext *p_ext; - int status, type, capabilities, irq_mode, i; + struct pcie_port_data *port_data; + int status, capabilities, irq_mode, i; int vectors[PCIE_PORT_DEVICE_MAXSERVICES]; u16 reg16; - /* Allocate port device extension */ - if (!(p_ext = kmalloc(sizeof(struct pcie_port_device_ext), GFP_KERNEL))) + port_data = kzalloc(sizeof(*port_data), GFP_KERNEL); + if (!port_data) return -ENOMEM; - - pci_set_drvdata(dev, p_ext); + pci_set_drvdata(dev, port_data); /* Get port type */ pci_read_config_word(dev, pci_find_capability(dev, PCI_CAP_ID_EXP) + PCIE_CAPABILITIES_REG, ®16); - type = (reg16 >> 4) & PORT_TYPE_MASK; + port_data->port_type = (reg16 >> 4) & PORT_TYPE_MASK; - /* Now get port services */ capabilities = get_port_device_capability(dev); + /* Root ports are capable of generating PME too */ + if (port_data->port_type == PCIE_RC_PORT) + capabilities |= PCIE_PORT_SERVICE_PME; + irq_mode = assign_interrupt_mode(dev, vectors, capabilities); - p_ext->interrupt_mode = irq_mode; + port_data->port_irq_mode = irq_mode; /* Allocate child services if any */ for (i = 0; i < PCIE_PORT_DEVICE_MAXSERVICES; i++) { struct pcie_device *child; if (capabilities & (1 << i)) { - child = alloc_pcie_device( - dev, /* parent */ - type, /* port type */ - i, /* service type */ - vectors[i], /* irq */ - irq_mode /* interrupt mode */); + child = alloc_pcie_device(dev, i, vectors[i]); if (child) { status = device_register(&child->device); if (status) { @@ -349,25 +321,30 @@ static int remove_iter(struct device *dev, void *data) */ void pcie_port_device_remove(struct pci_dev *dev) { - struct device *device; - unsigned long device_addr; - int interrupt_mode = PCIE_PORT_INTx_MODE; + struct pcie_port_data *port_data = pci_get_drvdata(dev); int status; do { + unsigned long device_addr; + status = device_for_each_child(&dev->dev, &device_addr, remove_iter); if (status) { - device = (struct device*)device_addr; - interrupt_mode = (to_pcie_device(device))->interrupt_mode; + struct device *device = (struct device*)device_addr; put_device(device); device_unregister(device); } } while (status); - /* Switch to INTx by default if MSI enabled */ - if (interrupt_mode == PCIE_PORT_MSIX_MODE) + + switch (port_data->port_irq_mode) { + case PCIE_PORT_MSIX_MODE: pci_disable_msix(dev); - else if (interrupt_mode == PCIE_PORT_MSI_MODE) + break; + case PCIE_PORT_MSI_MODE: pci_disable_msi(dev); + break; + } + + kfree(port_data); } /** diff --git a/include/linux/pcieport_if.h b/include/linux/pcieport_if.h index 6cd91e3f9820..194409af1037 100644 --- a/include/linux/pcieport_if.h +++ b/include/linux/pcieport_if.h @@ -36,7 +36,6 @@ struct pcie_port_service_id { struct pcie_device { int irq; /* Service IRQ/MSI/MSI-X Vector */ - int interrupt_mode; /* [0:INTx | 1:MSI | 2:MSI-X] */ struct pcie_port_service_id id; /* Service ID */ struct pci_dev *port; /* Root/Upstream/Downstream Port */ void *priv_data; /* Service Private Data */ From 90e9cd50f7feeddc911325c8a8c1b7e1fccc6599 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 13 Jan 2009 14:39:39 +0100 Subject: [PATCH 070/478] PCI: PCIe portdrv: Aviod using service devices with wrong interrupts The PCI Express port driver should not attempt to register service devices that require the ability to generate interrupts if generating interrupts is not possible. Namely, if the port has no interrupt pin configured and we cannot set up MSI or MSI-X for it, there is no way it can generate interrupts and in such a case the port services that rely on interrupts (PME, PCIe HP, AER) should not be enabled for it. Signed-off-by: Rafael J. Wysocki Signed-off-by: Jesse Barnes --- drivers/pci/pcie/portdrv_core.c | 41 ++++++++++++++++++++++----------- include/linux/pcieport_if.h | 1 + 2 files changed, 29 insertions(+), 13 deletions(-) diff --git a/drivers/pci/pcie/portdrv_core.c b/drivers/pci/pcie/portdrv_core.c index 273e97619bce..265eba033a4a 100644 --- a/drivers/pci/pcie/portdrv_core.c +++ b/drivers/pci/pcie/portdrv_core.c @@ -43,7 +43,7 @@ static int assign_interrupt_mode(struct pci_dev *dev, int *vectors, int mask) { struct pcie_port_data *port_data = pci_get_drvdata(dev); int i, pos, nvec, status = -EINVAL; - int interrupt_mode = PCIE_PORT_INTx_MODE; + int interrupt_mode = PCIE_PORT_NO_IRQ; /* Set INTx as default */ for (i = 0, nvec = 0; i < PCIE_PORT_DEVICE_MAXSERVICES; i++) { @@ -51,7 +51,9 @@ static int assign_interrupt_mode(struct pci_dev *dev, int *vectors, int mask) nvec++; vectors[i] = dev->irq; } - + if (dev->pin) + interrupt_mode = PCIE_PORT_INTx_MODE; + /* Check MSI quirk */ if (port_data->port_type == PCIE_RC_PORT && pcie_mch_quirk) return interrupt_mode; @@ -141,7 +143,7 @@ static void pcie_device_init(struct pci_dev *parent, struct pcie_device *dev, dev->id.vendor = parent->vendor; dev->id.device = parent->device; dev->id.port_type = port_type; - dev->id.service_type = (1 << service_type); + dev->id.service_type = service_type; /* Initialize generic device interface */ device = &dev->device; @@ -232,19 +234,32 @@ int pcie_port_device_register(struct pci_dev *dev) /* Allocate child services if any */ for (i = 0; i < PCIE_PORT_DEVICE_MAXSERVICES; i++) { struct pcie_device *child; + int service = 1 << i; - if (capabilities & (1 << i)) { - child = alloc_pcie_device(dev, i, vectors[i]); - if (child) { - status = device_register(&child->device); - if (status) { - kfree(child); - continue; - } - get_device(&child->device); - } + if (!(capabilities & service)) + continue; + + /* + * Don't use service devices that require interrupts if there is + * no way to generate them. + */ + if (irq_mode == PCIE_PORT_NO_IRQ + && service != PCIE_PORT_SERVICE_VC) + continue; + + child = alloc_pcie_device(dev, service, vectors[i]); + if (!child) + continue; + + status = device_register(&child->device); + if (status) { + kfree(child); + continue; } + + get_device(&child->device); } + return 0; } diff --git a/include/linux/pcieport_if.h b/include/linux/pcieport_if.h index 194409af1037..8e1ae1fd92f6 100644 --- a/include/linux/pcieport_if.h +++ b/include/linux/pcieport_if.h @@ -22,6 +22,7 @@ #define PCIE_PORT_SERVICE_VC 8 /* Virtual Channel */ /* Root/Upstream/Downstream Port's Interrupt Mode */ +#define PCIE_PORT_NO_IRQ (-1) #define PCIE_PORT_INTx_MODE 0 #define PCIE_PORT_MSI_MODE 1 #define PCIE_PORT_MSIX_MODE 2 From f118c0c3cff4fed39bde1863f9d59850719645cc Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 13 Jan 2009 14:42:01 +0100 Subject: [PATCH 071/478] PCI: PCIe portdrv: Do not enable port device before setting up interrupts The PCI Express port driver calls pci_enable_device() before setting up interrupts, which is wrong, because if there is an interrupt pin configured for the port, pci_enable_device() will likely set up an interrupt link for it. However, this shouldn't be done if either MSI or MSI-X interrupt mode is chosen for the port. The solution is to call pci_enable_device() after setting up interrupts, because in that case the interrupt link won't be set up if MSI or MSI-X are enabled. Signed-off-by: Rafael J. Wysocki Signed-off-by: Jesse Barnes --- drivers/pci/pcie/portdrv_core.c | 44 +++++++++++++++++++++++---------- drivers/pci/pcie/portdrv_pci.c | 11 +++------ 2 files changed, 34 insertions(+), 21 deletions(-) diff --git a/drivers/pci/pcie/portdrv_core.c b/drivers/pci/pcie/portdrv_core.c index 265eba033a4a..91ecbc43155f 100644 --- a/drivers/pci/pcie/portdrv_core.c +++ b/drivers/pci/pcie/portdrv_core.c @@ -208,7 +208,7 @@ int pcie_port_device_probe(struct pci_dev *dev) int pcie_port_device_register(struct pci_dev *dev) { struct pcie_port_data *port_data; - int status, capabilities, irq_mode, i; + int status, capabilities, irq_mode, i, nr_serv; int vectors[PCIE_PORT_DEVICE_MAXSERVICES]; u16 reg16; @@ -229,22 +229,30 @@ int pcie_port_device_register(struct pci_dev *dev) capabilities |= PCIE_PORT_SERVICE_PME; irq_mode = assign_interrupt_mode(dev, vectors, capabilities); - port_data->port_irq_mode = irq_mode; - - /* Allocate child services if any */ - for (i = 0; i < PCIE_PORT_DEVICE_MAXSERVICES; i++) { - struct pcie_device *child; - int service = 1 << i; - - if (!(capabilities & service)) - continue; - + if (irq_mode == PCIE_PORT_NO_IRQ) { /* * Don't use service devices that require interrupts if there is * no way to generate them. */ - if (irq_mode == PCIE_PORT_NO_IRQ - && service != PCIE_PORT_SERVICE_VC) + if (!(capabilities & PCIE_PORT_SERVICE_VC)) { + status = -ENODEV; + goto Error; + } + capabilities = PCIE_PORT_SERVICE_VC; + } + port_data->port_irq_mode = irq_mode; + + status = pci_enable_device(dev); + if (status) + goto Error; + pci_set_master(dev); + + /* Allocate child services if any */ + for (i = 0, nr_serv = 0; i < PCIE_PORT_DEVICE_MAXSERVICES; i++) { + struct pcie_device *child; + int service = 1 << i; + + if (!(capabilities & service)) continue; child = alloc_pcie_device(dev, service, vectors[i]); @@ -258,9 +266,19 @@ int pcie_port_device_register(struct pci_dev *dev) } get_device(&child->device); + nr_serv++; + } + if (!nr_serv) { + pci_disable_device(dev); + status = -ENODEV; + goto Error; } return 0; + + Error: + kfree(port_data); + return status; } #ifdef CONFIG_PM diff --git a/drivers/pci/pcie/portdrv_pci.c b/drivers/pci/pcie/portdrv_pci.c index 5ea566e20b37..d17611f390bd 100644 --- a/drivers/pci/pcie/portdrv_pci.c +++ b/drivers/pci/pcie/portdrv_pci.c @@ -82,18 +82,13 @@ static int __devinit pcie_portdrv_probe (struct pci_dev *dev, if (status) return status; - if (pci_enable_device(dev) < 0) - return -ENODEV; - - pci_set_master(dev); if (!dev->irq && dev->pin) { dev_warn(&dev->dev, "device [%04x:%04x] has invalid IRQ; " "check vendor BIOS\n", dev->vendor, dev->device); } - if (pcie_port_device_register(dev)) { - pci_disable_device(dev); - return -ENOMEM; - } + status = pcie_port_device_register(dev); + if (status) + return status; pcie_portdrv_save_config(dev); From 87d2e2ecf6026efa64b01f7f71802b20da736d35 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 13 Jan 2009 14:43:07 +0100 Subject: [PATCH 072/478] PCI: PCIe portdrv: Remove unnecessary function The function pcie_portdrv_save_config() in portdrv_pci.c is not necessary. Remove it. Signed-off-by: Rafael J. Wysocki Signed-off-by: Jesse Barnes --- drivers/pci/pcie/portdrv_pci.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/drivers/pci/pcie/portdrv_pci.c b/drivers/pci/pcie/portdrv_pci.c index d17611f390bd..94d0e2af9bad 100644 --- a/drivers/pci/pcie/portdrv_pci.c +++ b/drivers/pci/pcie/portdrv_pci.c @@ -32,11 +32,6 @@ MODULE_LICENSE("GPL"); /* global data */ static const char device_name[] = "pcieport-driver"; -static int pcie_portdrv_save_config(struct pci_dev *dev) -{ - return pci_save_state(dev); -} - static int pcie_portdrv_restore_config(struct pci_dev *dev) { int retval; @@ -90,7 +85,7 @@ static int __devinit pcie_portdrv_probe (struct pci_dev *dev, if (status) return status; - pcie_portdrv_save_config(dev); + pci_save_state(dev); return 0; } From 0516c8bcd25293f438573101c439ce25a18916ad Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 13 Jan 2009 14:44:19 +0100 Subject: [PATCH 073/478] PCI: PCIe portdrv: Simplily probe callback of service drivers The second argument of the ->probe() callback in struct pcie_port_service_driver is unnecessary and never used. Remove it. Signed-off-by: Rafael J. Wysocki Signed-off-by: Jesse Barnes --- drivers/pci/hotplug/pciehp_acpi.c | 3 +-- drivers/pci/hotplug/pciehp_core.c | 2 +- drivers/pci/pcie/aer/aerdrv.c | 6 ++---- drivers/pci/pcie/portdrv_core.c | 2 +- include/linux/pcieport_if.h | 3 +-- 5 files changed, 6 insertions(+), 10 deletions(-) diff --git a/drivers/pci/hotplug/pciehp_acpi.c b/drivers/pci/hotplug/pciehp_acpi.c index 438d795f9fe3..ad8835758a17 100644 --- a/drivers/pci/hotplug/pciehp_acpi.c +++ b/drivers/pci/hotplug/pciehp_acpi.c @@ -82,8 +82,7 @@ static int __initdata acpi_slot_detected; static struct list_head __initdata dummy_slots = LIST_HEAD_INIT(dummy_slots); /* Dummy driver for dumplicate name detection */ -static int __init dummy_probe(struct pcie_device *dev, - const struct pcie_port_service_id *id) +static int __init dummy_probe(struct pcie_device *dev) { int pos; u32 slot_cap; diff --git a/drivers/pci/hotplug/pciehp_core.c b/drivers/pci/hotplug/pciehp_core.c index 681e3912b821..3429b21dbb53 100644 --- a/drivers/pci/hotplug/pciehp_core.c +++ b/drivers/pci/hotplug/pciehp_core.c @@ -401,7 +401,7 @@ static int get_cur_bus_speed(struct hotplug_slot *hotplug_slot, enum pci_bus_spe return 0; } -static int pciehp_probe(struct pcie_device *dev, const struct pcie_port_service_id *id) +static int pciehp_probe(struct pcie_device *dev) { int rc; struct controller *ctrl; diff --git a/drivers/pci/pcie/aer/aerdrv.c b/drivers/pci/pcie/aer/aerdrv.c index e390707661dd..57c41204c549 100644 --- a/drivers/pci/pcie/aer/aerdrv.c +++ b/drivers/pci/pcie/aer/aerdrv.c @@ -38,8 +38,7 @@ MODULE_AUTHOR(DRIVER_AUTHOR); MODULE_DESCRIPTION(DRIVER_DESC); MODULE_LICENSE("GPL"); -static int __devinit aer_probe (struct pcie_device *dev, - const struct pcie_port_service_id *id ); +static int __devinit aer_probe (struct pcie_device *dev); static void aer_remove(struct pcie_device *dev); static int aer_suspend(struct pcie_device *dev, pm_message_t state) {return 0;} @@ -207,8 +206,7 @@ static void aer_remove(struct pcie_device *dev) * * Invoked when PCI Express bus loads AER service driver. **/ -static int __devinit aer_probe (struct pcie_device *dev, - const struct pcie_port_service_id *id ) +static int __devinit aer_probe (struct pcie_device *dev) { int status; struct aer_rpc *rpc; diff --git a/drivers/pci/pcie/portdrv_core.c b/drivers/pci/pcie/portdrv_core.c index 91ecbc43155f..682524b0c93a 100644 --- a/drivers/pci/pcie/portdrv_core.c +++ b/drivers/pci/pcie/portdrv_core.c @@ -402,7 +402,7 @@ static int pcie_port_probe_service(struct device *dev) return -ENODEV; pciedev = to_pcie_device(dev); - status = driver->probe(pciedev, driver->id_table); + status = driver->probe(pciedev); if (!status) { dev_printk(KERN_DEBUG, dev, "service driver %s loaded\n", driver->name); diff --git a/include/linux/pcieport_if.h b/include/linux/pcieport_if.h index 8e1ae1fd92f6..59e90b8a7839 100644 --- a/include/linux/pcieport_if.h +++ b/include/linux/pcieport_if.h @@ -56,8 +56,7 @@ static inline void* get_service_data(struct pcie_device *dev) struct pcie_port_service_driver { const char *name; - int (*probe) (struct pcie_device *dev, - const struct pcie_port_service_id *id); + int (*probe) (struct pcie_device *dev); void (*remove) (struct pcie_device *dev); int (*suspend) (struct pcie_device *dev, pm_message_t state); int (*resume) (struct pcie_device *dev); From 22106368c999246c414610dcaacd485e741605b1 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 13 Jan 2009 14:46:46 +0100 Subject: [PATCH 074/478] PCI: PCIe portdrv: Remove struct pcie_port_service_id The PCI Express port driver uses 'struct pcie_port_service_id' for matching port service devices and drivers, but this structure contains fields that duplicate information from the port device itself (vendor, device, subvendor, subdevice) and fields that are not used by any existing port service driver (class, class_mask, drvier_data). Also, both existing port service drivers (AER and PCIe HP) don't even use the vendor and device fields for device matching. Therefore 'struct pcie_port_service_id' can be removed altogether and the only useful members of it (port_type, service) can be introduced directly into the port service device and port service driver structures. That simplifies the code quite a bit and reduces its size. Signed-off-by: Rafael J. Wysocki Signed-off-by: Jesse Barnes --- drivers/pci/hotplug/pciehp_acpi.c | 13 ++----------- drivers/pci/hotplug/pciehp_core.c | 12 ++---------- drivers/pci/pcie/aer/aerdrv.c | 16 ++-------------- drivers/pci/pcie/aer/aerdrv_core.c | 10 +++++----- drivers/pci/pcie/portdrv.h | 5 ----- drivers/pci/pcie/portdrv_bus.c | 18 ++++++++++-------- drivers/pci/pcie/portdrv_core.c | 5 +---- include/linux/pcieport_if.h | 17 ++++++++--------- 8 files changed, 30 insertions(+), 66 deletions(-) diff --git a/drivers/pci/hotplug/pciehp_acpi.c b/drivers/pci/hotplug/pciehp_acpi.c index ad8835758a17..21734c311529 100644 --- a/drivers/pci/hotplug/pciehp_acpi.c +++ b/drivers/pci/hotplug/pciehp_acpi.c @@ -67,16 +67,6 @@ static int __init parse_detect_mode(void) return PCIEHP_DETECT_DEFAULT; } -static struct pcie_port_service_id __initdata port_pci_ids[] = { - { - .vendor = PCI_ANY_ID, - .device = PCI_ANY_ID, - .port_type = PCIE_ANY_PORT, - .service_type = PCIE_PORT_SERVICE_HP, - .driver_data = 0, - }, { /* end: all zeroes */ } -}; - static int __initdata dup_slot_id; static int __initdata acpi_slot_detected; static struct list_head __initdata dummy_slots = LIST_HEAD_INIT(dummy_slots); @@ -110,7 +100,8 @@ static int __init dummy_probe(struct pcie_device *dev) static struct pcie_port_service_driver __initdata dummy_driver = { .name = "pciehp_dummy", - .id_table = port_pci_ids, + .port_type = PCIE_ANY_PORT, + .service = PCIE_PORT_SERVICE_HP, .probe = dummy_probe, }; diff --git a/drivers/pci/hotplug/pciehp_core.c b/drivers/pci/hotplug/pciehp_core.c index 3429b21dbb53..3d21bbba3308 100644 --- a/drivers/pci/hotplug/pciehp_core.c +++ b/drivers/pci/hotplug/pciehp_core.c @@ -505,18 +505,10 @@ static int pciehp_resume (struct pcie_device *dev) } #endif -static struct pcie_port_service_id port_pci_ids[] = { { - .vendor = PCI_ANY_ID, - .device = PCI_ANY_ID, - .port_type = PCIE_ANY_PORT, - .service_type = PCIE_PORT_SERVICE_HP, - .driver_data = 0, - }, { /* end: all zeroes */ } -}; - static struct pcie_port_service_driver hpdriver_portdrv = { .name = PCIE_MODULE_NAME, - .id_table = &port_pci_ids[0], + .port_type = PCIE_ANY_PORT, + .service = PCIE_PORT_SERVICE_HP, .probe = pciehp_probe, .remove = pciehp_remove, diff --git a/drivers/pci/pcie/aer/aerdrv.c b/drivers/pci/pcie/aer/aerdrv.c index 57c41204c549..e11c03194063 100644 --- a/drivers/pci/pcie/aer/aerdrv.c +++ b/drivers/pci/pcie/aer/aerdrv.c @@ -48,19 +48,6 @@ static pci_ers_result_t aer_error_detected(struct pci_dev *dev, static void aer_error_resume(struct pci_dev *dev); static pci_ers_result_t aer_root_reset(struct pci_dev *dev); -/* - * PCI Express bus's AER Root service driver data structure - */ -static struct pcie_port_service_id aer_id[] = { - { - .vendor = PCI_ANY_ID, - .device = PCI_ANY_ID, - .port_type = PCIE_RC_PORT, - .service_type = PCIE_PORT_SERVICE_AER, - }, - { /* end: all zeroes */ } -}; - static struct pci_error_handlers aer_error_handlers = { .error_detected = aer_error_detected, .resume = aer_error_resume, @@ -68,7 +55,8 @@ static struct pci_error_handlers aer_error_handlers = { static struct pcie_port_service_driver aerdriver = { .name = "aer", - .id_table = &aer_id[0], + .port_type = PCIE_ANY_PORT, + .service = PCIE_PORT_SERVICE_AER, .probe = aer_probe, .remove = aer_remove, diff --git a/drivers/pci/pcie/aer/aerdrv_core.c b/drivers/pci/pcie/aer/aerdrv_core.c index 382575007382..307452f30035 100644 --- a/drivers/pci/pcie/aer/aerdrv_core.c +++ b/drivers/pci/pcie/aer/aerdrv_core.c @@ -351,21 +351,21 @@ static int find_aer_service_iter(struct device *device, void *data) { struct device_driver *driver; struct pcie_port_service_driver *service_driver; - struct pcie_device *pcie_dev; struct find_aer_service_data *result; result = (struct find_aer_service_data *) data; if (device->bus == &pcie_port_bus_type) { - pcie_dev = to_pcie_device(device); - if (pcie_dev->id.port_type == PCIE_SW_DOWNSTREAM_PORT) + struct pcie_port_data *port_data; + + port_data = pci_get_drvdata(to_pcie_device(device)->port); + if (port_data->port_type == PCIE_SW_DOWNSTREAM_PORT) result->is_downstream = 1; driver = device->driver; if (driver) { service_driver = to_service_driver(driver); - if (service_driver->id_table->service_type == - PCIE_PORT_SERVICE_AER) { + if (service_driver->service == PCIE_PORT_SERVICE_AER) { result->aer_driver = service_driver; return 1; } diff --git a/drivers/pci/pcie/portdrv.h b/drivers/pci/pcie/portdrv.h index b0dcbc73415e..ad4d082a0344 100644 --- a/drivers/pci/pcie/portdrv.h +++ b/drivers/pci/pcie/portdrv.h @@ -28,11 +28,6 @@ #define get_descriptor_id(type, service) (((type - 4) << 4) | service) -struct pcie_port_data { - int port_type; /* Type of the port */ - int port_irq_mode; /* [0:INTx | 1:MSI | 2:MSI-X] */ -}; - extern struct bus_type pcie_port_bus_type; extern int pcie_port_device_probe(struct pci_dev *dev); extern int pcie_port_device_register(struct pci_dev *dev); diff --git a/drivers/pci/pcie/portdrv_bus.c b/drivers/pci/pcie/portdrv_bus.c index eec89b767f9f..ef3a4eeaebb4 100644 --- a/drivers/pci/pcie/portdrv_bus.c +++ b/drivers/pci/pcie/portdrv_bus.c @@ -26,20 +26,22 @@ EXPORT_SYMBOL_GPL(pcie_port_bus_type); static int pcie_port_bus_match(struct device *dev, struct device_driver *drv) { struct pcie_device *pciedev; + struct pcie_port_data *port_data; struct pcie_port_service_driver *driver; if (drv->bus != &pcie_port_bus_type || dev->bus != &pcie_port_bus_type) return 0; - + pciedev = to_pcie_device(dev); driver = to_service_driver(drv); - if ( (driver->id_table->vendor != PCI_ANY_ID && - driver->id_table->vendor != pciedev->id.vendor) || - (driver->id_table->device != PCI_ANY_ID && - driver->id_table->device != pciedev->id.device) || - (driver->id_table->port_type != PCIE_ANY_PORT && - driver->id_table->port_type != pciedev->id.port_type) || - driver->id_table->service_type != pciedev->id.service_type ) + + if (driver->service != pciedev->service) + return 0; + + port_data = pci_get_drvdata(pciedev->port); + + if (driver->port_type != PCIE_ANY_PORT + && driver->port_type != port_data->port_type) return 0; return 1; diff --git a/drivers/pci/pcie/portdrv_core.c b/drivers/pci/pcie/portdrv_core.c index 682524b0c93a..843d9e30dd3b 100644 --- a/drivers/pci/pcie/portdrv_core.c +++ b/drivers/pci/pcie/portdrv_core.c @@ -140,10 +140,7 @@ static void pcie_device_init(struct pci_dev *parent, struct pcie_device *dev, dev->port = parent; dev->irq = irq; - dev->id.vendor = parent->vendor; - dev->id.device = parent->device; - dev->id.port_type = port_type; - dev->id.service_type = service_type; + dev->service = service_type; /* Initialize generic device interface */ device = &dev->device; diff --git a/include/linux/pcieport_if.h b/include/linux/pcieport_if.h index 59e90b8a7839..a3832079508e 100644 --- a/include/linux/pcieport_if.h +++ b/include/linux/pcieport_if.h @@ -27,18 +27,15 @@ #define PCIE_PORT_MSI_MODE 1 #define PCIE_PORT_MSIX_MODE 2 -struct pcie_port_service_id { - __u32 vendor, device; /* Vendor and device ID or PCI_ANY_ID*/ - __u32 subvendor, subdevice; /* Subsystem ID's or PCI_ANY_ID */ - __u32 class, class_mask; /* (class,subclass,prog-if) triplet */ - __u32 port_type, service_type; /* Port Entity */ - kernel_ulong_t driver_data; +struct pcie_port_data { + int port_type; /* Type of the port */ + int port_irq_mode; /* [0:INTx | 1:MSI | 2:MSI-X] */ }; struct pcie_device { int irq; /* Service IRQ/MSI/MSI-X Vector */ - struct pcie_port_service_id id; /* Service ID */ - struct pci_dev *port; /* Root/Upstream/Downstream Port */ + struct pci_dev *port; /* Root/Upstream/Downstream Port */ + u32 service; /* Port service this device represents */ void *priv_data; /* Service Private Data */ struct device device; /* Generic Device Interface */ }; @@ -67,7 +64,9 @@ struct pcie_port_service_driver { /* Link Reset Capability - AER service driver specific */ pci_ers_result_t (*reset_link) (struct pci_dev *dev); - const struct pcie_port_service_id *id_table; + int port_type; /* Type of the port this driver can handle */ + u32 service; /* Port service this device represents */ + struct device_driver driver; }; #define to_service_driver(d) \ From a447b772826fde2a3abfd9bb943dee8750994c55 Mon Sep 17 00:00:00 2001 From: Kay Sievers Date: Sun, 25 Jan 2009 23:53:56 +0100 Subject: [PATCH 075/478] PCI: struct device - replace bus_id with dev_name(), dev_set_name() More dev_set_name conversion. Acked-by: Greg Kroah-Hartman Signed-off-by: Kay Sievers Signed-off-by: Jesse Barnes --- drivers/pci/hotplug/cpqphp_sysfs.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/pci/hotplug/cpqphp_sysfs.c b/drivers/pci/hotplug/cpqphp_sysfs.c index a13abf55d784..8450f4a6568a 100644 --- a/drivers/pci/hotplug/cpqphp_sysfs.c +++ b/drivers/pci/hotplug/cpqphp_sysfs.c @@ -225,7 +225,8 @@ void cpqhp_shutdown_debugfs(void) void cpqhp_create_debugfs_files(struct controller *ctrl) { - ctrl->dentry = debugfs_create_file(ctrl->pci_dev->dev.bus_id, S_IRUGO, root, ctrl, &debug_ops); + ctrl->dentry = debugfs_create_file(dev_name(&ctrl->pci_dev->dev), + S_IRUGO, root, ctrl, &debug_ops); } void cpqhp_remove_debugfs_files(struct controller *ctrl) From a52e2e3513d4beafe8fe8699f1519b021c2d05ba Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sat, 24 Jan 2009 00:21:14 +0100 Subject: [PATCH 076/478] PCI/MSI: Introduce pci_msix_table_size() Introduce new function pci_msix_table_size() returning the size of the MSI-X table of given PCI device or 0 if the device doesn't support MSI-X. Signed-off-by: Rafael J. Wysocki Reviewed-by: Hidetoshi Seto Signed-off-by: Jesse Barnes --- drivers/pci/msi.c | 24 +++++++++++++++++++----- include/linux/pci.h | 5 +++++ 2 files changed, 24 insertions(+), 5 deletions(-) diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c index baba2eb5367d..08aedd5875b0 100644 --- a/drivers/pci/msi.c +++ b/drivers/pci/msi.c @@ -674,6 +674,23 @@ static int msi_free_irqs(struct pci_dev* dev) return 0; } +/** + * pci_msix_table_size - return the number of device's MSI-X table entries + * @dev: pointer to the pci_dev data structure of MSI-X device function + */ +int pci_msix_table_size(struct pci_dev *dev) +{ + int pos; + u16 control; + + pos = pci_find_capability(dev, PCI_CAP_ID_MSIX); + if (!pos) + return 0; + + pci_read_config_word(dev, msi_control_reg(pos), &control); + return multi_msix_capable(control); +} + /** * pci_enable_msix - configure device's MSI-X capability structure * @dev: pointer to the pci_dev data structure of MSI-X device function @@ -691,9 +708,8 @@ static int msi_free_irqs(struct pci_dev* dev) **/ int pci_enable_msix(struct pci_dev* dev, struct msix_entry *entries, int nvec) { - int status, pos, nr_entries; + int status, nr_entries; int i, j; - u16 control; if (!entries) return -EINVAL; @@ -702,9 +718,7 @@ int pci_enable_msix(struct pci_dev* dev, struct msix_entry *entries, int nvec) if (status) return status; - pos = pci_find_capability(dev, PCI_CAP_ID_MSIX); - pci_read_config_word(dev, msi_control_reg(pos), &control); - nr_entries = multi_msix_capable(control); + nr_entries = pci_msix_table_size(dev); if (nvec > nr_entries) return -EINVAL; diff --git a/include/linux/pci.h b/include/linux/pci.h index 7bd624bfdcfd..b5d6d0e0f1cb 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -799,6 +799,10 @@ static inline void pci_msi_shutdown(struct pci_dev *dev) static inline void pci_disable_msi(struct pci_dev *dev) { } +static inline int pci_msix_table_size(struct pci_dev *dev) +{ + return 0; +} static inline int pci_enable_msix(struct pci_dev *dev, struct msix_entry *entries, int nvec) { @@ -823,6 +827,7 @@ static inline int pci_msi_enabled(void) extern int pci_enable_msi(struct pci_dev *dev); extern void pci_msi_shutdown(struct pci_dev *dev); extern void pci_disable_msi(struct pci_dev *dev); +extern int pci_msix_table_size(struct pci_dev *dev); extern int pci_enable_msix(struct pci_dev *dev, struct msix_entry *entries, int nvec); extern void pci_msix_shutdown(struct pci_dev *dev); From b43d451385ef833e0696032aac2629da04d46c59 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sat, 24 Jan 2009 00:23:22 +0100 Subject: [PATCH 077/478] PCI/PCIe portdrv: Fix allocation of interrupts If MSI-X interrupt mode is used by the PCI Express port driver, too many vectors are allocated and it is not ensured that the right vectors will be used for the right services. Namely, the PCI Express specification states that both PCI Express native PME and PCI Express hotplug will always use the same MSI or MSI-X message for signalling interrupts, which implies that the same vector will be used by both of them. Also, the VC service does not use interrupts at all. Moreover, is not clear which of the vectors allocated by pci_enable_msix() in the current code will be used for PME and hotplug and which of them will be used for AER if all of these services are configured. For these reasons, rework the allocation of interrupts for PCI Express ports so that if MSI-X are enabled, the right vectors will be used for the right purposes. Signed-off-by: Rafael J. Wysocki Reviewed-by: Hidetoshi Seto Signed-off-by: Jesse Barnes --- drivers/pci/pcie/portdrv.h | 6 + drivers/pci/pcie/portdrv_core.c | 204 ++++++++++++++++++++++++++------ include/linux/pcieport_if.h | 12 +- 3 files changed, 180 insertions(+), 42 deletions(-) diff --git a/drivers/pci/pcie/portdrv.h b/drivers/pci/pcie/portdrv.h index ad4d082a0344..5b818bd835ef 100644 --- a/drivers/pci/pcie/portdrv.h +++ b/drivers/pci/pcie/portdrv.h @@ -25,6 +25,12 @@ #define PCIE_CAPABILITIES_REG 0x2 #define PCIE_SLOT_CAPABILITIES_REG 0x14 #define PCIE_PORT_DEVICE_MAXSERVICES 4 +#define PCIE_PORT_MSI_VECTOR_MASK 0x1f +/* + * According to the PCI Express Base Specification 2.0, the indices of the MSI-X + * table entires used by port services must not exceed 31 + */ +#define PCIE_PORT_MAX_MSIX_ENTRIES 32 #define get_descriptor_id(type, service) (((type - 4) << 4) | service) diff --git a/drivers/pci/pcie/portdrv_core.c b/drivers/pci/pcie/portdrv_core.c index 843d9e30dd3b..3aea92a92928 100644 --- a/drivers/pci/pcie/portdrv_core.c +++ b/drivers/pci/pcie/portdrv_core.c @@ -30,6 +30,152 @@ static void release_pcie_device(struct device *dev) kfree(to_pcie_device(dev)); } +/** + * pcie_port_msix_add_entry - add entry to given array of MSI-X entries + * @entries: Array of MSI-X entries + * @new_entry: Index of the entry to add to the array + * @nr_entries: Number of entries aleady in the array + * + * Return value: Position of the added entry in the array + */ +static int pcie_port_msix_add_entry( + struct msix_entry *entries, int new_entry, int nr_entries) +{ + int j; + + for (j = 0; j < nr_entries; j++) + if (entries[j].entry == new_entry) + return j; + + entries[j].entry = new_entry; + return j; +} + +/** + * pcie_port_enable_msix - try to set up MSI-X as interrupt mode for given port + * @dev: PCI Express port to handle + * @vectors: Array of interrupt vectors to populate + * @mask: Bitmask of port capabilities returned by get_port_device_capability() + * + * Return value: 0 on success, error code on failure + */ +static int pcie_port_enable_msix(struct pci_dev *dev, int *vectors, int mask) +{ + struct msix_entry *msix_entries; + int idx[PCIE_PORT_DEVICE_MAXSERVICES]; + int nr_entries, status, pos, i, nvec; + u16 reg16; + u32 reg32; + + nr_entries = pci_msix_table_size(dev); + if (!nr_entries) + return -EINVAL; + if (nr_entries > PCIE_PORT_MAX_MSIX_ENTRIES) + nr_entries = PCIE_PORT_MAX_MSIX_ENTRIES; + + msix_entries = kzalloc(sizeof(*msix_entries) * nr_entries, GFP_KERNEL); + if (!msix_entries) + return -ENOMEM; + + /* + * Allocate as many entries as the port wants, so that we can check + * which of them will be useful. Moreover, if nr_entries is correctly + * equal to the number of entries this port actually uses, we'll happily + * go through without any tricks. + */ + for (i = 0; i < nr_entries; i++) + msix_entries[i].entry = i; + + status = pci_enable_msix(dev, msix_entries, nr_entries); + if (status) + goto Exit; + + for (i = 0; i < PCIE_PORT_DEVICE_MAXSERVICES; i++) + idx[i] = -1; + status = -EIO; + nvec = 0; + + if (mask & (PCIE_PORT_SERVICE_PME | PCIE_PORT_SERVICE_HP)) { + int entry; + + /* + * The code below follows the PCI Express Base Specification 2.0 + * stating in Section 6.1.6 that "PME and Hot-Plug Event + * interrupts (when both are implemented) always share the same + * MSI or MSI-X vector, as indicated by the Interrupt Message + * Number field in the PCI Express Capabilities register", where + * according to Section 7.8.2 of the specification "For MSI-X, + * the value in this field indicates which MSI-X Table entry is + * used to generate the interrupt message." + */ + pos = pci_find_capability(dev, PCI_CAP_ID_EXP); + pci_read_config_word(dev, pos + PCIE_CAPABILITIES_REG, ®16); + entry = (reg16 >> 9) & PCIE_PORT_MSI_VECTOR_MASK; + if (entry >= nr_entries) + goto Error; + + i = pcie_port_msix_add_entry(msix_entries, entry, nvec); + if (i == nvec) + nvec++; + + idx[PCIE_PORT_SERVICE_PME_SHIFT] = i; + idx[PCIE_PORT_SERVICE_HP_SHIFT] = i; + } + + if (mask & PCIE_PORT_SERVICE_AER) { + int entry; + + /* + * The code below follows Section 7.10.10 of the PCI Express + * Base Specification 2.0 stating that bits 31-27 of the Root + * Error Status Register contain a value indicating which of the + * MSI/MSI-X vectors assigned to the port is going to be used + * for AER, where "For MSI-X, the value in this register + * indicates which MSI-X Table entry is used to generate the + * interrupt message." + */ + pos = pci_find_ext_capability(dev, PCI_EXT_CAP_ID_ERR); + pci_read_config_dword(dev, pos + PCI_ERR_ROOT_STATUS, ®32); + entry = reg32 >> 27; + if (entry >= nr_entries) + goto Error; + + i = pcie_port_msix_add_entry(msix_entries, entry, nvec); + if (i == nvec) + nvec++; + + idx[PCIE_PORT_SERVICE_AER_SHIFT] = i; + } + + /* + * If nvec is equal to the allocated number of entries, we can just use + * what we have. Otherwise, the port has some extra entries not for the + * services we know and we need to work around that. + */ + if (nvec == nr_entries) { + status = 0; + } else { + /* Drop the temporary MSI-X setup */ + pci_disable_msix(dev); + + /* Now allocate the MSI-X vectors for real */ + status = pci_enable_msix(dev, msix_entries, nvec); + if (status) + goto Exit; + } + + for (i = 0; i < PCIE_PORT_DEVICE_MAXSERVICES; i++) + vectors[i] = idx[i] >= 0 ? msix_entries[idx[i]].vector : -1; + + Exit: + kfree(msix_entries); + return status; + + Error: + pci_disable_msix(dev); + goto Exit; +} + /** * assign_interrupt_mode - choose interrupt mode for PCI Express port services * (INTx, MSI-X, MSI) and set up vectors @@ -42,49 +188,31 @@ static void release_pcie_device(struct device *dev) static int assign_interrupt_mode(struct pci_dev *dev, int *vectors, int mask) { struct pcie_port_data *port_data = pci_get_drvdata(dev); - int i, pos, nvec, status = -EINVAL; - int interrupt_mode = PCIE_PORT_NO_IRQ; - - /* Set INTx as default */ - for (i = 0, nvec = 0; i < PCIE_PORT_DEVICE_MAXSERVICES; i++) { - if (mask & (1 << i)) - nvec++; - vectors[i] = dev->irq; - } - if (dev->pin) - interrupt_mode = PCIE_PORT_INTx_MODE; + int irq, interrupt_mode = PCIE_PORT_NO_IRQ; + int i; /* Check MSI quirk */ if (port_data->port_type == PCIE_RC_PORT && pcie_mch_quirk) - return interrupt_mode; + goto Fallback; - /* Select MSI-X over MSI if supported */ - pos = pci_find_capability(dev, PCI_CAP_ID_MSIX); - if (pos) { - struct msix_entry msix_entries[PCIE_PORT_DEVICE_MAXSERVICES] = - {{0, 0}, {0, 1}, {0, 2}, {0, 3}}; - status = pci_enable_msix(dev, msix_entries, nvec); - if (!status) { - int j = 0; + /* Try to use MSI-X if supported */ + if (!pcie_port_enable_msix(dev, vectors, mask)) + return PCIE_PORT_MSIX_MODE; + + /* We're not going to use MSI-X, so try MSI and fall back to INTx */ + if (!pci_enable_msi(dev)) + interrupt_mode = PCIE_PORT_MSI_MODE; + + Fallback: + if (interrupt_mode == PCIE_PORT_NO_IRQ && dev->pin) + interrupt_mode = PCIE_PORT_INTx_MODE; + + irq = interrupt_mode != PCIE_PORT_NO_IRQ ? dev->irq : -1; + for (i = 0; i < PCIE_PORT_DEVICE_MAXSERVICES; i++) + vectors[i] = irq; + + vectors[PCIE_PORT_SERVICE_VC_SHIFT] = -1; - interrupt_mode = PCIE_PORT_MSIX_MODE; - for (i = 0; i < PCIE_PORT_DEVICE_MAXSERVICES; i++) { - if (mask & (1 << i)) - vectors[i] = msix_entries[j++].vector; - } - } - } - if (status) { - pos = pci_find_capability(dev, PCI_CAP_ID_MSI); - if (pos) { - status = pci_enable_msi(dev); - if (!status) { - interrupt_mode = PCIE_PORT_MSI_MODE; - for (i = 0;i < PCIE_PORT_DEVICE_MAXSERVICES;i++) - vectors[i] = dev->irq; - } - } - } return interrupt_mode; } diff --git a/include/linux/pcieport_if.h b/include/linux/pcieport_if.h index a3832079508e..5d2afcfa6bc1 100644 --- a/include/linux/pcieport_if.h +++ b/include/linux/pcieport_if.h @@ -16,10 +16,14 @@ #define PCIE_ANY_PORT 7 /* Service Type */ -#define PCIE_PORT_SERVICE_PME 1 /* Power Management Event */ -#define PCIE_PORT_SERVICE_AER 2 /* Advanced Error Reporting */ -#define PCIE_PORT_SERVICE_HP 4 /* Native Hotplug */ -#define PCIE_PORT_SERVICE_VC 8 /* Virtual Channel */ +#define PCIE_PORT_SERVICE_PME_SHIFT 0 /* Power Management Event */ +#define PCIE_PORT_SERVICE_PME (1 << PCIE_PORT_SERVICE_PME_SHIFT) +#define PCIE_PORT_SERVICE_AER_SHIFT 1 /* Advanced Error Reporting */ +#define PCIE_PORT_SERVICE_AER (1 << PCIE_PORT_SERVICE_AER_SHIFT) +#define PCIE_PORT_SERVICE_HP_SHIFT 2 /* Native Hotplug */ +#define PCIE_PORT_SERVICE_HP (1 << PCIE_PORT_SERVICE_HP_SHIFT) +#define PCIE_PORT_SERVICE_VC_SHIFT 3 /* Virtual Channel */ +#define PCIE_PORT_SERVICE_VC (1 << PCIE_PORT_SERVICE_VC_SHIFT) /* Root/Upstream/Downstream Port's Interrupt Mode */ #define PCIE_PORT_NO_IRQ (-1) From 11df1f05514beaf0269484191007dbc8d47e0e6f Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Mon, 19 Jan 2009 11:31:00 +1100 Subject: [PATCH 078/478] PCI/MSI: Use #ifdefs instead of weak functions Weak functions aren't all they're cracked up to be. They lead to incorrect binaries with some toolchains, they require us to have empty functions we otherwise wouldn't, and the unused code is not elided (as of gcc 4.3.2 anyway). So replace the weak MSI arch hooks with the #define foo foo idiom. We no longer need empty versions of arch_setup/teardown_msi_irq(). This is less source (by 1 line!), and results in smaller binaries too: text data bss dec hex filename 9354300 1693916 678424 11726640 b2ef30 build/powerpc/vmlinux-before 9354052 1693852 678424 11726328 b2edf8 build/powerpc/vmlinux-after Also smaller on x86_64 and arm (iop13xx). Signed-off-by: Michael Ellerman Signed-off-by: Jesse Barnes --- arch/powerpc/include/asm/pci.h | 4 ++++ arch/x86/include/asm/pci.h | 3 +++ drivers/pci/msi.c | 26 +++++++++----------------- 3 files changed, 16 insertions(+), 17 deletions(-) diff --git a/arch/powerpc/include/asm/pci.h b/arch/powerpc/include/asm/pci.h index 3548159a1beb..ba17d5d90a49 100644 --- a/arch/powerpc/include/asm/pci.h +++ b/arch/powerpc/include/asm/pci.h @@ -114,6 +114,10 @@ extern int pci_domain_nr(struct pci_bus *bus); /* Decide whether to display the domain number in /proc */ extern int pci_proc_domain(struct pci_bus *bus); +/* MSI arch hooks */ +#define arch_setup_msi_irqs arch_setup_msi_irqs +#define arch_teardown_msi_irqs arch_teardown_msi_irqs +#define arch_msi_check_device arch_msi_check_device struct vm_area_struct; /* Map a range of PCI memory or I/O space for a device into user space */ diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h index a977de23cb4d..a0301bfeb954 100644 --- a/arch/x86/include/asm/pci.h +++ b/arch/x86/include/asm/pci.h @@ -86,6 +86,9 @@ static inline void early_quirks(void) { } extern void pci_iommu_alloc(void); +/* MSI arch hook */ +#define arch_setup_msi_irqs arch_setup_msi_irqs + #endif /* __KERNEL__ */ #ifdef CONFIG_X86_32 diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c index 08aedd5875b0..33adf323f064 100644 --- a/drivers/pci/msi.c +++ b/drivers/pci/msi.c @@ -27,20 +27,15 @@ static int pci_msi_enable = 1; /* Arch hooks */ -int __attribute__ ((weak)) -arch_msi_check_device(struct pci_dev *dev, int nvec, int type) +#ifndef arch_msi_check_device +int arch_msi_check_device(struct pci_dev *dev, int nvec, int type) { return 0; } +#endif -int __attribute__ ((weak)) -arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *entry) -{ - return 0; -} - -int __attribute__ ((weak)) -arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) +#ifndef arch_setup_msi_irqs +int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) { struct msi_desc *entry; int ret; @@ -53,14 +48,10 @@ arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) return 0; } +#endif -void __attribute__ ((weak)) arch_teardown_msi_irq(unsigned int irq) -{ - return; -} - -void __attribute__ ((weak)) -arch_teardown_msi_irqs(struct pci_dev *dev) +#ifndef arch_teardown_msi_irqs +void arch_teardown_msi_irqs(struct pci_dev *dev) { struct msi_desc *entry; @@ -69,6 +60,7 @@ arch_teardown_msi_irqs(struct pci_dev *dev) arch_teardown_msi_irq(entry->irq); } } +#endif static void __msi_set_enable(struct pci_dev *dev, int pos, int enable) { From 2b56313448bb8efad3af19f211d988c8352ac04d Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Wed, 28 Jan 2009 18:27:21 +0800 Subject: [PATCH 079/478] PCI: check if a bus is added when removing it When removing a bus, 'is_added' should be checked to make sure the bus has been successfully added by pci_bus_add_child() who will sets 'is_added'. Signed-off-by: Yu Zhao Signed-off-by: Jesse Barnes --- drivers/pci/remove.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/pci/remove.c b/drivers/pci/remove.c index 042e08924421..caf8e1eae45e 100644 --- a/drivers/pci/remove.c +++ b/drivers/pci/remove.c @@ -71,6 +71,9 @@ void pci_remove_bus(struct pci_bus *pci_bus) down_write(&pci_bus_sem); list_del(&pci_bus->node); up_write(&pci_bus_sem); + if (!pci_bus->is_added) + return; + pci_remove_legacy_files(pci_bus); device_remove_file(&pci_bus->dev, &dev_attr_cpuaffinity); device_remove_file(&pci_bus->dev, &dev_attr_cpulistaffinity); From 1c35b8e538cb6259accb215099cdb673310cad84 Mon Sep 17 00:00:00 2001 From: Frank Seidel Date: Fri, 6 Feb 2009 10:23:36 +0100 Subject: [PATCH 080/478] PCI: add missing KERN_* constants to printks According to kerneljanitors todo list all printk calls (beginning a new line) should have an according KERN_* constant. Those are the missing pieces here for the pci subsystem. Signed-off-by: Frank Seidel Reviewed-by: Kenji Kaneshige Tested-by: Kenji Kaneshige Signed-off-by: Jesse Barnes --- drivers/pci/hotplug/pciehp.h | 10 +++++----- drivers/pci/hotplug/shpchp.h | 10 +++++----- drivers/pci/intel-iommu.c | 2 +- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/drivers/pci/hotplug/pciehp.h b/drivers/pci/hotplug/pciehp.h index 39ae37589fda..1f887f64e493 100644 --- a/drivers/pci/hotplug/pciehp.h +++ b/drivers/pci/hotplug/pciehp.h @@ -46,10 +46,10 @@ extern int pciehp_force; extern struct workqueue_struct *pciehp_wq; #define dbg(format, arg...) \ - do { \ - if (pciehp_debug) \ - printk("%s: " format, MY_NAME , ## arg); \ - } while (0) +do { \ + if (pciehp_debug) \ + printk(KERN_DEBUG "%s: " format, MY_NAME , ## arg); \ +} while (0) #define err(format, arg...) \ printk(KERN_ERR "%s: " format, MY_NAME , ## arg) #define info(format, arg...) \ @@ -60,7 +60,7 @@ extern struct workqueue_struct *pciehp_wq; #define ctrl_dbg(ctrl, format, arg...) \ do { \ if (pciehp_debug) \ - dev_printk(, &ctrl->pcie->device, \ + dev_printk(KERN_DEBUG, &ctrl->pcie->device, \ format, ## arg); \ } while (0) #define ctrl_err(ctrl, format, arg...) \ diff --git a/drivers/pci/hotplug/shpchp.h b/drivers/pci/hotplug/shpchp.h index 6aba0b6cf2e0..974e924ca96d 100644 --- a/drivers/pci/hotplug/shpchp.h +++ b/drivers/pci/hotplug/shpchp.h @@ -48,10 +48,10 @@ extern int shpchp_debug; extern struct workqueue_struct *shpchp_wq; #define dbg(format, arg...) \ - do { \ - if (shpchp_debug) \ - printk("%s: " format, MY_NAME , ## arg); \ - } while (0) +do { \ + if (shpchp_debug) \ + printk(KERN_DEBUG "%s: " format, MY_NAME , ## arg); \ +} while (0) #define err(format, arg...) \ printk(KERN_ERR "%s: " format, MY_NAME , ## arg) #define info(format, arg...) \ @@ -62,7 +62,7 @@ extern struct workqueue_struct *shpchp_wq; #define ctrl_dbg(ctrl, format, arg...) \ do { \ if (shpchp_debug) \ - dev_printk(, &ctrl->pci_dev->dev, \ + dev_printk(KERN_DEBUG, &ctrl->pci_dev->dev, \ format, ## arg); \ } while (0) #define ctrl_err(ctrl, format, arg...) \ diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c index f3f686581a90..b548937d4746 100644 --- a/drivers/pci/intel-iommu.c +++ b/drivers/pci/intel-iommu.c @@ -1970,7 +1970,7 @@ static inline void iommu_prepare_isa(void) ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024); if (ret) - printk("IOMMU: Failed to create 0-64M identity map, " + printk(KERN_ERR "IOMMU: Failed to create 0-64M identity map, " "floppy might not work\n"); } From 0b3e7388e3b438500aaa0630879ce536747a47ca Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Sun, 8 Feb 2009 22:45:24 +0100 Subject: [PATCH 081/478] PCI: introduce missing kfree Error handling code following a kmalloc should free the allocated data. Since the subsequent code that could provoke an error does not use the allocated data, the allocation is just moved below it. The semantic match that finds the problem is as follows: (http://www.emn.fr/x-info/coccinelle/) // @r exists@ local idexpression x; statement S; expression E; identifier f,l; position p1,p2; expression *ptr != NULL; @@ ( if ((x@p1 = \(kmalloc\|kzalloc\|kcalloc\)(...)) == NULL) S | x@p1 = \(kmalloc\|kzalloc\|kcalloc\)(...); ... if (x == NULL) S ) <... when != x when != if (...) { <+...x...+> } x->f = E ...> ( return \(0\|<+...x...+>\|ptr\); | return@p2 ...; ) @script:python@ p1 << r.p1; p2 << r.p2; @@ print "* file: %s kmalloc %s return %s" % (p1[0].file,p1[0].line,p2[0].line) // Signed-off-by: Julia Lawall Reviewed-by: Matthew Wilcox Reviewed-by: Kenji Kaneshige Signed-off-by: Jesse Barnes --- drivers/pci/hotplug/pciehp_acpi.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/pci/hotplug/pciehp_acpi.c b/drivers/pci/hotplug/pciehp_acpi.c index 21734c311529..96048010e7d9 100644 --- a/drivers/pci/hotplug/pciehp_acpi.c +++ b/drivers/pci/hotplug/pciehp_acpi.c @@ -79,14 +79,15 @@ static int __init dummy_probe(struct pcie_device *dev) struct slot *slot, *tmp; struct pci_dev *pdev = dev->port; struct pci_bus *pbus = pdev->subordinate; - if (!(slot = kzalloc(sizeof(*slot), GFP_KERNEL))) - return -ENOMEM; /* Note: pciehp_detect_mode != PCIEHP_DETECT_ACPI here */ if (pciehp_get_hp_hw_control_from_firmware(pdev)) return -ENODEV; if (!(pos = pci_find_capability(pdev, PCI_CAP_ID_EXP))) return -ENODEV; pci_read_config_dword(pdev, pos + PCI_EXP_SLTCAP, &slot_cap); + slot = kzalloc(sizeof(*slot), GFP_KERNEL); + if (!slot) + return -ENOMEM; slot->number = slot_cap >> 19; list_for_each_entry(tmp, &dummy_slots, slot_list) { if (tmp->number == slot->number) From 81b840cd27e3ee9af67b6e05a4847868f74fce69 Mon Sep 17 00:00:00 2001 From: Kenji Kaneshige Date: Tue, 3 Feb 2009 15:06:13 +0900 Subject: [PATCH 082/478] PCI: pciehp: fix possible endless loop in pcie_isr Fix possible endless loop in pcie_isr. Currently, pcie_isr() (interrupt service routine of pciehp) can end up in an endless loop if the Slot Status register is set again immediately after being cleared. According to the past discussion (see below URL) this case can happen if the power fault detected bit is set during handling. http://sourceforge.net/mailarchive/message.php?msg_id=20051130135409.A14918%40unix-os.sc.intel.com Signed-off-by: Kenji Kaneshige Signed-off-by: Jesse Barnes --- drivers/pci/hotplug/pciehp_hpc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/pci/hotplug/pciehp_hpc.c b/drivers/pci/hotplug/pciehp_hpc.c index 7a16c6897bb9..c1a312f24060 100644 --- a/drivers/pci/hotplug/pciehp_hpc.c +++ b/drivers/pci/hotplug/pciehp_hpc.c @@ -672,10 +672,11 @@ static irqreturn_t pcie_isr(int irq, void *dev_id) detected &= (PCI_EXP_SLTSTA_ABP | PCI_EXP_SLTSTA_PFD | PCI_EXP_SLTSTA_MRLSC | PCI_EXP_SLTSTA_PDC | PCI_EXP_SLTSTA_CC); + detected &= ~intr_loc; intr_loc |= detected; if (!intr_loc) return IRQ_NONE; - if (detected && pciehp_writew(ctrl, PCI_EXP_SLTSTA, detected)) { + if (detected && pciehp_writew(ctrl, PCI_EXP_SLTSTA, intr_loc)) { ctrl_err(ctrl, "%s: Cannot write to SLOTSTATUS\n", __func__); return IRQ_NONE; From 99f0169c17f334a11b0ace91188501c612f3e1e6 Mon Sep 17 00:00:00 2001 From: Kenji Kaneshige Date: Tue, 3 Feb 2009 15:06:16 +0900 Subject: [PATCH 083/478] PCI: pciehp: enable software notification on empty slots Current pciehp disables software notification of adapter presence changed event and MRL changed event when slot is turned off. Because of this, there is no way to detect those events on empty slots in the current pciehp implementation. According to the past discussion(*), this behavior was introduced to prevent endless loop that could happen if pcie_isr() runs after power fault is detected on a certain platform whose stickey power-fault bit remains on till the slot is powered on again. (*) http://sourceforge.net/mailarchive/message.php?msg_id=20051130135409.A14918%40unix-os.sc.intel.com I think this endless loop can be avoided using one bit flag that indicates power fault had been detected, instead of disabling software notification of adapter present changed event and MRL changed event. With this patch, we can enable software notification mechanism of presence changed and MRL changed event on the empty slots again. Signed-off-by: Kenji Kaneshige Signed-off-by: Jesse Barnes --- drivers/pci/hotplug/pciehp.h | 1 + drivers/pci/hotplug/pciehp_hpc.c | 31 +++++++++++-------------------- 2 files changed, 12 insertions(+), 20 deletions(-) diff --git a/drivers/pci/hotplug/pciehp.h b/drivers/pci/hotplug/pciehp.h index 1f887f64e493..2bf8d28062e8 100644 --- a/drivers/pci/hotplug/pciehp.h +++ b/drivers/pci/hotplug/pciehp.h @@ -112,6 +112,7 @@ struct controller { unsigned int no_cmd_complete:1; unsigned int link_active_reporting:1; unsigned int notification_enabled:1; + unsigned int power_fault_detected; }; #define INT_BUTTON_IGNORE 0 diff --git a/drivers/pci/hotplug/pciehp_hpc.c b/drivers/pci/hotplug/pciehp_hpc.c index c1a312f24060..07bd32151146 100644 --- a/drivers/pci/hotplug/pciehp_hpc.c +++ b/drivers/pci/hotplug/pciehp_hpc.c @@ -548,23 +548,21 @@ static int hpc_power_on_slot(struct slot * slot) slot_cmd = POWER_ON; cmd_mask = PCI_EXP_SLTCTL_PCC; - /* Enable detection that we turned off at slot power-off time */ if (!pciehp_poll_mode) { - slot_cmd |= (PCI_EXP_SLTCTL_PFDE | PCI_EXP_SLTCTL_MRLSCE | - PCI_EXP_SLTCTL_PDCE); - cmd_mask |= (PCI_EXP_SLTCTL_PFDE | PCI_EXP_SLTCTL_MRLSCE | - PCI_EXP_SLTCTL_PDCE); + /* Enable power fault detection turned off at power off time */ + slot_cmd |= PCI_EXP_SLTCTL_PFDE; + cmd_mask |= PCI_EXP_SLTCTL_PFDE; } retval = pcie_write_cmd(ctrl, slot_cmd, cmd_mask); - if (retval) { ctrl_err(ctrl, "Write %x command failed!\n", slot_cmd); - return -1; + return retval; } ctrl_dbg(ctrl, "%s: SLOTCTRL %x write cmd %x\n", __func__, ctrl->cap_base + PCI_EXP_SLTCTL, slot_cmd); + ctrl->power_fault_detected = 0; return retval; } @@ -621,18 +619,10 @@ static int hpc_power_off_slot(struct slot * slot) slot_cmd = POWER_OFF; cmd_mask = PCI_EXP_SLTCTL_PCC; - /* - * If we get MRL or presence detect interrupts now, the isr - * will notice the sticky power-fault bit too and issue power - * indicator change commands. This will lead to an endless loop - * of command completions, since the power-fault bit remains on - * till the slot is powered on again. - */ if (!pciehp_poll_mode) { - slot_cmd &= ~(PCI_EXP_SLTCTL_PFDE | PCI_EXP_SLTCTL_MRLSCE | - PCI_EXP_SLTCTL_PDCE); - cmd_mask |= (PCI_EXP_SLTCTL_PFDE | PCI_EXP_SLTCTL_MRLSCE | - PCI_EXP_SLTCTL_PDCE); + /* Disable power fault detection */ + slot_cmd &= ~PCI_EXP_SLTCTL_PFDE; + cmd_mask |= PCI_EXP_SLTCTL_PFDE; } retval = pcie_write_cmd(ctrl, slot_cmd, cmd_mask); @@ -710,9 +700,10 @@ static irqreturn_t pcie_isr(int irq, void *dev_id) pciehp_handle_presence_change(p_slot); /* Check Power Fault Detected */ - if (intr_loc & PCI_EXP_SLTSTA_PFD) + if ((intr_loc & PCI_EXP_SLTSTA_PFD) && !ctrl->power_fault_detected) { + ctrl->power_fault_detected = 1; pciehp_handle_power_fault(p_slot); - + } return IRQ_HANDLED; } From 6a82e21823058eea95325005b79f3b8c9492460f Mon Sep 17 00:00:00 2001 From: Kenji Kaneshige Date: Tue, 3 Feb 2009 15:06:18 +0900 Subject: [PATCH 084/478] PCI: pciehp: make cmd_busy flag one bit The cmd_busy field in struct controller takes only two values 0 or 1. So it should be one bit. Signed-off-by: Kenji Kaneshige Signed-off-by: Jesse Barnes --- drivers/pci/hotplug/pciehp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pci/hotplug/pciehp.h b/drivers/pci/hotplug/pciehp.h index 2bf8d28062e8..0a368547e633 100644 --- a/drivers/pci/hotplug/pciehp.h +++ b/drivers/pci/hotplug/pciehp.h @@ -108,7 +108,7 @@ struct controller { u32 slot_cap; u8 cap_base; struct timer_list poll_timer; - int cmd_busy; + unsigned int cmd_busy:1; unsigned int no_cmd_complete:1; unsigned int link_active_reporting:1; unsigned int notification_enabled:1; From 62795041418dd63cd9ff6ff7bbdf1d1c513c189b Mon Sep 17 00:00:00 2001 From: Alex Chiang Date: Wed, 4 Feb 2009 11:25:22 -0700 Subject: [PATCH 085/478] PCI: enhance physical slot debug information Convert usages of pr_debug to dev_dbg and add physical slot name. Note that we use dev_dbg on the struct pci_bus and still manually print out the PCI slot number (instead of calling dev_dbg on a pci_dev) because a struct pci_bus with empty physical slots will not have any pci_devs. Reviewed-by: Andrew Patterson Signed-off-by: Alex Chiang Signed-off-by: Jesse Barnes --- drivers/pci/slot.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/drivers/pci/slot.c b/drivers/pci/slot.c index 5a8ccb4f604d..21189447e545 100644 --- a/drivers/pci/slot.c +++ b/drivers/pci/slot.c @@ -1,8 +1,8 @@ /* * drivers/pci/slot.c * Copyright (C) 2006 Matthew Wilcox - * Copyright (C) 2006-2008 Hewlett-Packard Development Company, L.P. - * Alex Chiang + * Copyright (C) 2006-2009 Hewlett-Packard Development Company, L.P. + * Alex Chiang */ #include @@ -52,8 +52,8 @@ static void pci_slot_release(struct kobject *kobj) struct pci_dev *dev; struct pci_slot *slot = to_pci_slot(kobj); - pr_debug("%s: releasing pci_slot on %x:%d\n", __func__, - slot->bus->number, slot->number); + dev_dbg(&slot->bus->dev, "dev %02x, released physical slot %s\n", + slot->number, pci_slot_name(slot)); list_for_each_entry(dev, &slot->bus->devices, bus_list) if (PCI_SLOT(dev->devfn) == slot->number) @@ -248,9 +248,8 @@ placeholder: if (PCI_SLOT(dev->devfn) == slot_nr) dev->slot = slot; - /* Don't care if debug printk has a -1 for slot_nr */ - pr_debug("%s: created pci_slot on %04x:%02x:%02x\n", - __func__, pci_domain_nr(parent), parent->number, slot_nr); + dev_dbg(&parent->dev, "dev %02x, created physical slot %s\n", + slot_nr, pci_slot_name(slot)); out: kfree(slot_name); @@ -299,9 +298,8 @@ EXPORT_SYMBOL_GPL(pci_renumber_slot); */ void pci_destroy_slot(struct pci_slot *slot) { - pr_debug("%s: dec refcount to %d on %04x:%02x:%02x\n", __func__, - atomic_read(&slot->kobj.kref.refcount) - 1, - pci_domain_nr(slot->bus), slot->bus->number, slot->number); + dev_dbg(&slot->bus->dev, "dev %02x, dec refcount to %d\n", + slot->number, atomic_read(&slot->kobj.kref.refcount) - 1); down_write(&pci_bus_sem); kobject_put(&slot->kobj); From 4c9c16867e4980fbd7d1fcc9516c9269ecb4d06f Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Mon, 8 Dec 2008 16:19:14 +0100 Subject: [PATCH 086/478] PCI quirk: don't mark one netmos as class other Let it stay as serial, since it doesn't have subdevice in the form of 0x00PS. Signed-off-by: Jiri Slaby Signed-off-by: Jesse Barnes --- drivers/pci/quirks.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c index 92b9efe9bcaf..5aa2afb23ef9 100644 --- a/drivers/pci/quirks.c +++ b/drivers/pci/quirks.c @@ -1664,9 +1664,13 @@ static void __devinit quirk_netmos(struct pci_dev *dev) * of parallel ports and is the number of serial ports. */ switch (dev->device) { + case PCI_DEVICE_ID_NETMOS_9835: + /* Well, this rule doesn't hold for the following 9835 device */ + if (dev->subsystem_vendor == PCI_VENDOR_ID_IBM && + dev->subsystem_device == 0x0299) + return; case PCI_DEVICE_ID_NETMOS_9735: case PCI_DEVICE_ID_NETMOS_9745: - case PCI_DEVICE_ID_NETMOS_9835: case PCI_DEVICE_ID_NETMOS_9845: case PCI_DEVICE_ID_NETMOS_9855: if ((dev->class >> 8) == PCI_CLASS_COMMUNICATION_SERIAL && From 5fe5db05f64d0d10b563b1c13b58e4a52b190686 Mon Sep 17 00:00:00 2001 From: Sheng Yang Date: Mon, 9 Feb 2009 14:53:47 +0800 Subject: [PATCH 087/478] PCI: Speed up device reset function For all devices need to do function level reset, currently we need wait for at least 200ms, which can be too long if we have lots of devices... The patch checked pending bit before msleep() to skip some unnecessary sleeping interval. Signed-off-by: Sheng Yang Signed-off-by: Jesse Barnes --- drivers/pci/pci.c | 50 +++++++++++++++++++++++++++++------------------ 1 file changed, 31 insertions(+), 19 deletions(-) diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index 5737b8a9a732..0b3e20f1b6f7 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -2028,18 +2028,24 @@ static int __pcie_flr(struct pci_dev *dev, int probe) pci_block_user_cfg_access(dev); /* Wait for Transaction Pending bit clean */ + pci_read_config_word(dev, exppos + PCI_EXP_DEVSTA, &status); + if (!(status & PCI_EXP_DEVSTA_TRPND)) + goto transaction_done; + msleep(100); pci_read_config_word(dev, exppos + PCI_EXP_DEVSTA, &status); - if (status & PCI_EXP_DEVSTA_TRPND) { - dev_info(&dev->dev, "Busy after 100ms while trying to reset; " - "sleeping for 1 second\n"); - ssleep(1); - pci_read_config_word(dev, exppos + PCI_EXP_DEVSTA, &status); - if (status & PCI_EXP_DEVSTA_TRPND) - dev_info(&dev->dev, "Still busy after 1s; " - "proceeding with reset anyway\n"); - } + if (!(status & PCI_EXP_DEVSTA_TRPND)) + goto transaction_done; + dev_info(&dev->dev, "Busy after 100ms while trying to reset; " + "sleeping for 1 second\n"); + ssleep(1); + pci_read_config_word(dev, exppos + PCI_EXP_DEVSTA, &status); + if (status & PCI_EXP_DEVSTA_TRPND) + dev_info(&dev->dev, "Still busy after 1s; " + "proceeding with reset anyway\n"); + +transaction_done: pci_write_config_word(dev, exppos + PCI_EXP_DEVCTL, PCI_EXP_DEVCTL_BCR_FLR); mdelay(100); @@ -2066,18 +2072,24 @@ static int __pci_af_flr(struct pci_dev *dev, int probe) pci_block_user_cfg_access(dev); /* Wait for Transaction Pending bit clean */ + pci_read_config_byte(dev, cappos + PCI_AF_STATUS, &status); + if (!(status & PCI_AF_STATUS_TP)) + goto transaction_done; + msleep(100); pci_read_config_byte(dev, cappos + PCI_AF_STATUS, &status); - if (status & PCI_AF_STATUS_TP) { - dev_info(&dev->dev, "Busy after 100ms while trying to" - " reset; sleeping for 1 second\n"); - ssleep(1); - pci_read_config_byte(dev, - cappos + PCI_AF_STATUS, &status); - if (status & PCI_AF_STATUS_TP) - dev_info(&dev->dev, "Still busy after 1s; " - "proceeding with reset anyway\n"); - } + if (!(status & PCI_AF_STATUS_TP)) + goto transaction_done; + + dev_info(&dev->dev, "Busy after 100ms while trying to" + " reset; sleeping for 1 second\n"); + ssleep(1); + pci_read_config_byte(dev, cappos + PCI_AF_STATUS, &status); + if (status & PCI_AF_STATUS_TP) + dev_info(&dev->dev, "Still busy after 1s; " + "proceeding with reset anyway\n"); + +transaction_done: pci_write_config_byte(dev, cappos + PCI_AF_CTRL, PCI_AF_CTRL_FLR); mdelay(100); From 63f10f0f6df4e4e860b790d64bebfde85b540b0a Mon Sep 17 00:00:00 2001 From: Kenji Kaneshige Date: Mon, 9 Feb 2009 15:59:29 +0900 Subject: [PATCH 088/478] PCI/ACPI: move _OSC code to pci_root.c Move PCI _OSC management code from drivers/pci/pci-acpi.c to drivers/acpi/pci_root.c. The benefits are - We no longer need struct osc_data and its management code (contents are moved to struct acpi_pci_root). This simplify the code, and we no longer care about kmalloc() failure. - We can make pci_acpi_osc_support() be a static function, which is called only from drivers/acpi/pci_root.c. Signed-off-by: Kenji Kaneshige Reviewed-by: Andrew Patterson Tested-by: Andrew Patterson Acked-by: Alex Chiang Signed-off-by: Jesse Barnes --- drivers/acpi/pci_root.c | 180 +++++++++++++++++++++++++++++++- drivers/pci/pci-acpi.c | 215 --------------------------------------- include/linux/pci-acpi.h | 1 - 3 files changed, 178 insertions(+), 218 deletions(-) diff --git a/drivers/acpi/pci_root.c b/drivers/acpi/pci_root.c index 5b38a026d122..979eccc82c5b 100644 --- a/drivers/acpi/pci_root.c +++ b/drivers/acpi/pci_root.c @@ -66,11 +66,18 @@ struct acpi_pci_root { struct acpi_device * device; struct acpi_pci_id id; struct pci_bus *bus; + + u32 osc_support_set; /* _OSC state of support bits */ + u32 osc_control_set; /* _OSC state of control bits */ + u32 osc_control_qry; /* the latest _OSC query result */ + + u32 osc_queried:1; /* has _OSC control been queried? */ }; static LIST_HEAD(acpi_pci_roots); static struct acpi_pci_driver *sub_driver; +static DEFINE_MUTEX(osc_lock); int acpi_pci_register_driver(struct acpi_pci_driver *driver) { @@ -185,6 +192,175 @@ static void acpi_pci_bridge_scan(struct acpi_device *device) } } +static u8 OSC_UUID[16] = {0x5B, 0x4D, 0xDB, 0x33, 0xF7, 0x1F, 0x1C, 0x40, + 0x96, 0x57, 0x74, 0x41, 0xC0, 0x3D, 0xD7, 0x66}; + +static acpi_status acpi_pci_run_osc(acpi_handle handle, + const u32 *capbuf, u32 *retval) +{ + acpi_status status; + struct acpi_object_list input; + union acpi_object in_params[4]; + struct acpi_buffer output = {ACPI_ALLOCATE_BUFFER, NULL}; + union acpi_object *out_obj; + u32 errors; + + /* Setting up input parameters */ + input.count = 4; + input.pointer = in_params; + in_params[0].type = ACPI_TYPE_BUFFER; + in_params[0].buffer.length = 16; + in_params[0].buffer.pointer = OSC_UUID; + in_params[1].type = ACPI_TYPE_INTEGER; + in_params[1].integer.value = 1; + in_params[2].type = ACPI_TYPE_INTEGER; + in_params[2].integer.value = 3; + in_params[3].type = ACPI_TYPE_BUFFER; + in_params[3].buffer.length = 12; + in_params[3].buffer.pointer = (u8 *)capbuf; + + status = acpi_evaluate_object(handle, "_OSC", &input, &output); + if (ACPI_FAILURE(status)) + return status; + + if (!output.length) + return AE_NULL_OBJECT; + + out_obj = output.pointer; + if (out_obj->type != ACPI_TYPE_BUFFER) { + printk(KERN_DEBUG "_OSC evaluation returned wrong type\n"); + status = AE_TYPE; + goto out_kfree; + } + /* Need to ignore the bit0 in result code */ + errors = *((u32 *)out_obj->buffer.pointer) & ~(1 << 0); + if (errors) { + if (errors & OSC_REQUEST_ERROR) + printk(KERN_DEBUG "_OSC request failed\n"); + if (errors & OSC_INVALID_UUID_ERROR) + printk(KERN_DEBUG "_OSC invalid UUID\n"); + if (errors & OSC_INVALID_REVISION_ERROR) + printk(KERN_DEBUG "_OSC invalid revision\n"); + if (errors & OSC_CAPABILITIES_MASK_ERROR) { + if (capbuf[OSC_QUERY_TYPE] & OSC_QUERY_ENABLE) + goto out_success; + printk(KERN_DEBUG + "Firmware did not grant requested _OSC control\n"); + status = AE_SUPPORT; + goto out_kfree; + } + status = AE_ERROR; + goto out_kfree; + } +out_success: + *retval = *((u32 *)(out_obj->buffer.pointer + 8)); + status = AE_OK; + +out_kfree: + kfree(output.pointer); + return status; +} + +static acpi_status acpi_pci_query_osc(struct acpi_pci_root *root, u32 flags) +{ + acpi_status status; + u32 support_set, result, capbuf[3]; + + /* do _OSC query for all possible controls */ + support_set = root->osc_support_set | (flags & OSC_SUPPORT_MASKS); + capbuf[OSC_QUERY_TYPE] = OSC_QUERY_ENABLE; + capbuf[OSC_SUPPORT_TYPE] = support_set; + capbuf[OSC_CONTROL_TYPE] = OSC_CONTROL_MASKS; + + status = acpi_pci_run_osc(root->device->handle, capbuf, &result); + if (ACPI_SUCCESS(status)) { + root->osc_support_set = support_set; + root->osc_control_qry = result; + root->osc_queried = 1; + } + return status; +} + +static acpi_status acpi_pci_osc_support(struct acpi_pci_root *root, u32 flags) +{ + acpi_status status; + acpi_handle tmp; + + status = acpi_get_handle(root->device->handle, "_OSC", &tmp); + if (ACPI_FAILURE(status)) + return status; + mutex_lock(&osc_lock); + status = acpi_pci_query_osc(root, flags); + mutex_unlock(&osc_lock); + return status; +} + +static struct acpi_pci_root *acpi_pci_find_root(acpi_handle handle) +{ + struct acpi_pci_root *root; + list_for_each_entry(root, &acpi_pci_roots, node) { + if (root->device->handle == handle) + return root; + } + return NULL; +} + +/** + * pci_osc_control_set - commit requested control to Firmware + * @handle: acpi_handle for the target ACPI object + * @flags: driver's requested control bits + * + * Attempt to take control from Firmware on requested control bits. + **/ +acpi_status pci_osc_control_set(acpi_handle handle, u32 flags) +{ + acpi_status status; + u32 control_req, result, capbuf[3]; + acpi_handle tmp; + struct acpi_pci_root *root; + + status = acpi_get_handle(handle, "_OSC", &tmp); + if (ACPI_FAILURE(status)) + return status; + + control_req = (flags & OSC_CONTROL_MASKS); + if (!control_req) + return AE_TYPE; + + root = acpi_pci_find_root(handle); + if (!root) + return AE_NOT_EXIST; + + mutex_lock(&osc_lock); + /* No need to evaluate _OSC if the control was already granted. */ + if ((root->osc_control_set & control_req) == control_req) + goto out; + + /* Need to query controls first before requesting them */ + if (!root->osc_queried) { + status = acpi_pci_query_osc(root, root->osc_support_set); + if (ACPI_FAILURE(status)) + goto out; + } + if ((root->osc_control_qry & control_req) != control_req) { + printk(KERN_DEBUG + "Firmware did not grant requested _OSC control\n"); + status = AE_SUPPORT; + goto out; + } + + capbuf[OSC_QUERY_TYPE] = 0; + capbuf[OSC_SUPPORT_TYPE] = root->osc_support_set; + capbuf[OSC_CONTROL_TYPE] = root->osc_control_set | control_req; + status = acpi_pci_run_osc(handle, capbuf, &result); + if (ACPI_SUCCESS(status)) + root->osc_control_set = result; +out: + mutex_unlock(&osc_lock); + return status; +} +EXPORT_SYMBOL(pci_osc_control_set); + static int __devinit acpi_pci_root_add(struct acpi_device *device) { int result = 0; @@ -217,7 +393,7 @@ static int __devinit acpi_pci_root_add(struct acpi_device *device) * PCI domains, so we indicate this in _OSC support capabilities. */ flags = base_flags = OSC_PCI_SEGMENT_GROUPS_SUPPORT; - pci_acpi_osc_support(device->handle, flags); + acpi_pci_osc_support(root, flags); /* * Segment @@ -353,7 +529,7 @@ static int __devinit acpi_pci_root_add(struct acpi_device *device) if (pci_msi_enabled()) flags |= OSC_MSI_SUPPORT; if (flags != base_flags) - pci_acpi_osc_support(device->handle, flags); + acpi_pci_osc_support(root, flags); end: if (result) { diff --git a/drivers/pci/pci-acpi.c b/drivers/pci/pci-acpi.c index deea8a187eb8..fac5eddcefd2 100644 --- a/drivers/pci/pci-acpi.c +++ b/drivers/pci/pci-acpi.c @@ -18,221 +18,6 @@ #include #include "pci.h" -struct acpi_osc_data { - acpi_handle handle; - u32 support_set; - u32 control_set; - u32 control_query; - int is_queried; - struct list_head sibiling; -}; -static LIST_HEAD(acpi_osc_data_list); - -struct acpi_osc_args { - u32 capbuf[3]; -}; - -static DEFINE_MUTEX(pci_acpi_lock); - -static struct acpi_osc_data *acpi_get_osc_data(acpi_handle handle) -{ - struct acpi_osc_data *data; - - list_for_each_entry(data, &acpi_osc_data_list, sibiling) { - if (data->handle == handle) - return data; - } - data = kzalloc(sizeof(*data), GFP_KERNEL); - if (!data) - return NULL; - INIT_LIST_HEAD(&data->sibiling); - data->handle = handle; - list_add_tail(&data->sibiling, &acpi_osc_data_list); - return data; -} - -static u8 OSC_UUID[16] = {0x5B, 0x4D, 0xDB, 0x33, 0xF7, 0x1F, 0x1C, 0x40, - 0x96, 0x57, 0x74, 0x41, 0xC0, 0x3D, 0xD7, 0x66}; - -static acpi_status acpi_run_osc(acpi_handle handle, - struct acpi_osc_args *osc_args, u32 *retval) -{ - acpi_status status; - struct acpi_object_list input; - union acpi_object in_params[4]; - struct acpi_buffer output = {ACPI_ALLOCATE_BUFFER, NULL}; - union acpi_object *out_obj; - u32 errors, flags = osc_args->capbuf[OSC_QUERY_TYPE]; - - /* Setting up input parameters */ - input.count = 4; - input.pointer = in_params; - in_params[0].type = ACPI_TYPE_BUFFER; - in_params[0].buffer.length = 16; - in_params[0].buffer.pointer = OSC_UUID; - in_params[1].type = ACPI_TYPE_INTEGER; - in_params[1].integer.value = 1; - in_params[2].type = ACPI_TYPE_INTEGER; - in_params[2].integer.value = 3; - in_params[3].type = ACPI_TYPE_BUFFER; - in_params[3].buffer.length = 12; - in_params[3].buffer.pointer = (u8 *)osc_args->capbuf; - - status = acpi_evaluate_object(handle, "_OSC", &input, &output); - if (ACPI_FAILURE(status)) - return status; - - if (!output.length) - return AE_NULL_OBJECT; - - out_obj = output.pointer; - if (out_obj->type != ACPI_TYPE_BUFFER) { - printk(KERN_DEBUG "Evaluate _OSC returns wrong type\n"); - status = AE_TYPE; - goto out_kfree; - } - /* Need to ignore the bit0 in result code */ - errors = *((u32 *)out_obj->buffer.pointer) & ~(1 << 0); - if (errors) { - if (errors & OSC_REQUEST_ERROR) - printk(KERN_DEBUG "_OSC request fails\n"); - if (errors & OSC_INVALID_UUID_ERROR) - printk(KERN_DEBUG "_OSC invalid UUID\n"); - if (errors & OSC_INVALID_REVISION_ERROR) - printk(KERN_DEBUG "_OSC invalid revision\n"); - if (errors & OSC_CAPABILITIES_MASK_ERROR) { - if (flags & OSC_QUERY_ENABLE) - goto out_success; - printk(KERN_DEBUG "_OSC FW not grant req. control\n"); - status = AE_SUPPORT; - goto out_kfree; - } - status = AE_ERROR; - goto out_kfree; - } -out_success: - *retval = *((u32 *)(out_obj->buffer.pointer + 8)); - status = AE_OK; - -out_kfree: - kfree(output.pointer); - return status; -} - -static acpi_status __acpi_query_osc(u32 flags, struct acpi_osc_data *osc_data) -{ - acpi_status status; - u32 support_set, result; - struct acpi_osc_args osc_args; - - /* do _OSC query for all possible controls */ - support_set = osc_data->support_set | (flags & OSC_SUPPORT_MASKS); - osc_args.capbuf[OSC_QUERY_TYPE] = OSC_QUERY_ENABLE; - osc_args.capbuf[OSC_SUPPORT_TYPE] = support_set; - osc_args.capbuf[OSC_CONTROL_TYPE] = OSC_CONTROL_MASKS; - - status = acpi_run_osc(osc_data->handle, &osc_args, &result); - if (ACPI_SUCCESS(status)) { - osc_data->support_set = support_set; - osc_data->control_query = result; - osc_data->is_queried = 1; - } - - return status; -} - -/* - * pci_acpi_osc_support: Invoke _OSC indicating support for the given feature - * @flags: Bitmask of flags to support - * - * See the ACPI spec for the definition of the flags - */ -int pci_acpi_osc_support(acpi_handle handle, u32 flags) -{ - acpi_status status; - acpi_handle tmp; - struct acpi_osc_data *osc_data; - int rc = 0; - - status = acpi_get_handle(handle, "_OSC", &tmp); - if (ACPI_FAILURE(status)) - return -ENOTTY; - - mutex_lock(&pci_acpi_lock); - osc_data = acpi_get_osc_data(handle); - if (!osc_data) { - printk(KERN_ERR "acpi osc data array is full\n"); - rc = -ENOMEM; - goto out; - } - - __acpi_query_osc(flags, osc_data); -out: - mutex_unlock(&pci_acpi_lock); - return rc; -} - -/** - * pci_osc_control_set - commit requested control to Firmware - * @handle: acpi_handle for the target ACPI object - * @flags: driver's requested control bits - * - * Attempt to take control from Firmware on requested control bits. - **/ -acpi_status pci_osc_control_set(acpi_handle handle, u32 flags) -{ - acpi_status status; - u32 control_req, control_set, result; - acpi_handle tmp; - struct acpi_osc_data *osc_data; - struct acpi_osc_args osc_args; - - status = acpi_get_handle(handle, "_OSC", &tmp); - if (ACPI_FAILURE(status)) - return status; - - mutex_lock(&pci_acpi_lock); - osc_data = acpi_get_osc_data(handle); - if (!osc_data) { - printk(KERN_ERR "acpi osc data array is full\n"); - status = AE_ERROR; - goto out; - } - - control_req = (flags & OSC_CONTROL_MASKS); - if (!control_req) { - status = AE_TYPE; - goto out; - } - - /* No need to evaluate _OSC if the control was already granted. */ - if ((osc_data->control_set & control_req) == control_req) - goto out; - - if (!osc_data->is_queried) { - status = __acpi_query_osc(osc_data->support_set, osc_data); - if (ACPI_FAILURE(status)) - goto out; - } - - if ((osc_data->control_query & control_req) != control_req) { - status = AE_SUPPORT; - goto out; - } - - control_set = osc_data->control_set | control_req; - osc_args.capbuf[OSC_QUERY_TYPE] = 0; - osc_args.capbuf[OSC_SUPPORT_TYPE] = osc_data->support_set; - osc_args.capbuf[OSC_CONTROL_TYPE] = control_set; - status = acpi_run_osc(handle, &osc_args, &result); - if (ACPI_SUCCESS(status)) - osc_data->control_set = result; -out: - mutex_unlock(&pci_acpi_lock); - return status; -} -EXPORT_SYMBOL(pci_osc_control_set); - /* * _SxD returns the D-state with the highest power * (lowest D-state number) supported in the S-state "x". diff --git a/include/linux/pci-acpi.h b/include/linux/pci-acpi.h index 042c166f65d5..65cb103b21db 100644 --- a/include/linux/pci-acpi.h +++ b/include/linux/pci-acpi.h @@ -50,7 +50,6 @@ #ifdef CONFIG_ACPI extern acpi_status pci_osc_control_set(acpi_handle handle, u32 flags); -int pci_acpi_osc_support(acpi_handle handle, u32 flags); static inline acpi_handle acpi_find_root_bridge_handle(struct pci_dev *pdev) { /* Find root host bridge */ From 9f5404d8ea90bfa4d58a3936e5a3d0d28cecf60f Mon Sep 17 00:00:00 2001 From: Kenji Kaneshige Date: Mon, 9 Feb 2009 16:00:04 +0900 Subject: [PATCH 089/478] PCI/ACPI: rename pci_osc_control_set() - Rename pci_osc_control_set() to acpi_pci_osc_control_set() according to the other API names in drivers/acpi/pci_root.c. - Move _OSC related definitions to include/linux/acpi.h because _OSC related API is implemented in drivers/acpi/pci_root.c now. Signed-off-by: Kenji Kaneshige Reviewed-by: Andrew Patterson Tested-by: Andrew Patterson Signed-off-by: Jesse Barnes --- drivers/acpi/pci_root.c | 6 ++-- drivers/pci/hotplug/acpi_pcihp.c | 5 ++-- drivers/pci/pcie/aer/aerdrv_acpi.c | 2 +- include/linux/acpi.h | 34 ++++++++++++++++++++++ include/linux/pci-acpi.h | 45 ------------------------------ 5 files changed, 40 insertions(+), 52 deletions(-) diff --git a/drivers/acpi/pci_root.c b/drivers/acpi/pci_root.c index 979eccc82c5b..196f97d00956 100644 --- a/drivers/acpi/pci_root.c +++ b/drivers/acpi/pci_root.c @@ -306,13 +306,13 @@ static struct acpi_pci_root *acpi_pci_find_root(acpi_handle handle) } /** - * pci_osc_control_set - commit requested control to Firmware + * acpi_pci_osc_control_set - commit requested control to Firmware * @handle: acpi_handle for the target ACPI object * @flags: driver's requested control bits * * Attempt to take control from Firmware on requested control bits. **/ -acpi_status pci_osc_control_set(acpi_handle handle, u32 flags) +acpi_status acpi_pci_osc_control_set(acpi_handle handle, u32 flags) { acpi_status status; u32 control_req, result, capbuf[3]; @@ -359,7 +359,7 @@ out: mutex_unlock(&osc_lock); return status; } -EXPORT_SYMBOL(pci_osc_control_set); +EXPORT_SYMBOL(acpi_pci_osc_control_set); static int __devinit acpi_pci_root_add(struct acpi_device *device) { diff --git a/drivers/pci/hotplug/acpi_pcihp.c b/drivers/pci/hotplug/acpi_pcihp.c index 1c1141801060..f47bc74be567 100644 --- a/drivers/pci/hotplug/acpi_pcihp.c +++ b/drivers/pci/hotplug/acpi_pcihp.c @@ -30,9 +30,8 @@ #include #include #include +#include #include -#include -#include #define MY_NAME "acpi_pcihp" @@ -408,7 +407,7 @@ int acpi_get_hp_hw_control_from_firmware(struct pci_dev *dev, u32 flags) acpi_get_name(handle, ACPI_FULL_PATHNAME, &string); dbg("Trying to get hotplug control for %s\n", (char *)string.pointer); - status = pci_osc_control_set(handle, flags); + status = acpi_pci_osc_control_set(handle, flags); if (ACPI_SUCCESS(status)) goto got_one; kfree(string.pointer); diff --git a/drivers/pci/pcie/aer/aerdrv_acpi.c b/drivers/pci/pcie/aer/aerdrv_acpi.c index ebce26c37049..8edb2f300e8f 100644 --- a/drivers/pci/pcie/aer/aerdrv_acpi.c +++ b/drivers/pci/pcie/aer/aerdrv_acpi.c @@ -38,7 +38,7 @@ int aer_osc_setup(struct pcie_device *pciedev) handle = acpi_find_root_bridge_handle(pdev); if (handle) { - status = pci_osc_control_set(handle, + status = acpi_pci_osc_control_set(handle, OSC_PCI_EXPRESS_AER_CONTROL | OSC_PCI_EXPRESS_CAP_STRUCTURE_CONTROL); } diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 6fce2fc2d124..2a3b189e3e26 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -256,6 +256,40 @@ void __init acpi_no_s4_hw_signature(void); void __init acpi_old_suspend_ordering(void); void __init acpi_s4_no_nvs(void); #endif /* CONFIG_PM_SLEEP */ + +#define OSC_QUERY_TYPE 0 +#define OSC_SUPPORT_TYPE 1 +#define OSC_CONTROL_TYPE 2 +#define OSC_SUPPORT_MASKS 0x1f + +/* _OSC DW0 Definition */ +#define OSC_QUERY_ENABLE 1 +#define OSC_REQUEST_ERROR 2 +#define OSC_INVALID_UUID_ERROR 4 +#define OSC_INVALID_REVISION_ERROR 8 +#define OSC_CAPABILITIES_MASK_ERROR 16 + +/* _OSC DW1 Definition (OS Support Fields) */ +#define OSC_EXT_PCI_CONFIG_SUPPORT 1 +#define OSC_ACTIVE_STATE_PWR_SUPPORT 2 +#define OSC_CLOCK_PWR_CAPABILITY_SUPPORT 4 +#define OSC_PCI_SEGMENT_GROUPS_SUPPORT 8 +#define OSC_MSI_SUPPORT 16 + +/* _OSC DW1 Definition (OS Control Fields) */ +#define OSC_PCI_EXPRESS_NATIVE_HP_CONTROL 1 +#define OSC_SHPC_NATIVE_HP_CONTROL 2 +#define OSC_PCI_EXPRESS_PME_CONTROL 4 +#define OSC_PCI_EXPRESS_AER_CONTROL 8 +#define OSC_PCI_EXPRESS_CAP_STRUCTURE_CONTROL 16 + +#define OSC_CONTROL_MASKS (OSC_PCI_EXPRESS_NATIVE_HP_CONTROL | \ + OSC_SHPC_NATIVE_HP_CONTROL | \ + OSC_PCI_EXPRESS_PME_CONTROL | \ + OSC_PCI_EXPRESS_AER_CONTROL | \ + OSC_PCI_EXPRESS_CAP_STRUCTURE_CONTROL) + +extern acpi_status acpi_pci_osc_control_set(acpi_handle handle, u32 flags); #else /* CONFIG_ACPI */ static inline int early_acpi_boot_init(void) diff --git a/include/linux/pci-acpi.h b/include/linux/pci-acpi.h index 65cb103b21db..20480b9f10c8 100644 --- a/include/linux/pci-acpi.h +++ b/include/linux/pci-acpi.h @@ -10,46 +10,7 @@ #include -#define OSC_QUERY_TYPE 0 -#define OSC_SUPPORT_TYPE 1 -#define OSC_CONTROL_TYPE 2 -#define OSC_SUPPORT_MASKS 0x1f - -/* - * _OSC DW0 Definition - */ -#define OSC_QUERY_ENABLE 1 -#define OSC_REQUEST_ERROR 2 -#define OSC_INVALID_UUID_ERROR 4 -#define OSC_INVALID_REVISION_ERROR 8 -#define OSC_CAPABILITIES_MASK_ERROR 16 - -/* - * _OSC DW1 Definition (OS Support Fields) - */ -#define OSC_EXT_PCI_CONFIG_SUPPORT 1 -#define OSC_ACTIVE_STATE_PWR_SUPPORT 2 -#define OSC_CLOCK_PWR_CAPABILITY_SUPPORT 4 -#define OSC_PCI_SEGMENT_GROUPS_SUPPORT 8 -#define OSC_MSI_SUPPORT 16 - -/* - * _OSC DW1 Definition (OS Control Fields) - */ -#define OSC_PCI_EXPRESS_NATIVE_HP_CONTROL 1 -#define OSC_SHPC_NATIVE_HP_CONTROL 2 -#define OSC_PCI_EXPRESS_PME_CONTROL 4 -#define OSC_PCI_EXPRESS_AER_CONTROL 8 -#define OSC_PCI_EXPRESS_CAP_STRUCTURE_CONTROL 16 - -#define OSC_CONTROL_MASKS (OSC_PCI_EXPRESS_NATIVE_HP_CONTROL | \ - OSC_SHPC_NATIVE_HP_CONTROL | \ - OSC_PCI_EXPRESS_PME_CONTROL | \ - OSC_PCI_EXPRESS_AER_CONTROL | \ - OSC_PCI_EXPRESS_CAP_STRUCTURE_CONTROL) - #ifdef CONFIG_ACPI -extern acpi_status pci_osc_control_set(acpi_handle handle, u32 flags); static inline acpi_handle acpi_find_root_bridge_handle(struct pci_dev *pdev) { /* Find root host bridge */ @@ -69,12 +30,6 @@ static inline acpi_handle acpi_pci_get_bridge_handle(struct pci_bus *pbus) return acpi_get_pci_rootbridge_handle(seg, busnr); } #else -#if !defined(AE_ERROR) -typedef u32 acpi_status; -#define AE_ERROR (acpi_status) (0x0001) -#endif -static inline acpi_status pci_osc_control_set(acpi_handle handle, u32 flags) -{return AE_ERROR;} static inline acpi_handle acpi_find_root_bridge_handle(struct pci_dev *pdev) { return NULL; } #endif From 35e1801ea637810830e653ffe7ff62c7048ae03a Mon Sep 17 00:00:00 2001 From: Roel Kluin Date: Wed, 11 Feb 2009 21:13:45 +0100 Subject: [PATCH 090/478] PCI hotplug: shpchp: fix bus number check to avoid false positive With for (busnr = 0; busnr <= end; busnr++) { ... } busnr reaches end + 1 after the loop. So fix the "no busses available" check to look for just busnr > end rather than >=. Signed-off-by: Roel Kluin Signed-off-by: Jesse Barnes --- drivers/pci/hotplug/shpchp_pci.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pci/hotplug/shpchp_pci.c b/drivers/pci/hotplug/shpchp_pci.c index 138f161becc0..aa315e52529b 100644 --- a/drivers/pci/hotplug/shpchp_pci.c +++ b/drivers/pci/hotplug/shpchp_pci.c @@ -137,7 +137,7 @@ int __ref shpchp_configure_device(struct slot *p_slot) busnr)) break; } - if (busnr >= end) { + if (busnr > end) { ctrl_err(ctrl, "No free bus for hot-added bridge\n"); pci_dev_put(dev); From b5fbf53324f65646154e172af350674d5a2a1629 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Wed, 11 Feb 2009 22:27:02 +1100 Subject: [PATCH 091/478] PCI/MSI: Allow arch code to return the number of MSI-X available There is code in msix_capability_init() which, when the requested number of MSI-X couldn't be allocated, calculates how many MSI-X /could/ be allocated and returns that to the driver. That allows the driver to then make a second request, with a number of MSIs that should succeed. The current code requires the arch code to setup as many msi_descs as it can, and then return to the generic code. On some platforms the arch code may already know how many MSI-X it can allocate, before it sets up any of the msi_descs. So change the logic such that if the arch code returns a positive error code, that is taken to be the number of MSI-X that could be allocated. If the error code is negative we still calculate the number available using the old method. Because it's a little subtle, make sure the error return code from arch_setup_msi_irq() is always negative. That way only implementations of arch_setup_msi_irqs() need to be careful about returning a positive error code. Signed-off-by: Michael Ellerman Signed-off-by: Jesse Barnes --- drivers/pci/msi.c | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c index 33adf323f064..dceea56f7342 100644 --- a/drivers/pci/msi.c +++ b/drivers/pci/msi.c @@ -42,8 +42,10 @@ int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) list_for_each_entry(entry, &dev->msi_list, list) { ret = arch_setup_msi_irq(dev, entry); - if (ret) + if (ret < 0) return ret; + if (ret > 0) + return -ENOSPC; } return 0; @@ -487,7 +489,9 @@ static int msix_capability_init(struct pci_dev *dev, } ret = arch_setup_msi_irqs(dev, nvec, PCI_CAP_ID_MSIX); - if (ret) { + if (ret < 0) { + /* If we had some success report the number of irqs + * we succeeded in setting up. */ int avail = 0; list_for_each_entry(entry, &dev->msi_list, list) { if (entry->irq != 0) { @@ -495,14 +499,13 @@ static int msix_capability_init(struct pci_dev *dev, } } - msi_free_irqs(dev); + if (avail != 0) + ret = avail; + } - /* If we had some success report the number of irqs - * we succeeded in setting up. - */ - if (avail == 0) - avail = ret; - return avail; + if (ret) { + msi_free_irqs(dev); + return ret; } i = 0; From c48f1670f42b71f39f4a3bfba01ffb691cc9206c Mon Sep 17 00:00:00 2001 From: "akpm@linux-foundation.org" Date: Tue, 3 Feb 2009 15:45:26 -0800 Subject: [PATCH 092/478] PCI: constify pci_bus_add_devices() drivers/pci/hotplug/fakephp.c:283: warning: passing argument 1 of 'pci_bus_add_devices' discards qualifiers from pointer target type Signed-off-by: Andrew Morton Signed-off-by: Jesse Barnes --- drivers/pci/bus.c | 2 +- include/linux/pci.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/pci/bus.c b/drivers/pci/bus.c index 52b54f053be0..118c77778d29 100644 --- a/drivers/pci/bus.c +++ b/drivers/pci/bus.c @@ -133,7 +133,7 @@ int pci_bus_add_child(struct pci_bus *bus) * * Call hotplug for each new devices. */ -void pci_bus_add_devices(struct pci_bus *bus) +void pci_bus_add_devices(const struct pci_bus *bus) { struct pci_dev *dev; struct pci_bus *child; diff --git a/include/linux/pci.h b/include/linux/pci.h index b5d6d0e0f1cb..a1af2fe00639 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -528,7 +528,7 @@ void pcibios_update_irq(struct pci_dev *, int irq); /* Generic PCI functions used internally */ extern struct pci_bus *pci_find_bus(int domain, int busnr); -void pci_bus_add_devices(struct pci_bus *bus); +void pci_bus_add_devices(const struct pci_bus *bus); struct pci_bus *pci_scan_bus_parented(struct device *parent, int bus, struct pci_ops *ops, void *sysdata); static inline struct pci_bus * __devinit pci_scan_bus(int bus, struct pci_ops *ops, From ea7415512a07add2b09c070c9a5d1950833cf9b3 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 18 Feb 2009 10:44:29 -0800 Subject: [PATCH 093/478] PCI: constify pci_bus_assign_resources() drivers/pci/hotplug/fakephp.c: In function 'pci_rescan_bus': drivers/pci/hotplug/fakephp.c:271: warning: passing argument 1 of 'pci_bus_assign_resources' discards qualifiers from pointer target type Signed-off-by: Andrew Morton Signed-off-by: Jesse Barnes --- drivers/pci/setup-bus.c | 4 ++-- include/linux/pci.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c index 704608945780..170a3eda9dd3 100644 --- a/drivers/pci/setup-bus.c +++ b/drivers/pci/setup-bus.c @@ -27,7 +27,7 @@ #include -static void pbus_assign_resources_sorted(struct pci_bus *bus) +static void pbus_assign_resources_sorted(const struct pci_bus *bus) { struct pci_dev *dev; struct resource *res; @@ -495,7 +495,7 @@ void __ref pci_bus_size_bridges(struct pci_bus *bus) } EXPORT_SYMBOL(pci_bus_size_bridges); -void __ref pci_bus_assign_resources(struct pci_bus *bus) +void __ref pci_bus_assign_resources(const struct pci_bus *bus) { struct pci_bus *b; struct pci_dev *dev; diff --git a/include/linux/pci.h b/include/linux/pci.h index a1af2fe00639..7baf2a5db12a 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -708,7 +708,7 @@ ssize_t pci_write_vpd(struct pci_dev *dev, loff_t pos, size_t count, const void int pci_vpd_truncate(struct pci_dev *dev, size_t size); /* Helper functions for low-level code (drivers/pci/setup-[bus,res].c) */ -void pci_bus_assign_resources(struct pci_bus *bus); +void pci_bus_assign_resources(const struct pci_bus *bus); void pci_bus_size_bridges(struct pci_bus *bus); int pci_claim_resource(struct pci_dev *, int); void pci_assign_unassigned_resources(void); From 10a0ef39fbd1d484c2bbc1ffd83d57ecef209140 Mon Sep 17 00:00:00 2001 From: Ivan Kokshaysky Date: Tue, 17 Feb 2009 13:46:53 +0300 Subject: [PATCH 094/478] PCI/alpha: pci sysfs resources This closes http://bugzilla.kernel.org/show_bug.cgi?id=10893 which is a showstopper for X development on alpha. The generic HAVE_PCI_MMAP code (drivers/pci-sysfs.c) is not very useful since we have to deal with three different types of MMIO address spaces: sparse and dense mappings for old ev4/ev5 machines and "normal" 1:1 MMIO space (bwx) for ev56 and later. Also "write combine" mappings are meaningless on alpha - roughly speaking, alpha does write combining, IO reordering and other optimizations by default, unless user splits IO accesses with memory barriers. I think the cleanest way to deal with resource files on alpha is to convert the default no-op pci_create_resource_files() and pci_remove_resource_files() for !HAVE_PCI_MMAP case into __weak functions and override them with alpha specific ones. Another alpha hook is needed for "legacy_" resource files to handle sparse addressing (pci_adjust_legacy_attr). With the "standard" resourceN files on ev56/ev6 libpciaccess works "out of the box". Handling of resourceN_sparse/resourceN_dense files on older machines obviously requires some userland work. Sparse/dense stuff has been tested on sx164 (pca56/pyxis, normally uses bwx IO) with the kernel hacked into "cia compatible" mode. Signed-off-by: Ivan Kokshaysky Signed-off-by: Jesse Barnes --- arch/alpha/include/asm/pci.h | 14 ++ arch/alpha/kernel/Makefile | 2 +- arch/alpha/kernel/pci-sysfs.c | 366 ++++++++++++++++++++++++++++++++++ drivers/pci/pci-sysfs.c | 19 +- 4 files changed, 398 insertions(+), 3 deletions(-) create mode 100644 arch/alpha/kernel/pci-sysfs.c diff --git a/arch/alpha/include/asm/pci.h b/arch/alpha/include/asm/pci.h index 2a14302c17a3..cb04eaa6ba33 100644 --- a/arch/alpha/include/asm/pci.h +++ b/arch/alpha/include/asm/pci.h @@ -273,4 +273,18 @@ struct pci_dev *alpha_gendev_to_pci(struct device *dev); extern struct pci_dev *isa_bridge; +extern int pci_legacy_read(struct pci_bus *bus, loff_t port, u32 *val, + size_t count); +extern int pci_legacy_write(struct pci_bus *bus, loff_t port, u32 val, + size_t count); +extern int pci_mmap_legacy_page_range(struct pci_bus *bus, + struct vm_area_struct *vma, + enum pci_mmap_state mmap_state); +extern void pci_adjust_legacy_attr(struct pci_bus *bus, + enum pci_mmap_state mmap_type); +#define HAVE_PCI_LEGACY 1 + +extern int pci_create_resource_files(struct pci_dev *dev); +extern void pci_remove_resource_files(struct pci_dev *dev); + #endif /* __ALPHA_PCI_H */ diff --git a/arch/alpha/kernel/Makefile b/arch/alpha/kernel/Makefile index b4697759a123..a427538252f8 100644 --- a/arch/alpha/kernel/Makefile +++ b/arch/alpha/kernel/Makefile @@ -12,7 +12,7 @@ obj-y := entry.o traps.o process.o init_task.o osf_sys.o irq.o \ obj-$(CONFIG_VGA_HOSE) += console.o obj-$(CONFIG_SMP) += smp.o -obj-$(CONFIG_PCI) += pci.o pci_iommu.o +obj-$(CONFIG_PCI) += pci.o pci_iommu.o pci-sysfs.o obj-$(CONFIG_SRM_ENV) += srm_env.o obj-$(CONFIG_MODULES) += module.o diff --git a/arch/alpha/kernel/pci-sysfs.c b/arch/alpha/kernel/pci-sysfs.c new file mode 100644 index 000000000000..6ea822e7f724 --- /dev/null +++ b/arch/alpha/kernel/pci-sysfs.c @@ -0,0 +1,366 @@ +/* + * arch/alpha/kernel/pci-sysfs.c + * + * Copyright (C) 2009 Ivan Kokshaysky + * + * Alpha PCI resource files. + * + * Loosely based on generic HAVE_PCI_MMAP implementation in + * drivers/pci/pci-sysfs.c + */ + +#include +#include + +static int hose_mmap_page_range(struct pci_controller *hose, + struct vm_area_struct *vma, + enum pci_mmap_state mmap_type, int sparse) +{ + unsigned long base; + + if (mmap_type == pci_mmap_mem) + base = sparse ? hose->sparse_mem_base : hose->dense_mem_base; + else + base = sparse ? hose->sparse_io_base : hose->dense_io_base; + + vma->vm_pgoff += base >> PAGE_SHIFT; + vma->vm_flags |= (VM_IO | VM_RESERVED); + + return io_remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff, + vma->vm_end - vma->vm_start, + vma->vm_page_prot); +} + +static int __pci_mmap_fits(struct pci_dev *pdev, int num, + struct vm_area_struct *vma, int sparse) +{ + unsigned long nr, start, size; + int shift = sparse ? 5 : 0; + + nr = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; + start = vma->vm_pgoff; + size = ((pci_resource_len(pdev, num) - 1) >> (PAGE_SHIFT - shift)) + 1; + + if (start < size && size - start >= nr) + return 1; + WARN(1, "process \"%s\" tried to map%s 0x%08lx-0x%08lx on %s BAR %d " + "(size 0x%08lx)\n", + current->comm, sparse ? " sparse" : "", start, start + nr, + pci_name(pdev), num, size); + return 0; +} + +/** + * pci_mmap_resource - map a PCI resource into user memory space + * @kobj: kobject for mapping + * @attr: struct bin_attribute for the file being mapped + * @vma: struct vm_area_struct passed into the mmap + * @sparse: address space type + * + * Use the bus mapping routines to map a PCI resource into userspace. + */ +static int pci_mmap_resource(struct kobject *kobj, struct bin_attribute *attr, + struct vm_area_struct *vma, int sparse) +{ + struct pci_dev *pdev = to_pci_dev(container_of(kobj, + struct device, kobj)); + struct resource *res = (struct resource *)attr->private; + enum pci_mmap_state mmap_type; + struct pci_bus_region bar; + int i; + + for (i = 0; i < PCI_ROM_RESOURCE; i++) + if (res == &pdev->resource[i]) + break; + if (i >= PCI_ROM_RESOURCE) + return -ENODEV; + + if (!__pci_mmap_fits(pdev, i, vma, sparse)) + return -EINVAL; + + if (iomem_is_exclusive(res->start)) + return -EINVAL; + + pcibios_resource_to_bus(pdev, &bar, res); + vma->vm_pgoff += bar.start >> (PAGE_SHIFT - (sparse ? 5 : 0)); + mmap_type = res->flags & IORESOURCE_MEM ? pci_mmap_mem : pci_mmap_io; + + return hose_mmap_page_range(pdev->sysdata, vma, mmap_type, sparse); +} + +static int pci_mmap_resource_sparse(struct kobject *kobj, + struct bin_attribute *attr, + struct vm_area_struct *vma) +{ + return pci_mmap_resource(kobj, attr, vma, 1); +} + +static int pci_mmap_resource_dense(struct kobject *kobj, + struct bin_attribute *attr, + struct vm_area_struct *vma) +{ + return pci_mmap_resource(kobj, attr, vma, 0); +} + +/** + * pci_remove_resource_files - cleanup resource files + * @dev: dev to cleanup + * + * If we created resource files for @dev, remove them from sysfs and + * free their resources. + */ +void pci_remove_resource_files(struct pci_dev *pdev) +{ + int i; + + for (i = 0; i < PCI_ROM_RESOURCE; i++) { + struct bin_attribute *res_attr; + + res_attr = pdev->res_attr[i]; + if (res_attr) { + sysfs_remove_bin_file(&pdev->dev.kobj, res_attr); + kfree(res_attr); + } + + res_attr = pdev->res_attr_wc[i]; + if (res_attr) { + sysfs_remove_bin_file(&pdev->dev.kobj, res_attr); + kfree(res_attr); + } + } +} + +static int sparse_mem_mmap_fits(struct pci_dev *pdev, int num) +{ + struct pci_bus_region bar; + struct pci_controller *hose = pdev->sysdata; + long dense_offset; + unsigned long sparse_size; + + pcibios_resource_to_bus(pdev, &bar, &pdev->resource[num]); + + /* All core logic chips have 4G sparse address space, except + CIA which has 16G (see xxx_SPARSE_MEM and xxx_DENSE_MEM + definitions in asm/core_xxx.h files). This corresponds + to 128M or 512M of the bus space. */ + dense_offset = (long)(hose->dense_mem_base - hose->sparse_mem_base); + sparse_size = dense_offset >= 0x400000000UL ? 0x20000000 : 0x8000000; + + return bar.end < sparse_size; +} + +static int pci_create_one_attr(struct pci_dev *pdev, int num, char *name, + char *suffix, struct bin_attribute *res_attr, + unsigned long sparse) +{ + size_t size = pci_resource_len(pdev, num); + + sprintf(name, "resource%d%s", num, suffix); + res_attr->mmap = sparse ? pci_mmap_resource_sparse : + pci_mmap_resource_dense; + res_attr->attr.name = name; + res_attr->attr.mode = S_IRUSR | S_IWUSR; + res_attr->size = sparse ? size << 5 : size; + res_attr->private = &pdev->resource[num]; + return sysfs_create_bin_file(&pdev->dev.kobj, res_attr); +} + +static int pci_create_attr(struct pci_dev *pdev, int num) +{ + /* allocate attribute structure, piggyback attribute name */ + int retval, nlen1, nlen2 = 0, res_count = 1; + unsigned long sparse_base, dense_base; + struct bin_attribute *attr; + struct pci_controller *hose = pdev->sysdata; + char *suffix, *attr_name; + + suffix = ""; /* Assume bwx machine, normal resourceN files. */ + nlen1 = 10; + + if (pdev->resource[num].flags & IORESOURCE_MEM) { + sparse_base = hose->sparse_mem_base; + dense_base = hose->dense_mem_base; + if (sparse_base && !sparse_mem_mmap_fits(pdev, num)) { + sparse_base = 0; + suffix = "_dense"; + nlen1 = 16; /* resourceN_dense */ + } + } else { + sparse_base = hose->sparse_io_base; + dense_base = hose->dense_io_base; + } + + if (sparse_base) { + suffix = "_sparse"; + nlen1 = 17; + if (dense_base) { + nlen2 = 16; /* resourceN_dense */ + res_count = 2; + } + } + + attr = kzalloc(sizeof(*attr) * res_count + nlen1 + nlen2, GFP_ATOMIC); + if (!attr) + return -ENOMEM; + + /* Create bwx, sparse or single dense file */ + attr_name = (char *)(attr + res_count); + pdev->res_attr[num] = attr; + retval = pci_create_one_attr(pdev, num, attr_name, suffix, attr, + sparse_base); + if (retval || res_count == 1) + return retval; + + /* Create dense file */ + attr_name += nlen1; + attr++; + pdev->res_attr_wc[num] = attr; + return pci_create_one_attr(pdev, num, attr_name, "_dense", attr, 0); +} + +/** + * pci_create_resource_files - create resource files in sysfs for @dev + * @dev: dev in question + * + * Walk the resources in @dev creating files for each resource available. + */ +int pci_create_resource_files(struct pci_dev *pdev) +{ + int i; + int retval; + + /* Expose the PCI resources from this device as files */ + for (i = 0; i < PCI_ROM_RESOURCE; i++) { + + /* skip empty resources */ + if (!pci_resource_len(pdev, i)) + continue; + + retval = pci_create_attr(pdev, i); + if (retval) { + pci_remove_resource_files(pdev); + return retval; + } + } + return 0; +} + +/* Legacy I/O bus mapping stuff. */ + +static int __legacy_mmap_fits(struct pci_controller *hose, + struct vm_area_struct *vma, + unsigned long res_size, int sparse) +{ + unsigned long nr, start, size; + + nr = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; + start = vma->vm_pgoff; + size = ((res_size - 1) >> PAGE_SHIFT) + 1; + + if (start < size && size - start >= nr) + return 1; + WARN(1, "process \"%s\" tried to map%s 0x%08lx-0x%08lx on hose %d " + "(size 0x%08lx)\n", + current->comm, sparse ? " sparse" : "", start, start + nr, + hose->index, size); + return 0; +} + +static inline int has_sparse(struct pci_controller *hose, + enum pci_mmap_state mmap_type) +{ + unsigned long base; + + base = (mmap_type == pci_mmap_mem) ? hose->sparse_mem_base : + hose->sparse_io_base; + + return base != 0; +} + +int pci_mmap_legacy_page_range(struct pci_bus *bus, struct vm_area_struct *vma, + enum pci_mmap_state mmap_type) +{ + struct pci_controller *hose = bus->sysdata; + int sparse = has_sparse(hose, mmap_type); + unsigned long res_size; + + res_size = (mmap_type == pci_mmap_mem) ? bus->legacy_mem->size : + bus->legacy_io->size; + if (!__legacy_mmap_fits(hose, vma, res_size, sparse)) + return -EINVAL; + + return hose_mmap_page_range(hose, vma, mmap_type, sparse); +} + +/** + * pci_adjust_legacy_attr - adjustment of legacy file attributes + * @b: bus to create files under + * @mmap_type: I/O port or memory + * + * Adjust file name and size for sparse mappings. + */ +void pci_adjust_legacy_attr(struct pci_bus *bus, enum pci_mmap_state mmap_type) +{ + struct pci_controller *hose = bus->sysdata; + + if (!has_sparse(hose, mmap_type)) + return; + + if (mmap_type == pci_mmap_mem) { + bus->legacy_mem->attr.name = "legacy_mem_sparse"; + bus->legacy_mem->size <<= 5; + } else { + bus->legacy_io->attr.name = "legacy_io_sparse"; + bus->legacy_io->size <<= 5; + } + return; +} + +/* Legacy I/O bus read/write functions */ +int pci_legacy_read(struct pci_bus *bus, loff_t port, u32 *val, size_t size) +{ + struct pci_controller *hose = bus->sysdata; + + port += hose->io_space->start; + + switch(size) { + case 1: + *((u8 *)val) = inb(port); + return 1; + case 2: + if (port & 1) + return -EINVAL; + *((u16 *)val) = inw(port); + return 2; + case 4: + if (port & 3) + return -EINVAL; + *((u32 *)val) = inl(port); + return 4; + } + return -EINVAL; +} + +int pci_legacy_write(struct pci_bus *bus, loff_t port, u32 val, size_t size) +{ + struct pci_controller *hose = bus->sysdata; + + port += hose->io_space->start; + + switch(size) { + case 1: + outb(port, val); + return 1; + case 2: + if (port & 1) + return -EINVAL; + outw(port, val); + return 2; + case 4: + if (port & 3) + return -EINVAL; + outl(port, val); + return 4; + } + return -EINVAL; +} diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c index dfc4e0ddf241..1c8929801400 100644 --- a/drivers/pci/pci-sysfs.c +++ b/drivers/pci/pci-sysfs.c @@ -492,6 +492,19 @@ pci_mmap_legacy_io(struct kobject *kobj, struct bin_attribute *attr, return pci_mmap_legacy_page_range(bus, vma, pci_mmap_io); } +/** + * pci_adjust_legacy_attr - adjustment of legacy file attributes + * @b: bus to create files under + * @mmap_type: I/O port or memory + * + * Stub implementation. Can be overridden by arch if necessary. + */ +void __weak +pci_adjust_legacy_attr(struct pci_bus *b, enum pci_mmap_state mmap_type) +{ + return; +} + /** * pci_create_legacy_files - create legacy I/O port and memory files * @b: bus to create files under @@ -518,6 +531,7 @@ void pci_create_legacy_files(struct pci_bus *b) b->legacy_io->read = pci_read_legacy_io; b->legacy_io->write = pci_write_legacy_io; b->legacy_io->mmap = pci_mmap_legacy_io; + pci_adjust_legacy_attr(b, pci_mmap_io); error = device_create_bin_file(&b->dev, b->legacy_io); if (error) goto legacy_io_err; @@ -528,6 +542,7 @@ void pci_create_legacy_files(struct pci_bus *b) b->legacy_mem->size = 1024*1024; b->legacy_mem->attr.mode = S_IRUSR | S_IWUSR; b->legacy_mem->mmap = pci_mmap_legacy_mem; + pci_adjust_legacy_attr(b, pci_mmap_mem); error = device_create_bin_file(&b->dev, b->legacy_mem); if (error) goto legacy_mem_err; @@ -719,8 +734,8 @@ static int pci_create_resource_files(struct pci_dev *pdev) return 0; } #else /* !HAVE_PCI_MMAP */ -static inline int pci_create_resource_files(struct pci_dev *dev) { return 0; } -static inline void pci_remove_resource_files(struct pci_dev *dev) { return; } +int __weak pci_create_resource_files(struct pci_dev *dev) { return 0; } +void __weak pci_remove_resource_files(struct pci_dev *dev) { return; } #endif /* HAVE_PCI_MMAP */ /** From ae40582e9959cdb7bfe4b918be8e3d19f9511798 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 20 Feb 2009 20:16:07 -0800 Subject: [PATCH 095/478] PCI: pcie_portdriver: fix pcie_port_device_remove pcie_port_device_remove currently calls the remove method of port drivers twice. Ouch! We are calling device_for_each_child multiple times for no apparent reason. So make it simple. Place put_device and device_unregister into remove_iter, and throw out the rest. Only call device_for_each_child once. The code is simpler and actually works! Signed-off-by: Eric W. Biederman Signed-off-by: Jesse Barnes --- drivers/pci/pcie/portdrv_core.c | 23 +++-------------------- 1 file changed, 3 insertions(+), 20 deletions(-) diff --git a/drivers/pci/pcie/portdrv_core.c b/drivers/pci/pcie/portdrv_core.c index 3aea92a92928..569af0015fce 100644 --- a/drivers/pci/pcie/portdrv_core.c +++ b/drivers/pci/pcie/portdrv_core.c @@ -456,16 +456,9 @@ int pcie_port_device_resume(struct pci_dev *dev) static int remove_iter(struct device *dev, void *data) { - struct pcie_port_service_driver *service_driver; - if (dev->bus == &pcie_port_bus_type) { - if (dev->driver) { - service_driver = to_service_driver(dev->driver); - if (service_driver->remove) - service_driver->remove(to_pcie_device(dev)); - } - *(unsigned long*)data = (unsigned long)dev; - return 1; + put_device(dev); + device_unregister(dev); } return 0; } @@ -480,18 +473,8 @@ static int remove_iter(struct device *dev, void *data) void pcie_port_device_remove(struct pci_dev *dev) { struct pcie_port_data *port_data = pci_get_drvdata(dev); - int status; - do { - unsigned long device_addr; - - status = device_for_each_child(&dev->dev, &device_addr, remove_iter); - if (status) { - struct device *device = (struct device*)device_addr; - put_device(device); - device_unregister(device); - } - } while (status); + device_for_each_child(&dev->dev, NULL, remove_iter); switch (port_data->port_irq_mode) { case PCIE_PORT_MSIX_MODE: From 3a3c244c9a355105bc193fde873c73727bf87192 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sun, 15 Feb 2009 22:32:48 +0100 Subject: [PATCH 096/478] PCI: PCIe portdrv: Implement pm object Implement pm object for the PCI Express port driver in order to use the new power management framework and reduce the code size. Signed-off-by: Rafael J. Wysocki Signed-off-by: Jesse Barnes --- drivers/pci/hotplug/pciehp_core.c | 4 ++-- drivers/pci/pcie/aer/aerdrv.c | 6 ------ drivers/pci/pcie/portdrv.h | 4 ++-- drivers/pci/pcie/portdrv_core.c | 14 ++++++-------- drivers/pci/pcie/portdrv_pci.c | 31 +++++++++++++++---------------- include/linux/pcieport_if.h | 2 +- 6 files changed, 26 insertions(+), 35 deletions(-) diff --git a/drivers/pci/hotplug/pciehp_core.c b/drivers/pci/hotplug/pciehp_core.c index 3d21bbba3308..fb254b2454de 100644 --- a/drivers/pci/hotplug/pciehp_core.c +++ b/drivers/pci/hotplug/pciehp_core.c @@ -475,7 +475,7 @@ static void pciehp_remove (struct pcie_device *dev) } #ifdef CONFIG_PM -static int pciehp_suspend (struct pcie_device *dev, pm_message_t state) +static int pciehp_suspend (struct pcie_device *dev) { dev_info(&dev->device, "%s ENTRY\n", __func__); return 0; @@ -503,7 +503,7 @@ static int pciehp_resume (struct pcie_device *dev) } return 0; } -#endif +#endif /* PM */ static struct pcie_port_service_driver hpdriver_portdrv = { .name = PCIE_MODULE_NAME, diff --git a/drivers/pci/pcie/aer/aerdrv.c b/drivers/pci/pcie/aer/aerdrv.c index e11c03194063..32ade5af927e 100644 --- a/drivers/pci/pcie/aer/aerdrv.c +++ b/drivers/pci/pcie/aer/aerdrv.c @@ -40,9 +40,6 @@ MODULE_LICENSE("GPL"); static int __devinit aer_probe (struct pcie_device *dev); static void aer_remove(struct pcie_device *dev); -static int aer_suspend(struct pcie_device *dev, pm_message_t state) -{return 0;} -static int aer_resume(struct pcie_device *dev) {return 0;} static pci_ers_result_t aer_error_detected(struct pci_dev *dev, enum pci_channel_state error); static void aer_error_resume(struct pci_dev *dev); @@ -61,9 +58,6 @@ static struct pcie_port_service_driver aerdriver = { .probe = aer_probe, .remove = aer_remove, - .suspend = aer_suspend, - .resume = aer_resume, - .err_handler = &aer_error_handlers, .reset_link = aer_root_reset, diff --git a/drivers/pci/pcie/portdrv.h b/drivers/pci/pcie/portdrv.h index 5b818bd835ef..17ad53868f9f 100644 --- a/drivers/pci/pcie/portdrv.h +++ b/drivers/pci/pcie/portdrv.h @@ -38,8 +38,8 @@ extern struct bus_type pcie_port_bus_type; extern int pcie_port_device_probe(struct pci_dev *dev); extern int pcie_port_device_register(struct pci_dev *dev); #ifdef CONFIG_PM -extern int pcie_port_device_suspend(struct pci_dev *dev, pm_message_t state); -extern int pcie_port_device_resume(struct pci_dev *dev); +extern int pcie_port_device_suspend(struct device *dev); +extern int pcie_port_device_resume(struct device *dev); #endif extern void pcie_port_device_remove(struct pci_dev *dev); extern int __must_check pcie_port_bus_register(void); diff --git a/drivers/pci/pcie/portdrv_core.c b/drivers/pci/pcie/portdrv_core.c index 569af0015fce..5a5bfe7cdf5f 100644 --- a/drivers/pci/pcie/portdrv_core.c +++ b/drivers/pci/pcie/portdrv_core.c @@ -410,13 +410,12 @@ int pcie_port_device_register(struct pci_dev *dev) static int suspend_iter(struct device *dev, void *data) { struct pcie_port_service_driver *service_driver; - pm_message_t state = * (pm_message_t *) data; if ((dev->bus == &pcie_port_bus_type) && (dev->driver)) { service_driver = to_service_driver(dev->driver); if (service_driver->suspend) - service_driver->suspend(to_pcie_device(dev), state); + service_driver->suspend(to_pcie_device(dev)); } return 0; } @@ -424,11 +423,10 @@ static int suspend_iter(struct device *dev, void *data) /** * pcie_port_device_suspend - suspend port services associated with a PCIe port * @dev: PCI Express port to handle - * @state: Representation of system power management transition in progress */ -int pcie_port_device_suspend(struct pci_dev *dev, pm_message_t state) +int pcie_port_device_suspend(struct device *dev) { - return device_for_each_child(&dev->dev, &state, suspend_iter); + return device_for_each_child(dev, NULL, suspend_iter); } static int resume_iter(struct device *dev, void *data) @@ -448,11 +446,11 @@ static int resume_iter(struct device *dev, void *data) * pcie_port_device_suspend - resume port services associated with a PCIe port * @dev: PCI Express port to handle */ -int pcie_port_device_resume(struct pci_dev *dev) +int pcie_port_device_resume(struct device *dev) { - return device_for_each_child(&dev->dev, NULL, resume_iter); + return device_for_each_child(dev, NULL, resume_iter); } -#endif +#endif /* PM */ static int remove_iter(struct device *dev, void *data) { diff --git a/drivers/pci/pcie/portdrv_pci.c b/drivers/pci/pcie/portdrv_pci.c index 94d0e2af9bad..a61f4930d676 100644 --- a/drivers/pci/pcie/portdrv_pci.c +++ b/drivers/pci/pcie/portdrv_pci.c @@ -44,21 +44,21 @@ static int pcie_portdrv_restore_config(struct pci_dev *dev) } #ifdef CONFIG_PM -static int pcie_portdrv_suspend(struct pci_dev *dev, pm_message_t state) -{ - return pcie_port_device_suspend(dev, state); +static struct dev_pm_ops pcie_portdrv_pm_ops = { + .suspend = pcie_port_device_suspend, + .resume = pcie_port_device_resume, + .freeze = pcie_port_device_suspend, + .thaw = pcie_port_device_resume, + .poweroff = pcie_port_device_suspend, + .restore = pcie_port_device_resume, +}; -} +#define PCIE_PORTDRV_PM_OPS (&pcie_portdrv_pm_ops) -static int pcie_portdrv_resume(struct pci_dev *dev) -{ - pci_set_master(dev); - return pcie_port_device_resume(dev); -} -#else -#define pcie_portdrv_suspend NULL -#define pcie_portdrv_resume NULL -#endif +#else /* !PM */ + +#define PCIE_PORTDRV_PM_OPS NULL +#endif /* !PM */ /* * pcie_portdrv_probe - Probe PCI-Express port devices @@ -268,10 +268,9 @@ static struct pci_driver pcie_portdriver = { .probe = pcie_portdrv_probe, .remove = pcie_portdrv_remove, - .suspend = pcie_portdrv_suspend, - .resume = pcie_portdrv_resume, - .err_handler = &pcie_portdrv_err_handler, + + .driver.pm = PCIE_PORTDRV_PM_OPS, }; static int __init pcie_portdrv_init(void) diff --git a/include/linux/pcieport_if.h b/include/linux/pcieport_if.h index 5d2afcfa6bc1..b4c79545330b 100644 --- a/include/linux/pcieport_if.h +++ b/include/linux/pcieport_if.h @@ -59,7 +59,7 @@ struct pcie_port_service_driver { const char *name; int (*probe) (struct pcie_device *dev); void (*remove) (struct pcie_device *dev); - int (*suspend) (struct pcie_device *dev, pm_message_t state); + int (*suspend) (struct pcie_device *dev); int (*resume) (struct pcie_device *dev); /* Service Error Recovery Handler */ From 0747aaf42d78d26684c6f6b34a4103ff81f571f8 Mon Sep 17 00:00:00 2001 From: Kenji Kaneshige Date: Tue, 17 Feb 2009 14:11:56 +0900 Subject: [PATCH 097/478] PCI/ACPI: fix wrong assumption in acpi_pci_get_bridge_handle Current acpi_pci_get_bridge_handle() has an assumption that pci_bus->self is NULL on the root pci bus. But it might not true on some platforms. Because of this wrong assumption, current acpi_pci_get_bridge_handle() might return improper ACPI handle. We must check pci_bus->parent instead. This bug is the root cause of the following kernel panic reported by James Bottomley. This problem was introduced by the commit e8c331e963c58b83db24b7d0e39e8c07f687dbc6. The immediate cause was acpi_pci_get_bridge_handle() returned NULL unexpectedly and it was passed as the second argument of acpi_walk_namespace(). pci_hotplug: PCI Hot Plug PCI Core version: 0.5 acpiphp: ACPI Hot Plug PCI Controller Driver version: 0.5 BUG: unable to handle kernel NULL pointer dereference at 0000000000000010 IP: [] acpi_ns_get_next_node+0xb/0x3c PGD 0 Oops: 0000 [#1] SMP last sysfs file: CPU 0 Modules linked in: Pid: 1, comm: swapper Not tainted 2.6.28 #1 RIP: 0010:[] [] acpi_ns_get_next_node+0xb/0x3c RSP: 0018:ffff88007f87fd30 EFLAGS: 00010246 RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000000 RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000000 RBP: 0000000000000000 R08: ffffffff8037d260 R09: ffff88007f87fdfc R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000001 R13: 0000000000000000 R14: 0000000000000001 R15: 0000000000000000 FS: 0000000000000000(0000) GS:ffffffff80742040(0000) knlGS:0000000000000000 CS: 0010 DS: 0018 ES: 0018 CR0: 000000008005003b CR2: 0000000000000010 CR3: 0000000000201000 CR4: 00000000000006a0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 Process swapper (pid: 1, threadinfo ffff88007f87e000, task ffff88007f875040) Stack: 0000000000000000 ffffffff803964f5 ffff88007f81b728 0000000000001001 ffff88007f87fdfc ffffffff8037d260 0000000600000001 0000000000000000 ffffffff8037d260 0000000000000000 0000000000000001 ffff88007f87fdfc Call Trace: [] acpi_ns_walk_namespace+0x55/0x138 [] is_pci_dock_device+0x0/0x20 [] is_pci_dock_device+0x0/0x20 [] acpi_walk_namespace+0x5f/0x83 [] detect_ejectable_slots+0x53/0x70 [] add_bridge+0xe8/0x200 [] acpi_walk_namespace+0x6b/0x83 [] acpi_pci_register_driver+0x48/0x61 [] acpiphp_init+0x0/0x58 [] acpiphp_glue_init+0x4c/0x5a [] acpiphp_init+0x37/0x58 [] _stext+0x3b/0x180 [] create_proc_entry+0x58/0xa0 [] register_irq_proc+0xc1/0xe0 [] kernel_init+0x152/0x1ac [] finish_task_switch+0x0/0x110 [] child_rip+0xa/0x20 [] restore_args+0x0/0x30 [] kernel_init+0x0/0x1ac [] child_rip+0x0/0x20 Code: 89 c2 48 8b 00 48 85 c0 75 f5 48 8b 45 00 48 89 02 44 88 65 09 48 89 5d 00 31 c0 5b 5d 41 5c c3 53 48 85 d2 89 fb 48 89 d7 75 06 <48> 8b 56 10 eb 08 e8 73 f1 ff ff 48 89 c2 85 db 74 1a eb 13 0f RIP [] acpi_ns_get_next_node+0xb/0x3c RSP CR2: 0000000000000010 ---[ end trace a7919e7f17c0a725 ]--- Signed-off-by: Kenji Kaneshige Signed-off-by: Jesse Barnes --- include/linux/pci-acpi.h | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/include/linux/pci-acpi.h b/include/linux/pci-acpi.h index 20480b9f10c8..3cee2367459f 100644 --- a/include/linux/pci-acpi.h +++ b/include/linux/pci-acpi.h @@ -23,11 +23,10 @@ static inline acpi_handle acpi_find_root_bridge_handle(struct pci_dev *pdev) static inline acpi_handle acpi_pci_get_bridge_handle(struct pci_bus *pbus) { - int seg = pci_domain_nr(pbus), busnr = pbus->number; - struct pci_dev *bridge = pbus->self; - if (bridge) - return DEVICE_ACPI_HANDLE(&(bridge->dev)); - return acpi_get_pci_rootbridge_handle(seg, busnr); + if (pbus->parent) + return DEVICE_ACPI_HANDLE(&(pbus->self->dev)); + return acpi_get_pci_rootbridge_handle(pci_domain_nr(pbus), + pbus->number); } #else static inline acpi_handle acpi_find_root_bridge_handle(struct pci_dev *pdev) From d18690af626b83fef1d1953b9f70e09497060586 Mon Sep 17 00:00:00 2001 From: Kenji Kaneshige Date: Tue, 17 Feb 2009 14:12:36 +0900 Subject: [PATCH 098/478] PCI/ACPI: fix wrong assumption in acpi_find_root_bridge_handle Current acpi_find_root_bridge_handle() has a assumption that pci_bus->self is NULL on the root pci bus. But it might not be true on some platforms. Because of this wrong assumption, current acpi_find_root_bridge_handle() might cause endless loop. We must check pci_bus->parent instead. Signed-off-by: Kenji Kaneshige Signed-off-by: Jesse Barnes --- include/linux/pci-acpi.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/include/linux/pci-acpi.h b/include/linux/pci-acpi.h index 3cee2367459f..092e82e0048c 100644 --- a/include/linux/pci-acpi.h +++ b/include/linux/pci-acpi.h @@ -13,12 +13,12 @@ #ifdef CONFIG_ACPI static inline acpi_handle acpi_find_root_bridge_handle(struct pci_dev *pdev) { - /* Find root host bridge */ - while (pdev->bus->self) - pdev = pdev->bus->self; - - return acpi_get_pci_rootbridge_handle(pci_domain_nr(pdev->bus), - pdev->bus->number); + struct pci_bus *pbus = pdev->bus; + /* Find a PCI root bus */ + while (pbus->parent) + pbus = pbus->parent; + return acpi_get_pci_rootbridge_handle(pci_domain_nr(pbus), + pbus->number); } static inline acpi_handle acpi_pci_get_bridge_handle(struct pci_bus *pbus) From 267efd7eec5eca62f32f8c9bc1721b578d5da963 Mon Sep 17 00:00:00 2001 From: Kenji Kaneshige Date: Tue, 17 Feb 2009 14:13:20 +0900 Subject: [PATCH 099/478] PCI hotplug: fix wrong assumption in acpi_get_hp_params_from_firmware Current acpi_get_hp_params_from_firmware() has a assumption that pci_bus->self is NULL on the root pci bus. But it might not true on some platforms. Because of this wrong assumption, current acpi_get_hp_params_from_firmware() might cause endless loop. We must check pci_bus->parent instead. Signed-off-by: Kenji Kaneshige Signed-off-by: Jesse Barnes --- drivers/pci/hotplug/acpi_pcihp.c | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/drivers/pci/hotplug/acpi_pcihp.c b/drivers/pci/hotplug/acpi_pcihp.c index f47bc74be567..09a84402986d 100644 --- a/drivers/pci/hotplug/acpi_pcihp.c +++ b/drivers/pci/hotplug/acpi_pcihp.c @@ -332,19 +332,14 @@ acpi_status acpi_get_hp_params_from_firmware(struct pci_bus *bus, { acpi_status status = AE_NOT_FOUND; acpi_handle handle, phandle; - struct pci_bus *pbus = bus; - struct pci_dev *pdev; + struct pci_bus *pbus; - do { - pdev = pbus->self; - if (!pdev) { - handle = acpi_get_pci_rootbridge_handle( - pci_domain_nr(pbus), pbus->number); + handle = NULL; + for (pbus = bus; pbus; pbus = pbus->parent) { + handle = acpi_pci_get_bridge_handle(pbus); + if (handle) break; - } - handle = DEVICE_ACPI_HANDLE(&(pdev->dev)); - pbus = pbus->parent; - } while (!handle); + } /* * _HPP settings apply to all child buses, until another _HPP is From d391f00f0e7fb6d883c6724b31a1799e19a584c5 Mon Sep 17 00:00:00 2001 From: Kenji Kaneshige Date: Tue, 17 Feb 2009 14:13:59 +0900 Subject: [PATCH 100/478] PCI hotplug: fix wrong assumption in acpi_get_hp_hw_control_from_firmware Current acpi_get_hp_hw_control_from_firmware() has a assumption that pci_bus->self is NULL on a PCI root bus. But it might not be true on some platforms. Because of this wrong assumption, current acpi_get_hp_hw_control_from_firmware() might cause endless loop. We must check pci_bus->parent instead. Signed-off-by: Kenji Kaneshige Signed-off-by: Jesse Barnes --- drivers/pci/hotplug/acpi_pcihp.c | 34 +++++++++++--------------------- 1 file changed, 12 insertions(+), 22 deletions(-) diff --git a/drivers/pci/hotplug/acpi_pcihp.c b/drivers/pci/hotplug/acpi_pcihp.c index 09a84402986d..fbc63d5e459f 100644 --- a/drivers/pci/hotplug/acpi_pcihp.c +++ b/drivers/pci/hotplug/acpi_pcihp.c @@ -372,12 +372,10 @@ EXPORT_SYMBOL_GPL(acpi_get_hp_params_from_firmware); * * Attempt to take hotplug control from firmware. */ -int acpi_get_hp_hw_control_from_firmware(struct pci_dev *dev, u32 flags) +int acpi_get_hp_hw_control_from_firmware(struct pci_dev *pdev, u32 flags) { acpi_status status; acpi_handle chandle, handle; - struct pci_dev *pdev = dev; - struct pci_bus *parent; struct acpi_buffer string = { ACPI_ALLOCATE_BUFFER, NULL }; flags &= (OSC_PCI_EXPRESS_NATIVE_HP_CONTROL | @@ -409,26 +407,18 @@ int acpi_get_hp_hw_control_from_firmware(struct pci_dev *dev, u32 flags) string = (struct acpi_buffer){ ACPI_ALLOCATE_BUFFER, NULL }; } - pdev = dev; - handle = DEVICE_ACPI_HANDLE(&dev->dev); - while (!handle) { + handle = DEVICE_ACPI_HANDLE(&pdev->dev); + if (!handle) { /* * This hotplug controller was not listed in the ACPI name * space at all. Try to get acpi handle of parent pci bus. */ - if (!pdev || !pdev->bus->parent) - break; - parent = pdev->bus->parent; - dbg("Could not find %s in acpi namespace, trying parent\n", - pci_name(pdev)); - if (!parent->self) - /* Parent must be a host bridge */ - handle = acpi_get_pci_rootbridge_handle( - pci_domain_nr(parent), - parent->number); - else - handle = DEVICE_ACPI_HANDLE(&(parent->self->dev)); - pdev = parent->self; + struct pci_bus *pbus; + for (pbus = pdev->bus; pbus; pbus = pbus->parent) { + handle = acpi_pci_get_bridge_handle(pbus); + if (handle) + break; + } } while (handle) { @@ -447,13 +437,13 @@ int acpi_get_hp_hw_control_from_firmware(struct pci_dev *dev, u32 flags) } dbg("Cannot get control of hotplug hardware for pci %s\n", - pci_name(dev)); + pci_name(pdev)); kfree(string.pointer); return -ENODEV; got_one: - dbg("Gained control for hotplug HW for pci %s (%s)\n", pci_name(dev), - (char *)string.pointer); + dbg("Gained control for hotplug HW for pci %s (%s)\n", + pci_name(pdev), (char *)string.pointer); kfree(string.pointer); return 0; } From 151ab36a2ea0b3181d103f7244636e0d16e685de Mon Sep 17 00:00:00 2001 From: Kenji Kaneshige Date: Tue, 17 Feb 2009 14:14:36 +0900 Subject: [PATCH 101/478] PCI: fix wrong assumption in pci_find_upstream_pcie_bridge Current pci_find_upstream_pcie_bridge() has a wrong assumption that pci_bus->self is NULL on the root pci bus. But it might not true on some platforms. Because of this wrong assumption, current pci_find_upstream_pcie_bridge() might cause endless loop. We must check pci_bus->parent instead. Signed-off-by: Kenji Kaneshige Signed-off-by: Jesse Barnes --- drivers/pci/search.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pci/search.c b/drivers/pci/search.c index 5af8bd538149..710d4ea69568 100644 --- a/drivers/pci/search.c +++ b/drivers/pci/search.c @@ -29,7 +29,7 @@ pci_find_upstream_pcie_bridge(struct pci_dev *pdev) if (pdev->is_pcie) return NULL; while (1) { - if (!pdev->bus->self) + if (!pdev->bus->parent) break; pdev = pdev->bus->self; /* a p2p bridge */ From f92d4e29d785f1d4217dee7f1ae6ff7140547ed5 Mon Sep 17 00:00:00 2001 From: Kenji Kaneshige Date: Tue, 17 Feb 2009 14:15:16 +0900 Subject: [PATCH 102/478] PCI: fix wrong assumption in pci_read_bridge_bases Current pci_read_bridge_bases() has an assumption that pci_bus->self is NULL on the pci root bus (It checks pci_bus->self to see if the pci bus is root bus). But is might not true on some platforms. We must check pci_bus->parent instead. Signed-off-by: Kenji Kaneshige Signed-off-by: Jesse Barnes --- drivers/pci/probe.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index 55ec44a27e89..23362e8c696f 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -287,7 +287,7 @@ void __devinit pci_read_bridge_bases(struct pci_bus *child) struct resource *res; int i; - if (!dev) /* It's a host bus, nothing to read */ + if (!child->parent) /* It's a host bus, nothing to read */ return; if (dev->transparent) { From c2a3072e010943ac749794622f26b3ef54de25be Mon Sep 17 00:00:00 2001 From: Kenji Kaneshige Date: Tue, 17 Feb 2009 14:15:45 +0900 Subject: [PATCH 103/478] PCI: fix wrong assumption in pci_get_interrupt_pin Current pci_get_interrupt_pin() seems to have an assumption that pci_bus->self is NULL on the root pci bus. But it might not be true on some platforms. Because of this wrong assumption, current pci_get_interrupt_pin() might cause endless loop. We must check pci_bus->parent instead. Signed-off-by: Kenji Kaneshige Signed-off-by: Jesse Barnes --- drivers/pci/pci.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index 0b3e20f1b6f7..0cfed9e28ea1 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -1484,7 +1484,7 @@ pci_get_interrupt_pin(struct pci_dev *dev, struct pci_dev **bridge) if (!pin) return -1; - while (dev->bus->self) { + while (dev->bus->parent) { pin = pci_swizzle_interrupt_pin(dev, pin); dev = dev->bus->self; } From c74d724462d1845535667f4d3f720e02e3432e53 Mon Sep 17 00:00:00 2001 From: Kenji Kaneshige Date: Tue, 17 Feb 2009 14:16:13 +0900 Subject: [PATCH 104/478] PCI: fix wrong assumption in pci_common_swizzle Current pci_common_swizzle() seems to have a assumption that pci_bus->self is NULL on the pci root bus. But it might not be true on some platforms. Because of this wrong assumption, pci_common_swizzle() might cause endless loop. We must check pci_bus->parent instead. Signed-off-by: Kenji Kaneshige Signed-off-by: Jesse Barnes --- drivers/pci/pci.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index 0cfed9e28ea1..8310dc2f943b 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -1504,7 +1504,7 @@ u8 pci_common_swizzle(struct pci_dev *dev, u8 *pinp) { u8 pin = *pinp; - while (dev->bus->self) { + while (dev->bus->parent) { pin = pci_swizzle_interrupt_pin(dev, pin); dev = dev->bus->self; } From 998dd7c719f62dcfa91d7bf7f4eb9c160e03d817 Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Wed, 25 Feb 2009 13:15:52 +0800 Subject: [PATCH 105/478] PCI: fix incorrect mask of PM No_Soft_Reset bit Reviewed-by: Matthew Wilcox Signed-off-by: Yu Zhao Signed-off-by: Jesse Barnes --- include/linux/pci_regs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/pci_regs.h b/include/linux/pci_regs.h index 027815b4635e..b647a4df59fc 100644 --- a/include/linux/pci_regs.h +++ b/include/linux/pci_regs.h @@ -235,7 +235,7 @@ #define PCI_PM_CAP_PME_SHIFT 11 /* Start of the PME Mask in PMC */ #define PCI_PM_CTRL 4 /* PM control and status register */ #define PCI_PM_CTRL_STATE_MASK 0x0003 /* Current power state (D0 to D3) */ -#define PCI_PM_CTRL_NO_SOFT_RESET 0x0004 /* No reset for D3hot->D0 */ +#define PCI_PM_CTRL_NO_SOFT_RESET 0x0008 /* No reset for D3hot->D0 */ #define PCI_PM_CTRL_PME_ENABLE 0x0100 /* PME pin enable */ #define PCI_PM_CTRL_DATA_SEL_MASK 0x1e00 /* Data select (??) */ #define PCI_PM_CTRL_DATA_SCALE_MASK 0x6000 /* Data scale (??) */ From 13bf75766966e1bcc71fae536988caec312eef8f Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Tue, 24 Feb 2009 10:38:22 -0700 Subject: [PATCH 106/478] x86: use dev_printk in quirk message This patch changes a VIA PCI quirk to use dev_info() rather than printk(). Signed-off-by: Bjorn Helgaas Signed-off-by: Jesse Barnes --- arch/x86/kernel/pci-dma.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index b25428533141..022833bb9ff1 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -290,8 +290,7 @@ fs_initcall(pci_iommu_init); static __devinit void via_no_dac(struct pci_dev *dev) { if ((dev->class >> 8) == PCI_CLASS_BRIDGE_PCI && forbid_dac == 0) { - printk(KERN_INFO - "PCI: VIA PCI bridge detected. Disabling DAC.\n"); + dev_info(&dev->dev, "disabling DAC on VIA PCI bridge\n"); forbid_dac = 1; } } From 0994375e9614f78657031e04e30019b9cdb62795 Mon Sep 17 00:00:00 2001 From: Chris Wright Date: Mon, 23 Feb 2009 21:52:23 -0800 Subject: [PATCH 107/478] PCI: add remove_id sysfs entry This adds a remove_id sysfs entry to allow users of new_id to later remove the added dynid. One use case is management tools that want to dynamically bind/unbind devices to pci-stub driver while devices are assigned to KVM guests. Rather than having to track which driver was originally bound to the driver, a mangement tool can simply: Guest uses device Signed-off-by: Chris Wright Signed-off-by: Jesse Barnes --- Documentation/ABI/testing/sysfs-bus-pci | 16 +++++ drivers/pci/pci-driver.c | 80 ++++++++++++++++++++++++- 2 files changed, 94 insertions(+), 2 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-bus-pci b/Documentation/ABI/testing/sysfs-bus-pci index e638e15a8895..3d29793a0ea2 100644 --- a/Documentation/ABI/testing/sysfs-bus-pci +++ b/Documentation/ABI/testing/sysfs-bus-pci @@ -41,6 +41,22 @@ Description: for the device and attempt to bind to it. For example: # echo "8086 10f5" > /sys/bus/pci/drivers/foo/new_id +What: /sys/bus/pci/drivers/.../remove_id +Date: February 2009 +Contact: Chris Wright +Description: + Writing a device ID to this file will remove an ID + that was dynamically added via the new_id sysfs entry. + The format for the device ID is: + VVVV DDDD SVVV SDDD CCCC MMMM. That is Vendor ID, Device + ID, Subsystem Vendor ID, Subsystem Device ID, Class, + and Class Mask. The Vendor ID and Device ID fields are + required, the rest are optional. After successfully + removing an ID, the driver will no longer support the + device. This is useful to ensure auto probing won't + match the driver to the device. For example: + # echo "8086 10f5" > /sys/bus/pci/drivers/foo/remove_id + What: /sys/bus/pci/devices/.../vpd Date: February 2008 Contact: Ben Hutchings diff --git a/drivers/pci/pci-driver.c b/drivers/pci/pci-driver.c index 93eac1423585..87a5ddbb324f 100644 --- a/drivers/pci/pci-driver.c +++ b/drivers/pci/pci-driver.c @@ -99,6 +99,52 @@ store_new_id(struct device_driver *driver, const char *buf, size_t count) } static DRIVER_ATTR(new_id, S_IWUSR, NULL, store_new_id); +/** + * store_remove_id - remove a PCI device ID from this driver + * @driver: target device driver + * @buf: buffer for scanning device ID data + * @count: input size + * + * Removes a dynamic pci device ID to this driver. + */ +static ssize_t +store_remove_id(struct device_driver *driver, const char *buf, size_t count) +{ + struct pci_dynid *dynid, *n; + struct pci_driver *pdrv = to_pci_driver(driver); + __u32 vendor, device, subvendor = PCI_ANY_ID, + subdevice = PCI_ANY_ID, class = 0, class_mask = 0; + int fields = 0; + int retval = -ENODEV; + + fields = sscanf(buf, "%x %x %x %x %x %x", + &vendor, &device, &subvendor, &subdevice, + &class, &class_mask); + if (fields < 2) + return -EINVAL; + + spin_lock(&pdrv->dynids.lock); + list_for_each_entry_safe(dynid, n, &pdrv->dynids.list, node) { + struct pci_device_id *id = &dynid->id; + if ((id->vendor == vendor) && + (id->device == device) && + (subvendor == PCI_ANY_ID || id->subvendor == subvendor) && + (subdevice == PCI_ANY_ID || id->subdevice == subdevice) && + !((id->class ^ class) & class_mask)) { + list_del(&dynid->node); + kfree(dynid); + retval = 0; + break; + } + } + spin_unlock(&pdrv->dynids.lock); + + if (retval) + return retval; + return count; +} +static DRIVER_ATTR(remove_id, S_IWUSR, NULL, store_remove_id); + static void pci_free_dynids(struct pci_driver *drv) { @@ -125,6 +171,20 @@ static void pci_remove_newid_file(struct pci_driver *drv) { driver_remove_file(&drv->driver, &driver_attr_new_id); } + +static int +pci_create_removeid_file(struct pci_driver *drv) +{ + int error = 0; + if (drv->probe != NULL) + error = driver_create_file(&drv->driver,&driver_attr_remove_id); + return error; +} + +static void pci_remove_removeid_file(struct pci_driver *drv) +{ + driver_remove_file(&drv->driver, &driver_attr_remove_id); +} #else /* !CONFIG_HOTPLUG */ static inline void pci_free_dynids(struct pci_driver *drv) {} static inline int pci_create_newid_file(struct pci_driver *drv) @@ -132,6 +192,11 @@ static inline int pci_create_newid_file(struct pci_driver *drv) return 0; } static inline void pci_remove_newid_file(struct pci_driver *drv) {} +static inline int pci_create_removeid_file(struct pci_driver *drv) +{ + return 0; +} +static inline void pci_remove_removeid_file(struct pci_driver *drv) {} #endif /** @@ -852,13 +917,23 @@ int __pci_register_driver(struct pci_driver *drv, struct module *owner, /* register with core */ error = driver_register(&drv->driver); if (error) - return error; + goto out; error = pci_create_newid_file(drv); if (error) - driver_unregister(&drv->driver); + goto out_newid; + error = pci_create_removeid_file(drv); + if (error) + goto out_removeid; +out: return error; + +out_removeid: + pci_remove_newid_file(drv); +out_newid: + driver_unregister(&drv->driver); + goto out; } /** @@ -874,6 +949,7 @@ int __pci_register_driver(struct pci_driver *drv, struct module *owner, void pci_unregister_driver(struct pci_driver *drv) { + pci_remove_removeid_file(drv); pci_remove_newid_file(drv); driver_unregister(&drv->driver); pci_free_dynids(drv); From c41ade2ee1dc146d2de2ee470a87cd6b878a08f4 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Tue, 17 Mar 2009 08:54:05 -0400 Subject: [PATCH 108/478] Rewrite MSI-HOWTO I didn't find the previous version very useful, so I rewrote it. Signed-off-by: Matthew Wilcox Reviewed-by: Randy Dunlap Reviewed-by: Grant Grundler Signed-off-by: Jesse Barnes --- Documentation/PCI/MSI-HOWTO.txt | 648 +++++++++++--------------------- 1 file changed, 222 insertions(+), 426 deletions(-) diff --git a/Documentation/PCI/MSI-HOWTO.txt b/Documentation/PCI/MSI-HOWTO.txt index 256defd7e174..1c02431f1d1a 100644 --- a/Documentation/PCI/MSI-HOWTO.txt +++ b/Documentation/PCI/MSI-HOWTO.txt @@ -4,506 +4,302 @@ Revised Feb 12, 2004 by Martine Silbermann email: Martine.Silbermann@hp.com Revised Jun 25, 2004 by Tom L Nguyen + Revised Jul 9, 2008 by Matthew Wilcox + Copyright 2003, 2008 Intel Corporation 1. About this guide -This guide describes the basics of Message Signaled Interrupts (MSI), -the advantages of using MSI over traditional interrupt mechanisms, -and how to enable your driver to use MSI or MSI-X. Also included is -a Frequently Asked Questions (FAQ) section. +This guide describes the basics of Message Signaled Interrupts (MSIs), +the advantages of using MSI over traditional interrupt mechanisms, how +to change your driver to use MSI or MSI-X and some basic diagnostics to +try if a device doesn't support MSIs. -1.1 Terminology -PCI devices can be single-function or multi-function. In either case, -when this text talks about enabling or disabling MSI on a "device -function," it is referring to one specific PCI device and function and -not to all functions on a PCI device (unless the PCI device has only -one function). +2. What are MSIs? -2. Copyright 2003 Intel Corporation +A Message Signaled Interrupt is a write from the device to a special +address which causes an interrupt to be received by the CPU. -3. What is MSI/MSI-X? +The MSI capability was first specified in PCI 2.2 and was later enhanced +in PCI 3.0 to allow each interrupt to be masked individually. The MSI-X +capability was also introduced with PCI 3.0. It supports more interrupts +per device than MSI and allows interrupts to be independently configured. -Message Signaled Interrupt (MSI), as described in the PCI Local Bus -Specification Revision 2.3 or later, is an optional feature, and a -required feature for PCI Express devices. MSI enables a device function -to request service by sending an Inbound Memory Write on its PCI bus to -the FSB as a Message Signal Interrupt transaction. Because MSI is -generated in the form of a Memory Write, all transaction conditions, -such as a Retry, Master-Abort, Target-Abort or normal completion, are -supported. +Devices may support both MSI and MSI-X, but only one can be enabled at +a time. -A PCI device that supports MSI must also support pin IRQ assertion -interrupt mechanism to provide backward compatibility for systems that -do not support MSI. In systems which support MSI, the bus driver is -responsible for initializing the message address and message data of -the device function's MSI/MSI-X capability structure during device -initial configuration. -An MSI capable device function indicates MSI support by implementing -the MSI/MSI-X capability structure in its PCI capability list. The -device function may implement both the MSI capability structure and -the MSI-X capability structure; however, the bus driver should not -enable both. +3. Why use MSIs? -The MSI capability structure contains Message Control register, -Message Address register and Message Data register. These registers -provide the bus driver control over MSI. The Message Control register -indicates the MSI capability supported by the device. The Message -Address register specifies the target address and the Message Data -register specifies the characteristics of the message. To request -service, the device function writes the content of the Message Data -register to the target address. The device and its software driver -are prohibited from writing to these registers. +There are three reasons why using MSIs can give an advantage over +traditional pin-based interrupts. -The MSI-X capability structure is an optional extension to MSI. It -uses an independent and separate capability structure. There are -some key advantages to implementing the MSI-X capability structure -over the MSI capability structure as described below. +Pin-based PCI interrupts are often shared amongst several devices. +To support this, the kernel must call each interrupt handler associated +with an interrupt, which leads to reduced performance for the system as +a whole. MSIs are never shared, so this problem cannot arise. - - Support a larger maximum number of vectors per function. +When a device writes data to memory, then raises a pin-based interrupt, +it is possible that the interrupt may arrive before all the data has +arrived in memory (this becomes more likely with devices behind PCI-PCI +bridges). In order to ensure that all the data has arrived in memory, +the interrupt handler must read a register on the device which raised +the interrupt. PCI transaction ordering rules require that all the data +arrives in memory before the value can be returned from the register. +Using MSIs avoids this problem as the interrupt-generating write cannot +pass the data writes, so by the time the interrupt is raised, the driver +knows that all the data has arrived in memory. - - Provide the ability for system software to configure - each vector with an independent message address and message - data, specified by a table that resides in Memory Space. +PCI devices can only support a single pin-based interrupt per function. +Often drivers have to query the device to find out what event has +occurred, slowing down interrupt handling for the common case. With +MSIs, a device can support more interrupts, allowing each interrupt +to be specialised to a different purpose. One possible design gives +infrequent conditions (such as errors) their own interrupt which allows +the driver to handle the normal interrupt handling path more efficiently. +Other possible designs include giving one interrupt to each packet queue +in a network card or each port in a storage controller. - - MSI and MSI-X both support per-vector masking. Per-vector - masking is an optional extension of MSI but a required - feature for MSI-X. Per-vector masking provides the kernel the - ability to mask/unmask a single MSI while running its - interrupt service routine. If per-vector masking is - not supported, then the device driver should provide the - hardware/software synchronization to ensure that the device - generates MSI when the driver wants it to do so. -4. Why use MSI? +4. How to use MSIs -As a benefit to the simplification of board design, MSI allows board -designers to remove out-of-band interrupt routing. MSI is another -step towards a legacy-free environment. +PCI devices are initialised to use pin-based interrupts. The device +driver has to set up the device to use MSI or MSI-X. Not all machines +support MSIs correctly, and for those machines, the APIs described below +will simply fail and the device will continue to use pin-based interrupts. -Due to increasing pressure on chipset and processor packages to -reduce pin count, the need for interrupt pins is expected to -diminish over time. Devices, due to pin constraints, may implement -messages to increase performance. +4.1 Include kernel support for MSIs -PCI Express endpoints uses INTx emulation (in-band messages) instead -of IRQ pin assertion. Using INTx emulation requires interrupt -sharing among devices connected to the same node (PCI bridge) while -MSI is unique (non-shared) and does not require BIOS configuration -support. As a result, the PCI Express technology requires MSI -support for better interrupt performance. +To support MSI or MSI-X, the kernel must be built with the CONFIG_PCI_MSI +option enabled. This option is only available on some architectures, +and it may depend on some other options also being set. For example, +on x86, you must also enable X86_UP_APIC or SMP in order to see the +CONFIG_PCI_MSI option. -Using MSI enables the device functions to support two or more -vectors, which can be configured to target different CPUs to -increase scalability. +4.2 Using MSI -5. Configuring a driver to use MSI/MSI-X +Most of the hard work is done for the driver in the PCI layer. It simply +has to request that the PCI layer set up the MSI capability for this +device. -By default, the kernel will not enable MSI/MSI-X on all devices that -support this capability. The CONFIG_PCI_MSI kernel option -must be selected to enable MSI/MSI-X support. - -5.1 Including MSI/MSI-X support into the kernel - -To allow MSI/MSI-X capable device drivers to selectively enable -MSI/MSI-X (using pci_enable_msi()/pci_enable_msix() as described -below), the VECTOR based scheme needs to be enabled by setting -CONFIG_PCI_MSI during kernel config. - -Since the target of the inbound message is the local APIC, providing -CONFIG_X86_LOCAL_APIC must be enabled as well as CONFIG_PCI_MSI. - -5.2 Configuring for MSI support - -Due to the non-contiguous fashion in vector assignment of the -existing Linux kernel, this version does not support multiple -messages regardless of a device function is capable of supporting -more than one vector. To enable MSI on a device function's MSI -capability structure requires a device driver to call the function -pci_enable_msi() explicitly. - -5.2.1 API pci_enable_msi +4.2.1 pci_enable_msi int pci_enable_msi(struct pci_dev *dev) -With this new API, a device driver that wants to have MSI -enabled on its device function must call this API to enable MSI. -A successful call will initialize the MSI capability structure -with ONE vector, regardless of whether a device function is -capable of supporting multiple messages. This vector replaces the -pre-assigned dev->irq with a new MSI vector. To avoid a conflict -of the new assigned vector with existing pre-assigned vector requires -a device driver to call this API before calling request_irq(). +A successful call will allocate ONE interrupt to the device, regardless +of how many MSIs the device supports. The device will be switched from +pin-based interrupt mode to MSI mode. The dev->irq number is changed +to a new number which represents the message signaled interrupt. +This function should be called before the driver calls request_irq() +since enabling MSIs disables the pin-based IRQ and the driver will not +receive interrupts on the old interrupt. -5.2.2 API pci_disable_msi +4.2.2 pci_disable_msi void pci_disable_msi(struct pci_dev *dev) -This API should always be used to undo the effect of pci_enable_msi() -when a device driver is unloading. This API restores dev->irq with -the pre-assigned IOAPIC vector and switches a device's interrupt -mode to PCI pin-irq assertion/INTx emulation mode. +This function should be used to undo the effect of pci_enable_msi(). +Calling it restores dev->irq to the pin-based interrupt number and frees +the previously allocated message signaled interrupt(s). The interrupt +may subsequently be assigned to another device, so drivers should not +cache the value of dev->irq. -Note that a device driver should always call free_irq() on the MSI vector -that it has done request_irq() on before calling this API. Failure to do -so results in a BUG_ON() and a device will be left with MSI enabled and -leaks its vector. +A device driver must always call free_irq() on the interrupt(s) +for which it has called request_irq() before calling this function. +Failure to do so will result in a BUG_ON(), the device will be left with +MSI enabled and will leak its vector. -5.2.3 MSI mode vs. legacy mode diagram +4.3 Using MSI-X -The below diagram shows the events which switch the interrupt -mode on the MSI-capable device function between MSI mode and -PIN-IRQ assertion mode. - - ------------ pci_enable_msi ------------------------ - | | <=============== | | - | MSI MODE | | PIN-IRQ ASSERTION MODE | - | | ===============> | | - ------------ pci_disable_msi ------------------------ - - -Figure 1. MSI Mode vs. Legacy Mode - -In Figure 1, a device operates by default in legacy mode. Legacy -in this context means PCI pin-irq assertion or PCI-Express INTx -emulation. A successful MSI request (using pci_enable_msi()) switches -a device's interrupt mode to MSI mode. A pre-assigned IOAPIC vector -stored in dev->irq will be saved by the PCI subsystem and a new -assigned MSI vector will replace dev->irq. - -To return back to its default mode, a device driver should always call -pci_disable_msi() to undo the effect of pci_enable_msi(). Note that a -device driver should always call free_irq() on the MSI vector it has -done request_irq() on before calling pci_disable_msi(). Failure to do -so results in a BUG_ON() and a device will be left with MSI enabled and -leaks its vector. Otherwise, the PCI subsystem restores a device's -dev->irq with a pre-assigned IOAPIC vector and marks the released -MSI vector as unused. - -Once being marked as unused, there is no guarantee that the PCI -subsystem will reserve this MSI vector for a device. Depending on -the availability of current PCI vector resources and the number of -MSI/MSI-X requests from other drivers, this MSI may be re-assigned. - -For the case where the PCI subsystem re-assigns this MSI vector to -another driver, a request to switch back to MSI mode may result -in being assigned a different MSI vector or a failure if no more -vectors are available. - -5.3 Configuring for MSI-X support - -Due to the ability of the system software to configure each vector of -the MSI-X capability structure with an independent message address -and message data, the non-contiguous fashion in vector assignment of -the existing Linux kernel has no impact on supporting multiple -messages on an MSI-X capable device functions. To enable MSI-X on -a device function's MSI-X capability structure requires its device -driver to call the function pci_enable_msix() explicitly. - -The function pci_enable_msix(), once invoked, enables either -all or nothing, depending on the current availability of PCI vector -resources. If the PCI vector resources are available for the number -of vectors requested by a device driver, this function will configure -the MSI-X table of the MSI-X capability structure of a device with -requested messages. To emphasize this reason, for example, a device -may be capable for supporting the maximum of 32 vectors while its -software driver usually may request 4 vectors. It is recommended -that the device driver should call this function once during the -initialization phase of the device driver. - -Unlike the function pci_enable_msi(), the function pci_enable_msix() -does not replace the pre-assigned IOAPIC dev->irq with a new MSI -vector because the PCI subsystem writes the 1:1 vector-to-entry mapping -into the field vector of each element contained in a second argument. -Note that the pre-assigned IOAPIC dev->irq is valid only if the device -operates in PIN-IRQ assertion mode. In MSI-X mode, any attempt at -using dev->irq by the device driver to request for interrupt service -may result in unpredictable behavior. - -For each MSI-X vector granted, a device driver is responsible for calling -other functions like request_irq(), enable_irq(), etc. to enable -this vector with its corresponding interrupt service handler. It is -a device driver's choice to assign all vectors with the same -interrupt service handler or each vector with a unique interrupt -service handler. - -5.3.1 Handling MMIO address space of MSI-X Table - -The PCI 3.0 specification has implementation notes that MMIO address -space for a device's MSI-X structure should be isolated so that the -software system can set different pages for controlling accesses to the -MSI-X structure. The implementation of MSI support requires the PCI -subsystem, not a device driver, to maintain full control of the MSI-X -table/MSI-X PBA (Pending Bit Array) and MMIO address space of the MSI-X -table/MSI-X PBA. A device driver should not access the MMIO address -space of the MSI-X table/MSI-X PBA. - -5.3.2 API pci_enable_msix - -int pci_enable_msix(struct pci_dev *dev, struct msix_entry *entries, int nvec) - -This API enables a device driver to request the PCI subsystem -to enable MSI-X messages on its hardware device. Depending on -the availability of PCI vectors resources, the PCI subsystem enables -either all or none of the requested vectors. - -Argument 'dev' points to the device (pci_dev) structure. - -Argument 'entries' is a pointer to an array of msix_entry structs. -The number of entries is indicated in argument 'nvec'. -struct msix_entry is defined in /driver/pci/msi.h: +The MSI-X capability is much more flexible than the MSI capability. +It supports up to 2048 interrupts, each of which can be controlled +independently. To support this flexibility, drivers must use an array of +`struct msix_entry': struct msix_entry { u16 vector; /* kernel uses to write alloc vector */ u16 entry; /* driver uses to specify entry */ }; -A device driver is responsible for initializing the field 'entry' of -each element with a unique entry supported by MSI-X table. Otherwise, --EINVAL will be returned as a result. A successful return of zero -indicates the PCI subsystem completed initializing each of the requested -entries of the MSI-X table with message address and message data. -Last but not least, the PCI subsystem will write the 1:1 -vector-to-entry mapping into the field 'vector' of each element. A -device driver is responsible for keeping track of allocated MSI-X -vectors in its internal data structure. +This allows for the device to use these interrupts in a sparse fashion; +for example it could use interrupts 3 and 1027 and allocate only a +two-element array. The driver is expected to fill in the 'entry' value +in each element of the array to indicate which entries it wants the kernel +to assign interrupts for. It is invalid to fill in two entries with the +same number. -A return of zero indicates that the number of MSI-X vectors was -successfully allocated. A return of greater than zero indicates -MSI-X vector shortage. Or a return of less than zero indicates -a failure. This failure may be a result of duplicate entries -specified in second argument, or a result of no available vector, -or a result of failing to initialize MSI-X table entries. +4.3.1 pci_enable_msix -5.3.3 API pci_disable_msix +int pci_enable_msix(struct pci_dev *dev, struct msix_entry *entries, int nvec) + +Calling this function asks the PCI subsystem to allocate 'nvec' MSIs. +The 'entries' argument is a pointer to an array of msix_entry structs +which should be at least 'nvec' entries in size. On success, the +function will return 0 and the device will have been switched into +MSI-X interrupt mode. The 'vector' elements in each entry will have +been filled in with the interrupt number. The driver should then call +request_irq() for each 'vector' that it decides to use. + +If this function returns a negative number, it indicates an error and +the driver should not attempt to allocate any more MSI-X interrupts for +this device. If it returns a positive number, it indicates the maximum +number of interrupt vectors that could have been allocated. + +This function, in contrast with pci_enable_msi(), does not adjust +dev->irq. The device will not generate interrupts for this interrupt +number once MSI-X is enabled. The device driver is responsible for +keeping track of the interrupts assigned to the MSI-X vectors so it can +free them again later. + +Device drivers should normally call this function once per device +during the initialization phase. + +4.3.2 pci_disable_msix void pci_disable_msix(struct pci_dev *dev) -This API should always be used to undo the effect of pci_enable_msix() -when a device driver is unloading. Note that a device driver should -always call free_irq() on all MSI-X vectors it has done request_irq() -on before calling this API. Failure to do so results in a BUG_ON() and -a device will be left with MSI-X enabled and leaks its vectors. +This API should be used to undo the effect of pci_enable_msix(). It frees +the previously allocated message signaled interrupts. The interrupts may +subsequently be assigned to another device, so drivers should not cache +the value of the 'vector' elements over a call to pci_disable_msix(). -5.3.4 MSI-X mode vs. legacy mode diagram +A device driver must always call free_irq() on the interrupt(s) +for which it has called request_irq() before calling this function. +Failure to do so will result in a BUG_ON(), the device will be left with +MSI enabled and will leak its vector. -The below diagram shows the events which switch the interrupt -mode on the MSI-X capable device function between MSI-X mode and -PIN-IRQ assertion mode (legacy). +4.3.3 The MSI-X Table - ------------ pci_enable_msix(,,n) ------------------------ - | | <=============== | | - | MSI-X MODE | | PIN-IRQ ASSERTION MODE | - | | ===============> | | - ------------ pci_disable_msix ------------------------ +The MSI-X capability specifies a BAR and offset within that BAR for the +MSI-X Table. This address is mapped by the PCI subsystem, and should not +be accessed directly by the device driver. If the driver wishes to +mask or unmask an interrupt, it should call disable_irq() / enable_irq(). -Figure 2. MSI-X Mode vs. Legacy Mode +4.4 Handling devices implementing both MSI and MSI-X capabilities -In Figure 2, a device operates by default in legacy mode. A -successful MSI-X request (using pci_enable_msix()) switches a -device's interrupt mode to MSI-X mode. A pre-assigned IOAPIC vector -stored in dev->irq will be saved by the PCI subsystem; however, -unlike MSI mode, the PCI subsystem will not replace dev->irq with -assigned MSI-X vector because the PCI subsystem already writes the 1:1 -vector-to-entry mapping into the field 'vector' of each element -specified in second argument. +If a device implements both MSI and MSI-X capabilities, it can +run in either MSI mode or MSI-X mode but not both simultaneously. +This is a requirement of the PCI spec, and it is enforced by the +PCI layer. Calling pci_enable_msi() when MSI-X is already enabled or +pci_enable_msix() when MSI is already enabled will result in an error. +If a device driver wishes to switch between MSI and MSI-X at runtime, +it must first quiesce the device, then switch it back to pin-interrupt +mode, before calling pci_enable_msi() or pci_enable_msix() and resuming +operation. This is not expected to be a common operation but may be +useful for debugging or testing during development. -To return back to its default mode, a device driver should always call -pci_disable_msix() to undo the effect of pci_enable_msix(). Note that -a device driver should always call free_irq() on all MSI-X vectors it -has done request_irq() on before calling pci_disable_msix(). Failure -to do so results in a BUG_ON() and a device will be left with MSI-X -enabled and leaks its vectors. Otherwise, the PCI subsystem switches a -device function's interrupt mode from MSI-X mode to legacy mode and -marks all allocated MSI-X vectors as unused. +4.5 Considerations when using MSIs -Once being marked as unused, there is no guarantee that the PCI -subsystem will reserve these MSI-X vectors for a device. Depending on -the availability of current PCI vector resources and the number of -MSI/MSI-X requests from other drivers, these MSI-X vectors may be -re-assigned. +4.5.1 Choosing between MSI-X and MSI -For the case where the PCI subsystem re-assigned these MSI-X vectors -to other drivers, a request to switch back to MSI-X mode may result -being assigned with another set of MSI-X vectors or a failure if no -more vectors are available. +If your device supports both MSI-X and MSI capabilities, you should use +the MSI-X facilities in preference to the MSI facilities. As mentioned +above, MSI-X supports any number of interrupts between 1 and 2048. +In constrast, MSI is restricted to a maximum of 32 interrupts (and +must be a power of two). In addition, the MSI interrupt vectors must +be allocated consecutively, so the system may not be able to allocate +as many vectors for MSI as it could for MSI-X. On some platforms, MSI +interrupts must all be targetted at the same set of CPUs whereas MSI-X +interrupts can all be targetted at different CPUs. -5.4 Handling function implementing both MSI and MSI-X capabilities +4.5.2 Spinlocks -For the case where a function implements both MSI and MSI-X -capabilities, the PCI subsystem enables a device to run either in MSI -mode or MSI-X mode but not both. A device driver determines whether it -wants MSI or MSI-X enabled on its hardware device. Once a device -driver requests for MSI, for example, it is prohibited from requesting -MSI-X; in other words, a device driver is not permitted to ping-pong -between MSI mod MSI-X mode during a run-time. +Most device drivers have a per-device spinlock which is taken in the +interrupt handler. With pin-based interrupts or a single MSI, it is not +necessary to disable interrupts (Linux guarantees the same interrupt will +not be re-entered). If a device uses multiple interrupts, the driver +must disable interrupts while the lock is held. If the device sends +a different interrupt, the driver will deadlock trying to recursively +acquire the spinlock. -5.5 Hardware requirements for MSI/MSI-X support +There are two solutions. The first is to take the lock with +spin_lock_irqsave() or spin_lock_irq() (see +Documentation/DocBook/kernel-locking). The second is to specify +IRQF_DISABLED to request_irq() so that the kernel runs the entire +interrupt routine with interrupts disabled. -MSI/MSI-X support requires support from both system hardware and -individual hardware device functions. +If your MSI interrupt routine does not hold the lock for the whole time +it is running, the first solution may be best. The second solution is +normally preferred as it avoids making two transitions from interrupt +disabled to enabled and back again. -5.5.1 Required x86 hardware support +4.6 How to tell whether MSI/MSI-X is enabled on a device -Since the target of MSI address is the local APIC CPU, enabling -MSI/MSI-X support in the Linux kernel is dependent on whether existing -system hardware supports local APIC. Users should verify that their -system supports local APIC operation by testing that it runs when -CONFIG_X86_LOCAL_APIC=y. +Using 'lspci -v' (as root) may show some devices with "MSI", "Message +Signalled Interrupts" or "MSI-X" capabilities. Each of these capabilities +has an 'Enable' flag which will be followed with either "+" (enabled) +or "-" (disabled). -In SMP environment, CONFIG_X86_LOCAL_APIC is automatically set; -however, in UP environment, users must manually set -CONFIG_X86_LOCAL_APIC. Once CONFIG_X86_LOCAL_APIC=y, setting -CONFIG_PCI_MSI enables the VECTOR based scheme and the option for -MSI-capable device drivers to selectively enable MSI/MSI-X. -Note that CONFIG_X86_IO_APIC setting is irrelevant because MSI/MSI-X -vector is allocated new during runtime and MSI/MSI-X support does not -depend on BIOS support. This key independency enables MSI/MSI-X -support on future IOxAPIC free platforms. +5. MSI quirks -5.5.2 Device hardware support +Several PCI chipsets or devices are known not to support MSIs. +The PCI stack provides three ways to disable MSIs: -The hardware device function supports MSI by indicating the -MSI/MSI-X capability structure on its PCI capability list. By -default, this capability structure will not be initialized by -the kernel to enable MSI during the system boot. In other words, -the device function is running on its default pin assertion mode. -Note that in many cases the hardware supporting MSI have bugs, -which may result in system hangs. The software driver of specific -MSI-capable hardware is responsible for deciding whether to call -pci_enable_msi or not. A return of zero indicates the kernel -successfully initialized the MSI/MSI-X capability structure of the -device function. The device function is now running on MSI/MSI-X mode. +1. globally +2. on all devices behind a specific bridge +3. on a single device -5.6 How to tell whether MSI/MSI-X is enabled on device function +5.1. Disabling MSIs globally -At the driver level, a return of zero from the function call of -pci_enable_msi()/pci_enable_msix() indicates to a device driver that -its device function is initialized successfully and ready to run in -MSI/MSI-X mode. +Some host chipsets simply don't support MSIs properly. If we're +lucky, the manufacturer knows this and has indicated it in the ACPI +FADT table. In this case, Linux will automatically disable MSIs. +Some boards don't include this information in the table and so we have +to detect them ourselves. The complete list of these is found near the +quirk_disable_all_msi() function in drivers/pci/quirks.c. -At the user level, users can use the command 'cat /proc/interrupts' -to display the vectors allocated for devices and their interrupt -MSI/MSI-X modes ("PCI-MSI"/"PCI-MSI-X"). Below shows MSI mode is -enabled on a SCSI Adaptec 39320D Ultra320 controller. +If you have a board which has problems with MSIs, you can pass pci=nomsi +on the kernel command line to disable MSIs on all devices. It would be +in your best interests to report the problem to linux-pci@vger.kernel.org +including a full 'lspci -v' so we can add the quirks to the kernel. - CPU0 CPU1 - 0: 324639 0 IO-APIC-edge timer - 1: 1186 0 IO-APIC-edge i8042 - 2: 0 0 XT-PIC cascade - 12: 2797 0 IO-APIC-edge i8042 - 14: 6543 0 IO-APIC-edge ide0 - 15: 1 0 IO-APIC-edge ide1 -169: 0 0 IO-APIC-level uhci-hcd -185: 0 0 IO-APIC-level uhci-hcd -193: 138 10 PCI-MSI aic79xx -201: 30 0 PCI-MSI aic79xx -225: 30 0 IO-APIC-level aic7xxx -233: 30 0 IO-APIC-level aic7xxx -NMI: 0 0 -LOC: 324553 325068 -ERR: 0 -MIS: 0 +5.2. Disabling MSIs below a bridge -6. MSI quirks +Some PCI bridges are not able to route MSIs between busses properly. +In this case, MSIs must be disabled on all devices behind the bridge. -Several PCI chipsets or devices are known to not support MSI. -The PCI stack provides 3 possible levels of MSI disabling: -* on a single device -* on all devices behind a specific bridge -* globally +Some bridges allow you to enable MSIs by changing some bits in their +PCI configuration space (especially the Hypertransport chipsets such +as the nVidia nForce and Serverworks HT2000). As with host chipsets, +Linux mostly knows about them and automatically enables MSIs if it can. +If you have a bridge which Linux doesn't yet know about, you can enable +MSIs in configuration space using whatever method you know works, then +enable MSIs on that bridge by doing: -6.1. Disabling MSI on a single device + echo 1 > /sys/bus/pci/devices/$bridge/msi_bus -Under some circumstances it might be required to disable MSI on a -single device. This may be achieved by either not calling pci_enable_msi() -or all, or setting the pci_dev->no_msi flag before (most of the time -in a quirk). +where $bridge is the PCI address of the bridge you've enabled (eg +0000:00:0e.0). -6.2. Disabling MSI below a bridge +To disable MSIs, echo 0 instead of 1. Changing this value should be +done with caution as it can break interrupt handling for all devices +below this bridge. -The vast majority of MSI quirks are required by PCI bridges not -being able to route MSI between busses. In this case, MSI have to be -disabled on all devices behind this bridge. It is achieves by setting -the PCI_BUS_FLAGS_NO_MSI flag in the pci_bus->bus_flags of the bridge -subordinate bus. There is no need to set the same flag on bridges that -are below the broken bridge. When pci_enable_msi() is called to enable -MSI on a device, pci_msi_supported() takes care of checking the NO_MSI -flag in all parent busses of the device. +Again, please notify linux-pci@vger.kernel.org of any bridges that need +special handling. -Some bridges actually support dynamic MSI support enabling/disabling -by changing some bits in their PCI configuration space (especially -the Hypertransport chipsets such as the nVidia nForce and Serverworks -HT2000). It may then be required to update the NO_MSI flag on the -corresponding devices in the sysfs hierarchy. To enable MSI support -on device "0000:00:0e", do: +5.3. Disabling MSIs on a single device - echo 1 > /sys/bus/pci/devices/0000:00:0e/msi_bus +Some devices are known to have faulty MSI implementations. Usually this +is handled in the individual device driver but occasionally it's necessary +to handle this with a quirk. Some drivers have an option to disable use +of MSI. While this is a convenient workaround for the driver author, +it is not good practise, and should not be emulated. -To disable MSI support, echo 0 instead of 1. Note that it should be -used with caution since changing this value might break interrupts. +5.4. Finding why MSIs are disabled on a device -6.3. Disabling MSI globally +From the above three sections, you can see that there are many reasons +why MSIs may not be enabled for a given device. Your first step should +be to examine your dmesg carefully to determine whether MSIs are enabled +for your machine. You should also check your .config to be sure you +have enabled CONFIG_PCI_MSI. -Some extreme cases may require to disable MSI globally on the system. -For now, the only known case is a Serverworks PCI-X chipsets (MSI are -not supported on several busses that are not all connected to the -chipset in the Linux PCI hierarchy). In the vast majority of other -cases, disabling only behind a specific bridge is enough. +Then, 'lspci -t' gives the list of bridges above a device. Reading +/sys/bus/pci/devices/*/msi_bus will tell you whether MSI are enabled (1) +or disabled (0). If 0 is found in any of the msi_bus files belonging +to bridges between the PCI root and the device, MSIs are disabled. -For debugging purpose, the user may also pass pci=nomsi on the kernel -command-line to explicitly disable MSI globally. But, once the appro- -priate quirks are added to the kernel, this option should not be -required anymore. - -6.4. Finding why MSI cannot be enabled on a device - -Assuming that MSI are not enabled on a device, you should look at -dmesg to find messages that quirks may output when disabling MSI -on some devices, some bridges or even globally. -Then, lspci -t gives the list of bridges above a device. Reading -/sys/bus/pci/devices/0000:00:0e/msi_bus will tell you whether MSI -are enabled (1) or disabled (0). In 0 is found in a single bridge -msi_bus file above the device, MSI cannot be enabled. - -7. FAQ - -Q1. Are there any limitations on using the MSI? - -A1. If the PCI device supports MSI and conforms to the -specification and the platform supports the APIC local bus, -then using MSI should work. - -Q2. Will it work on all the Pentium processors (P3, P4, Xeon, -AMD processors)? In P3 IPI's are transmitted on the APIC local -bus and in P4 and Xeon they are transmitted on the system -bus. Are there any implications with this? - -A2. MSI support enables a PCI device sending an inbound -memory write (0xfeexxxxx as target address) on its PCI bus -directly to the FSB. Since the message address has a -redirection hint bit cleared, it should work. - -Q3. The target address 0xfeexxxxx will be translated by the -Host Bridge into an interrupt message. Are there any -limitations on the chipsets such as Intel 8xx, Intel e7xxx, -or VIA? - -A3. If these chipsets support an inbound memory write with -target address set as 0xfeexxxxx, as conformed to PCI -specification 2.3 or latest, then it should work. - -Q4. From the driver point of view, if the MSI is lost because -of errors occurring during inbound memory write, then it may -wait forever. Is there a mechanism for it to recover? - -A4. Since the target of the transaction is an inbound memory -write, all transaction termination conditions (Retry, -Master-Abort, Target-Abort, or normal completion) are -supported. A device sending an MSI must abide by all the PCI -rules and conditions regarding that inbound memory write. So, -if a retry is signaled it must retry, etc... We believe that -the recommendation for Abort is also a retry (refer to PCI -specification 2.3 or latest). +It is also worth checking the device driver to see whether it supports MSIs. +For example, it may contain calls to pci_enable_msi(), pci_enable_msix() or +pci_enable_msi_block(). From 24d27553390c69d11cdbd930d635193956fc295f Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Tue, 17 Mar 2009 08:54:06 -0400 Subject: [PATCH 109/478] PCI MSI: Replace 'type' with 'is_msix' By changing from a 5-bit field to a 1-bit field, we free up some bits that can be used by a later patch. Also rearrange the fields for better packing on 64-bit platforms (reducing the size of msi_desc from 72 bytes to 64 bytes). Signed-off-by: Matthew Wilcox Signed-off-by: Jesse Barnes --- drivers/pci/msi.c | 115 +++++++++++++++----------------------------- include/linux/msi.h | 4 +- 2 files changed, 41 insertions(+), 78 deletions(-) diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c index dceea56f7342..b3db4388f974 100644 --- a/drivers/pci/msi.c +++ b/drivers/pci/msi.c @@ -111,20 +111,10 @@ static void msix_flush_writes(struct irq_desc *desc) entry = get_irq_desc_msi(desc); BUG_ON(!entry || !entry->dev); - switch (entry->msi_attrib.type) { - case PCI_CAP_ID_MSI: - /* nothing to do */ - break; - case PCI_CAP_ID_MSIX: - { + if (entry->msi_attrib.is_msix) { int offset = entry->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE + PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET; readl(entry->mask_base + offset); - break; - } - default: - BUG(); - break; } } @@ -143,32 +133,23 @@ static int msi_set_mask_bits(struct irq_desc *desc, u32 mask, u32 flag) entry = get_irq_desc_msi(desc); BUG_ON(!entry || !entry->dev); - switch (entry->msi_attrib.type) { - case PCI_CAP_ID_MSI: - if (entry->msi_attrib.maskbit) { - int pos; - u32 mask_bits; - - pos = (long)entry->mask_base; - pci_read_config_dword(entry->dev, pos, &mask_bits); - mask_bits &= ~(mask); - mask_bits |= flag & mask; - pci_write_config_dword(entry->dev, pos, mask_bits); - } else { - return 0; - } - break; - case PCI_CAP_ID_MSIX: - { + if (entry->msi_attrib.is_msix) { int offset = entry->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE + PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET; writel(flag, entry->mask_base + offset); readl(entry->mask_base + offset); - break; - } - default: - BUG(); - break; + } else { + int pos; + u32 mask_bits; + + if (!entry->msi_attrib.maskbit) + return 0; + + pos = (long)entry->mask_base; + pci_read_config_dword(entry->dev, pos, &mask_bits); + mask_bits &= ~mask; + mask_bits |= flag & mask; + pci_write_config_dword(entry->dev, pos, mask_bits); } entry->msi_attrib.masked = !!flag; return 1; @@ -177,9 +158,14 @@ static int msi_set_mask_bits(struct irq_desc *desc, u32 mask, u32 flag) void read_msi_msg_desc(struct irq_desc *desc, struct msi_msg *msg) { struct msi_desc *entry = get_irq_desc_msi(desc); - switch(entry->msi_attrib.type) { - case PCI_CAP_ID_MSI: - { + if (entry->msi_attrib.is_msix) { + void __iomem *base = entry->mask_base + + entry->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE; + + msg->address_lo = readl(base + PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET); + msg->address_hi = readl(base + PCI_MSIX_ENTRY_UPPER_ADDR_OFFSET); + msg->data = readl(base + PCI_MSIX_ENTRY_DATA_OFFSET); + } else { struct pci_dev *dev = entry->dev; int pos = entry->msi_attrib.pos; u16 data; @@ -195,21 +181,6 @@ void read_msi_msg_desc(struct irq_desc *desc, struct msi_msg *msg) pci_read_config_word(dev, msi_data_reg(pos, 0), &data); } msg->data = data; - break; - } - case PCI_CAP_ID_MSIX: - { - void __iomem *base; - base = entry->mask_base + - entry->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE; - - msg->address_lo = readl(base + PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET); - msg->address_hi = readl(base + PCI_MSIX_ENTRY_UPPER_ADDR_OFFSET); - msg->data = readl(base + PCI_MSIX_ENTRY_DATA_OFFSET); - break; - } - default: - BUG(); } } @@ -223,9 +194,17 @@ void read_msi_msg(unsigned int irq, struct msi_msg *msg) void write_msi_msg_desc(struct irq_desc *desc, struct msi_msg *msg) { struct msi_desc *entry = get_irq_desc_msi(desc); - switch (entry->msi_attrib.type) { - case PCI_CAP_ID_MSI: - { + if (entry->msi_attrib.is_msix) { + void __iomem *base; + base = entry->mask_base + + entry->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE; + + writel(msg->address_lo, + base + PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET); + writel(msg->address_hi, + base + PCI_MSIX_ENTRY_UPPER_ADDR_OFFSET); + writel(msg->data, base + PCI_MSIX_ENTRY_DATA_OFFSET); + } else { struct pci_dev *dev = entry->dev; int pos = entry->msi_attrib.pos; @@ -240,23 +219,6 @@ void write_msi_msg_desc(struct irq_desc *desc, struct msi_msg *msg) pci_write_config_word(dev, msi_data_reg(pos, 0), msg->data); } - break; - } - case PCI_CAP_ID_MSIX: - { - void __iomem *base; - base = entry->mask_base + - entry->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE; - - writel(msg->address_lo, - base + PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET); - writel(msg->address_hi, - base + PCI_MSIX_ENTRY_UPPER_ADDR_OFFSET); - writel(msg->data, base + PCI_MSIX_ENTRY_DATA_OFFSET); - break; - } - default: - BUG(); } entry->msg = *msg; } @@ -393,7 +355,7 @@ static int msi_capability_init(struct pci_dev *dev) if (!entry) return -ENOMEM; - entry->msi_attrib.type = PCI_CAP_ID_MSI; + entry->msi_attrib.is_msix = 0; entry->msi_attrib.is_64 = is_64bit_address(control); entry->msi_attrib.entry_nr = 0; entry->msi_attrib.maskbit = is_mask_bit_support(control); @@ -475,7 +437,7 @@ static int msix_capability_init(struct pci_dev *dev, break; j = entries[i].entry; - entry->msi_attrib.type = PCI_CAP_ID_MSIX; + entry->msi_attrib.is_msix = 1; entry->msi_attrib.is_64 = 1; entry->msi_attrib.entry_nr = j; entry->msi_attrib.maskbit = 1; @@ -619,12 +581,13 @@ void pci_msi_shutdown(struct pci_dev* dev) struct irq_desc *desc = irq_to_desc(dev->irq); msi_set_mask_bits(desc, mask, ~mask); } - if (!entry->dev || entry->msi_attrib.type != PCI_CAP_ID_MSI) + if (!entry->dev || entry->msi_attrib.is_msix) return; /* Restore dev->irq to its default pin-assertion irq */ dev->irq = entry->msi_attrib.default_irq; } + void pci_disable_msi(struct pci_dev* dev) { struct msi_desc *entry; @@ -635,7 +598,7 @@ void pci_disable_msi(struct pci_dev* dev) pci_msi_shutdown(dev); entry = list_entry(dev->msi_list.next, struct msi_desc, list); - if (!entry->dev || entry->msi_attrib.type != PCI_CAP_ID_MSI) + if (!entry->dev || entry->msi_attrib.is_msix) return; msi_free_irqs(dev); @@ -654,7 +617,7 @@ static int msi_free_irqs(struct pci_dev* dev) arch_teardown_msi_irqs(dev); list_for_each_entry_safe(entry, tmp, &dev->msi_list, list) { - if (entry->msi_attrib.type == PCI_CAP_ID_MSIX) { + if (entry->msi_attrib.is_msix) { writel(1, entry->mask_base + entry->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE + PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET); diff --git a/include/linux/msi.h b/include/linux/msi.h index d2b8a1e8ca11..9c5ce214fbf4 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -20,13 +20,13 @@ extern void write_msi_msg(unsigned int irq, struct msi_msg *msg); struct msi_desc { struct { - __u8 type : 5; /* {0: unused, 5h:MSI, 11h:MSI-X} */ + __u8 is_msix : 1; __u8 maskbit : 1; /* mask-pending bit supported ? */ __u8 masked : 1; __u8 is_64 : 1; /* Address size: 0=32bit 1=64bit */ __u8 pos; /* Location of the msi capability */ - __u32 maskbits_mask; /* mask bits mask */ __u16 entry_nr; /* specific enabled entry */ + __u32 maskbits_mask; /* mask bits mask */ unsigned default_irq; /* default pre-assigned irq */ }msi_attrib; From 379f5327a86f7822a51ec7d088a085167724df75 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Tue, 17 Mar 2009 08:54:07 -0400 Subject: [PATCH 110/478] PCI MSI: msi_desc->dev is always initialised By passing the pci_dev into alloc_msi_entry() we can be sure that the ->dev entry is always assigned and so we don't need to check it. Also, we used kzalloc() so we don't need to initialise ->irq to 0. Signed-off-by: Matthew Wilcox Signed-off-by: Jesse Barnes --- drivers/pci/msi.c | 29 ++++++++++++----------------- 1 file changed, 12 insertions(+), 17 deletions(-) diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c index b3db4388f974..a658c0f34e16 100644 --- a/drivers/pci/msi.c +++ b/drivers/pci/msi.c @@ -110,7 +110,7 @@ static void msix_flush_writes(struct irq_desc *desc) struct msi_desc *entry; entry = get_irq_desc_msi(desc); - BUG_ON(!entry || !entry->dev); + BUG_ON(!entry); if (entry->msi_attrib.is_msix) { int offset = entry->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE + PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET; @@ -132,7 +132,7 @@ static int msi_set_mask_bits(struct irq_desc *desc, u32 mask, u32 flag) struct msi_desc *entry; entry = get_irq_desc_msi(desc); - BUG_ON(!entry || !entry->dev); + BUG_ON(!entry); if (entry->msi_attrib.is_msix) { int offset = entry->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE + PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET; @@ -248,19 +248,16 @@ void unmask_msi_irq(unsigned int irq) static int msi_free_irqs(struct pci_dev* dev); -static struct msi_desc* alloc_msi_entry(void) +static struct msi_desc *alloc_msi_entry(struct pci_dev *dev) { - struct msi_desc *entry; - - entry = kzalloc(sizeof(struct msi_desc), GFP_KERNEL); - if (!entry) + struct msi_desc *desc = kzalloc(sizeof(*desc), GFP_KERNEL); + if (!desc) return NULL; - INIT_LIST_HEAD(&entry->list); - entry->irq = 0; - entry->dev = NULL; + INIT_LIST_HEAD(&desc->list); + desc->dev = dev; - return entry; + return desc; } static void pci_intx_for_msi(struct pci_dev *dev, int enable) @@ -351,7 +348,7 @@ static int msi_capability_init(struct pci_dev *dev) pos = pci_find_capability(dev, PCI_CAP_ID_MSI); pci_read_config_word(dev, msi_control_reg(pos), &control); /* MSI Entry Initialization */ - entry = alloc_msi_entry(); + entry = alloc_msi_entry(dev); if (!entry) return -ENOMEM; @@ -362,7 +359,6 @@ static int msi_capability_init(struct pci_dev *dev) entry->msi_attrib.masked = 1; entry->msi_attrib.default_irq = dev->irq; /* Save IOAPIC IRQ */ entry->msi_attrib.pos = pos; - entry->dev = dev; if (entry->msi_attrib.maskbit) { unsigned int base, maskbits, temp; @@ -432,7 +428,7 @@ static int msix_capability_init(struct pci_dev *dev, /* MSI-X Table Initialization */ for (i = 0; i < nvec; i++) { - entry = alloc_msi_entry(); + entry = alloc_msi_entry(dev); if (!entry) break; @@ -444,7 +440,6 @@ static int msix_capability_init(struct pci_dev *dev, entry->msi_attrib.masked = 1; entry->msi_attrib.default_irq = dev->irq; entry->msi_attrib.pos = pos; - entry->dev = dev; entry->mask_base = base; list_add_tail(&entry->list, &dev->msi_list); @@ -581,7 +576,7 @@ void pci_msi_shutdown(struct pci_dev* dev) struct irq_desc *desc = irq_to_desc(dev->irq); msi_set_mask_bits(desc, mask, ~mask); } - if (!entry->dev || entry->msi_attrib.is_msix) + if (entry->msi_attrib.is_msix) return; /* Restore dev->irq to its default pin-assertion irq */ @@ -598,7 +593,7 @@ void pci_disable_msi(struct pci_dev* dev) pci_msi_shutdown(dev); entry = list_entry(dev->msi_list.next, struct msi_desc, list); - if (!entry->dev || entry->msi_attrib.is_msix) + if (entry->msi_attrib.is_msix) return; msi_free_irqs(dev); From 264d9caaa1c574c0274b019a810abfe957391005 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Tue, 17 Mar 2009 08:54:08 -0400 Subject: [PATCH 111/478] PCI MSI: Use mask_pos instead of mask_base when appropriate MSI interrupts have a mask_pos where MSI-X have a mask_base. Use a transparent union to get rid of some ugly casts. Signed-off-by: Matthew Wilcox Signed-off-by: Jesse Barnes --- drivers/pci/msi.c | 5 ++--- include/linux/msi.h | 5 ++++- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c index a658c0f34e16..fcde04df6dfe 100644 --- a/drivers/pci/msi.c +++ b/drivers/pci/msi.c @@ -145,7 +145,7 @@ static int msi_set_mask_bits(struct irq_desc *desc, u32 mask, u32 flag) if (!entry->msi_attrib.maskbit) return 0; - pos = (long)entry->mask_base; + pos = entry->mask_pos; pci_read_config_dword(entry->dev, pos, &mask_bits); mask_bits &= ~mask; mask_bits |= flag & mask; @@ -363,8 +363,7 @@ static int msi_capability_init(struct pci_dev *dev) unsigned int base, maskbits, temp; base = msi_mask_bits_reg(pos, entry->msi_attrib.is_64); - entry->mask_base = (void __iomem *)(long)base; - + entry->mask_pos = base; /* All MSIs are unmasked by default, Mask them all */ pci_read_config_dword(dev, base, &maskbits); temp = msi_mask((control & PCI_MSI_FLAGS_QMASK) >> 1); diff --git a/include/linux/msi.h b/include/linux/msi.h index 9c5ce214fbf4..5025ca4d91e4 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -33,7 +33,10 @@ struct msi_desc { unsigned int irq; struct list_head list; - void __iomem *mask_base; + union { + void __iomem *mask_base; + u8 mask_pos; + }; struct pci_dev *dev; /* Last set MSI message */ From f2440d9acbe866b917b16cc0f927366341ce9215 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Tue, 17 Mar 2009 08:54:09 -0400 Subject: [PATCH 112/478] PCI MSI: Refactor interrupt masking code Since most of the callers already know whether they have an MSI or an MSI-X capability, split msi_set_mask_bits() into msi_mask_irq() and msix_mask_irq(). The only callers which don't (mask_msi_irq() and unmask_msi_irq()) can share code in msi_set_mask_bit(). This then becomes the only caller of msix_flush_writes(), so we can inline it. The flushing read can be to any address that belongs to the device, so we can eliminate the calculation too. We can also get rid of maskbits_mask from struct msi_desc and simply recalculate it on the rare occasion that we need it. The single-bit 'masked' element is replaced by a copy of the 32-bit 'masked' register, so this patch does not affect the size of msi_desc. Signed-off-by: Matthew Wilcox Signed-off-by: Jesse Barnes --- drivers/pci/msi.c | 155 +++++++++++++++++++++----------------------- include/linux/msi.h | 5 +- 2 files changed, 77 insertions(+), 83 deletions(-) diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c index fcde04df6dfe..adcc78242571 100644 --- a/drivers/pci/msi.c +++ b/drivers/pci/msi.c @@ -105,17 +105,14 @@ static inline __attribute_const__ u32 msi_mask(unsigned x) return (1 << (1 << x)) - 1; } -static void msix_flush_writes(struct irq_desc *desc) +static inline __attribute_const__ u32 msi_capable_mask(u16 control) { - struct msi_desc *entry; + return msi_mask((control >> 1) & 7); +} - entry = get_irq_desc_msi(desc); - BUG_ON(!entry); - if (entry->msi_attrib.is_msix) { - int offset = entry->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE + - PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET; - readl(entry->mask_base + offset); - } +static inline __attribute_const__ u32 msi_enabled_mask(u16 control) +{ + return msi_mask((control >> 4) & 7); } /* @@ -127,32 +124,57 @@ static void msix_flush_writes(struct irq_desc *desc) * Returns 1 if it succeeded in masking the interrupt and 0 if the device * doesn't support MSI masking. */ -static int msi_set_mask_bits(struct irq_desc *desc, u32 mask, u32 flag) +static void msi_mask_irq(struct msi_desc *desc, u32 mask, u32 flag) { - struct msi_desc *entry; + u32 mask_bits = desc->masked; - entry = get_irq_desc_msi(desc); - BUG_ON(!entry); - if (entry->msi_attrib.is_msix) { - int offset = entry->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE + - PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET; - writel(flag, entry->mask_base + offset); - readl(entry->mask_base + offset); + if (!desc->msi_attrib.maskbit) + return; + + mask_bits &= ~mask; + mask_bits |= flag; + pci_write_config_dword(desc->dev, desc->mask_pos, mask_bits); + desc->masked = mask_bits; +} + +/* + * This internal function does not flush PCI writes to the device. + * All users must ensure that they read from the device before either + * assuming that the device state is up to date, or returning out of this + * file. This saves a few milliseconds when initialising devices with lots + * of MSI-X interrupts. + */ +static void msix_mask_irq(struct msi_desc *desc, u32 flag) +{ + u32 mask_bits = desc->masked; + unsigned offset = desc->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE + + PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET; + mask_bits &= ~1; + mask_bits |= flag; + writel(mask_bits, desc->mask_base + offset); + desc->masked = mask_bits; +} + +static void msi_set_mask_bit(unsigned irq, u32 flag) +{ + struct msi_desc *desc = get_irq_msi(irq); + + if (desc->msi_attrib.is_msix) { + msix_mask_irq(desc, flag); + readl(desc->mask_base); /* Flush write to device */ } else { - int pos; - u32 mask_bits; - - if (!entry->msi_attrib.maskbit) - return 0; - - pos = entry->mask_pos; - pci_read_config_dword(entry->dev, pos, &mask_bits); - mask_bits &= ~mask; - mask_bits |= flag & mask; - pci_write_config_dword(entry->dev, pos, mask_bits); + msi_mask_irq(desc, 1, flag); } - entry->msi_attrib.masked = !!flag; - return 1; +} + +void mask_msi_irq(unsigned int irq) +{ + msi_set_mask_bit(irq, 1); +} + +void unmask_msi_irq(unsigned int irq) +{ + msi_set_mask_bit(irq, 0); } void read_msi_msg_desc(struct irq_desc *desc, struct msi_msg *msg) @@ -230,22 +252,6 @@ void write_msi_msg(unsigned int irq, struct msi_msg *msg) write_msi_msg_desc(desc, msg); } -void mask_msi_irq(unsigned int irq) -{ - struct irq_desc *desc = irq_to_desc(irq); - - msi_set_mask_bits(desc, 1, 1); - msix_flush_writes(desc); -} - -void unmask_msi_irq(unsigned int irq) -{ - struct irq_desc *desc = irq_to_desc(irq); - - msi_set_mask_bits(desc, 1, 0); - msix_flush_writes(desc); -} - static int msi_free_irqs(struct pci_dev* dev); static struct msi_desc *alloc_msi_entry(struct pci_dev *dev) @@ -281,13 +287,9 @@ static void __pci_restore_msi_state(struct pci_dev *dev) pci_intx_for_msi(dev, 0); msi_set_enable(dev, 0); write_msi_msg(dev->irq, &entry->msg); - if (entry->msi_attrib.maskbit) { - struct irq_desc *desc = irq_to_desc(dev->irq); - msi_set_mask_bits(desc, entry->msi_attrib.maskbits_mask, - entry->msi_attrib.masked); - } pci_read_config_word(dev, pos + PCI_MSI_FLAGS, &control); + msi_mask_irq(entry, msi_capable_mask(control), entry->masked); control &= ~PCI_MSI_FLAGS_QSIZE; control |= PCI_MSI_FLAGS_ENABLE; pci_write_config_word(dev, pos + PCI_MSI_FLAGS, control); @@ -307,9 +309,8 @@ static void __pci_restore_msix_state(struct pci_dev *dev) msix_set_enable(dev, 0); list_for_each_entry(entry, &dev->msi_list, list) { - struct irq_desc *desc = irq_to_desc(entry->irq); write_msi_msg(entry->irq, &entry->msg); - msi_set_mask_bits(desc, 1, entry->msi_attrib.masked); + msix_mask_irq(entry, entry->masked); } BUG_ON(list_empty(&dev->msi_list)); @@ -342,6 +343,7 @@ static int msi_capability_init(struct pci_dev *dev) struct msi_desc *entry; int pos, ret; u16 control; + unsigned mask; msi_set_enable(dev, 0); /* Ensure msi is disabled as I set it up */ @@ -356,21 +358,16 @@ static int msi_capability_init(struct pci_dev *dev) entry->msi_attrib.is_64 = is_64bit_address(control); entry->msi_attrib.entry_nr = 0; entry->msi_attrib.maskbit = is_mask_bit_support(control); - entry->msi_attrib.masked = 1; entry->msi_attrib.default_irq = dev->irq; /* Save IOAPIC IRQ */ entry->msi_attrib.pos = pos; - if (entry->msi_attrib.maskbit) { - unsigned int base, maskbits, temp; - base = msi_mask_bits_reg(pos, entry->msi_attrib.is_64); - entry->mask_pos = base; - /* All MSIs are unmasked by default, Mask them all */ - pci_read_config_dword(dev, base, &maskbits); - temp = msi_mask((control & PCI_MSI_FLAGS_QMASK) >> 1); - maskbits |= temp; - pci_write_config_dword(dev, base, maskbits); - entry->msi_attrib.maskbits_mask = temp; - } + entry->mask_pos = msi_mask_bits_reg(pos, entry->msi_attrib.is_64); + /* All MSIs are unmasked by default, Mask them all */ + if (entry->msi_attrib.maskbit) + pci_read_config_dword(dev, entry->mask_pos, &entry->masked); + mask = msi_capable_mask(control); + msi_mask_irq(entry, mask, mask); + list_add_tail(&entry->list, &dev->msi_list); /* Configure MSI capability structure */ @@ -435,11 +432,12 @@ static int msix_capability_init(struct pci_dev *dev, entry->msi_attrib.is_msix = 1; entry->msi_attrib.is_64 = 1; entry->msi_attrib.entry_nr = j; - entry->msi_attrib.maskbit = 1; - entry->msi_attrib.masked = 1; entry->msi_attrib.default_irq = dev->irq; entry->msi_attrib.pos = pos; entry->mask_base = base; + entry->masked = readl(base + j * PCI_MSIX_ENTRY_SIZE + + PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET); + msix_mask_irq(entry, 1); list_add_tail(&entry->list, &dev->msi_list); } @@ -556,9 +554,11 @@ int pci_enable_msi(struct pci_dev* dev) } EXPORT_SYMBOL(pci_enable_msi); -void pci_msi_shutdown(struct pci_dev* dev) +void pci_msi_shutdown(struct pci_dev *dev) { - struct msi_desc *entry; + struct msi_desc *desc; + u32 mask; + u16 ctrl; if (!pci_msi_enable || !dev || !dev->msi_enabled) return; @@ -568,18 +568,13 @@ void pci_msi_shutdown(struct pci_dev* dev) dev->msi_enabled = 0; BUG_ON(list_empty(&dev->msi_list)); - entry = list_entry(dev->msi_list.next, struct msi_desc, list); - /* Return the the pci reset with msi irqs unmasked */ - if (entry->msi_attrib.maskbit) { - u32 mask = entry->msi_attrib.maskbits_mask; - struct irq_desc *desc = irq_to_desc(dev->irq); - msi_set_mask_bits(desc, mask, ~mask); - } - if (entry->msi_attrib.is_msix) - return; + desc = list_first_entry(&dev->msi_list, struct msi_desc, list); + pci_read_config_word(dev, desc->msi_attrib.pos + PCI_MSI_FLAGS, &ctrl); + mask = msi_capable_mask(ctrl); + msi_mask_irq(desc, mask, ~mask); /* Restore dev->irq to its default pin-assertion irq */ - dev->irq = entry->msi_attrib.default_irq; + dev->irq = desc->msi_attrib.default_irq; } void pci_disable_msi(struct pci_dev* dev) diff --git a/include/linux/msi.h b/include/linux/msi.h index 5025ca4d91e4..37c1bbe546e5 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -22,14 +22,13 @@ struct msi_desc { struct { __u8 is_msix : 1; __u8 maskbit : 1; /* mask-pending bit supported ? */ - __u8 masked : 1; __u8 is_64 : 1; /* Address size: 0=32bit 1=64bit */ __u8 pos; /* Location of the msi capability */ __u16 entry_nr; /* specific enabled entry */ - __u32 maskbits_mask; /* mask bits mask */ unsigned default_irq; /* default pre-assigned irq */ - }msi_attrib; + } msi_attrib; + u32 masked; /* mask bits */ unsigned int irq; struct list_head list; From 1c8d7b0a562da06d3ebe83f01b1ed553205d1ae4 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Tue, 17 Mar 2009 08:54:10 -0400 Subject: [PATCH 113/478] PCI MSI: Add support for multiple MSI Add the new API pci_enable_msi_block() to allow drivers to request multiple MSI and reimplement pci_enable_msi in terms of pci_enable_msi_block. Ensure that the architecture back ends don't have to know about multiple MSI. Signed-off-by: Matthew Wilcox Signed-off-by: Jesse Barnes --- Documentation/PCI/MSI-HOWTO.txt | 45 +++++++++++++--- arch/powerpc/kernel/msi.c | 4 ++ arch/x86/kernel/io_apic.c | 4 ++ drivers/pci/msi.c | 91 +++++++++++++++++++++++---------- drivers/pci/msi.h | 6 --- include/linux/msi.h | 1 + include/linux/pci.h | 6 ++- 7 files changed, 116 insertions(+), 41 deletions(-) diff --git a/Documentation/PCI/MSI-HOWTO.txt b/Documentation/PCI/MSI-HOWTO.txt index 1c02431f1d1a..9494f6dc38eb 100644 --- a/Documentation/PCI/MSI-HOWTO.txt +++ b/Documentation/PCI/MSI-HOWTO.txt @@ -94,15 +94,48 @@ This function should be called before the driver calls request_irq() since enabling MSIs disables the pin-based IRQ and the driver will not receive interrupts on the old interrupt. -4.2.2 pci_disable_msi +4.2.2 pci_enable_msi_block + +int pci_enable_msi_block(struct pci_dev *dev, int count) + +This variation on the above call allows a device driver to request multiple +MSIs. The MSI specification only allows interrupts to be allocated in +powers of two, up to a maximum of 2^5 (32). + +If this function returns 0, it has succeeded in allocating at least as many +interrupts as the driver requested (it may have allocated more in order +to satisfy the power-of-two requirement). In this case, the function +enables MSI on this device and updates dev->irq to be the lowest of +the new interrupts assigned to it. The other interrupts assigned to +the device are in the range dev->irq to dev->irq + count - 1. + +If this function returns a negative number, it indicates an error and +the driver should not attempt to request any more MSI interrupts for +this device. If this function returns a positive number, it will be +less than 'count' and indicate the number of interrupts that could have +been allocated. In neither case will the irq value have been +updated, nor will the device have been switched into MSI mode. + +The device driver must decide what action to take if +pci_enable_msi_block() returns a value less than the number asked for. +Some devices can make use of fewer interrupts than the maximum they +request; in this case the driver should call pci_enable_msi_block() +again. Note that it is not guaranteed to succeed, even when the +'count' has been reduced to the value returned from a previous call to +pci_enable_msi_block(). This is because there are multiple constraints +on the number of vectors that can be allocated; pci_enable_msi_block() +will return as soon as it finds any constraint that doesn't allow the +call to succeed. + +4.2.3 pci_disable_msi void pci_disable_msi(struct pci_dev *dev) -This function should be used to undo the effect of pci_enable_msi(). -Calling it restores dev->irq to the pin-based interrupt number and frees -the previously allocated message signaled interrupt(s). The interrupt -may subsequently be assigned to another device, so drivers should not -cache the value of dev->irq. +This function should be used to undo the effect of pci_enable_msi() or +pci_enable_msi_block(). Calling it restores dev->irq to the pin-based +interrupt number and frees the previously allocated message signaled +interrupt(s). The interrupt may subsequently be assigned to another +device, so drivers should not cache the value of dev->irq. A device driver must always call free_irq() on the interrupt(s) for which it has called request_irq() before calling this function. diff --git a/arch/powerpc/kernel/msi.c b/arch/powerpc/kernel/msi.c index 3bb7d3dd28be..0c16e2a854e5 100644 --- a/arch/powerpc/kernel/msi.c +++ b/arch/powerpc/kernel/msi.c @@ -19,6 +19,10 @@ int arch_msi_check_device(struct pci_dev* dev, int nvec, int type) return -ENOSYS; } + /* PowerPC doesn't support multiple MSI yet */ + if (type == PCI_CAP_ID_MSI && nvec > 1) + return 1; + if (ppc_md.msi_check_device) { pr_debug("msi: Using platform check routine.\n"); return ppc_md.msi_check_device(dev, nvec, type); diff --git a/arch/x86/kernel/io_apic.c b/arch/x86/kernel/io_apic.c index bc7ac4da90d7..a09549a6321b 100644 --- a/arch/x86/kernel/io_apic.c +++ b/arch/x86/kernel/io_apic.c @@ -3510,6 +3510,10 @@ int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) int index = 0; #endif + /* x86 doesn't support multiple MSI yet */ + if (type == PCI_CAP_ID_MSI && nvec > 1) + return 1; + irq_want = nr_irqs_gsi; sub_handle = 0; list_for_each_entry(msidesc, &dev->msi_list, list) { diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c index adcc78242571..6f2e6295e773 100644 --- a/drivers/pci/msi.c +++ b/drivers/pci/msi.c @@ -40,6 +40,13 @@ int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) struct msi_desc *entry; int ret; + /* + * If an architecture wants to support multiple MSI, it needs to + * override arch_setup_msi_irqs() + */ + if (type == PCI_CAP_ID_MSI && nvec > 1) + return 1; + list_for_each_entry(entry, &dev->msi_list, list) { ret = arch_setup_msi_irq(dev, entry); if (ret < 0) @@ -58,8 +65,12 @@ void arch_teardown_msi_irqs(struct pci_dev *dev) struct msi_desc *entry; list_for_each_entry(entry, &dev->msi_list, list) { - if (entry->irq != 0) - arch_teardown_msi_irq(entry->irq); + int i, nvec; + if (entry->irq == 0) + continue; + nvec = 1 << entry->msi_attrib.multiple; + for (i = 0; i < nvec; i++) + arch_teardown_msi_irq(entry->irq + i); } } #endif @@ -163,7 +174,8 @@ static void msi_set_mask_bit(unsigned irq, u32 flag) msix_mask_irq(desc, flag); readl(desc->mask_base); /* Flush write to device */ } else { - msi_mask_irq(desc, 1, flag); + unsigned offset = irq - desc->dev->irq; + msi_mask_irq(desc, 1 << offset, flag << offset); } } @@ -229,6 +241,12 @@ void write_msi_msg_desc(struct irq_desc *desc, struct msi_msg *msg) } else { struct pci_dev *dev = entry->dev; int pos = entry->msi_attrib.pos; + u16 msgctl; + + pci_read_config_word(dev, msi_control_reg(pos), &msgctl); + msgctl &= ~PCI_MSI_FLAGS_QSIZE; + msgctl |= entry->msi_attrib.multiple << 4; + pci_write_config_word(dev, msi_control_reg(pos), msgctl); pci_write_config_dword(dev, msi_lower_address_reg(pos), msg->address_lo); @@ -291,7 +309,7 @@ static void __pci_restore_msi_state(struct pci_dev *dev) pci_read_config_word(dev, pos + PCI_MSI_FLAGS, &control); msi_mask_irq(entry, msi_capable_mask(control), entry->masked); control &= ~PCI_MSI_FLAGS_QSIZE; - control |= PCI_MSI_FLAGS_ENABLE; + control |= (entry->msi_attrib.multiple << 4) | PCI_MSI_FLAGS_ENABLE; pci_write_config_word(dev, pos + PCI_MSI_FLAGS, control); } @@ -332,13 +350,15 @@ EXPORT_SYMBOL_GPL(pci_restore_msi_state); /** * msi_capability_init - configure device's MSI capability structure * @dev: pointer to the pci_dev data structure of MSI device function + * @nvec: number of interrupts to allocate * - * Setup the MSI capability structure of device function with a single - * MSI irq, regardless of device function is capable of handling - * multiple messages. A return of zero indicates the successful setup - * of an entry zero with the new MSI irq or non-zero for otherwise. - **/ -static int msi_capability_init(struct pci_dev *dev) + * Setup the MSI capability structure of the device with the requested + * number of interrupts. A return value of zero indicates the successful + * setup of an entry with the new MSI irq. A negative return value indicates + * an error, and a positive return value indicates the number of interrupts + * which could have been allocated. + */ +static int msi_capability_init(struct pci_dev *dev, int nvec) { struct msi_desc *entry; int pos, ret; @@ -371,7 +391,7 @@ static int msi_capability_init(struct pci_dev *dev) list_add_tail(&entry->list, &dev->msi_list); /* Configure MSI capability structure */ - ret = arch_setup_msi_irqs(dev, 1, PCI_CAP_ID_MSI); + ret = arch_setup_msi_irqs(dev, nvec, PCI_CAP_ID_MSI); if (ret) { msi_free_irqs(dev); return ret; @@ -524,35 +544,48 @@ static int pci_msi_check_device(struct pci_dev* dev, int nvec, int type) } /** - * pci_enable_msi - configure device's MSI capability structure - * @dev: pointer to the pci_dev data structure of MSI device function + * pci_enable_msi_block - configure device's MSI capability structure + * @dev: device to configure + * @nvec: number of interrupts to configure * - * Setup the MSI capability structure of device function with - * a single MSI irq upon its software driver call to request for - * MSI mode enabled on its hardware device function. A return of zero - * indicates the successful setup of an entry zero with the new MSI - * irq or non-zero for otherwise. - **/ -int pci_enable_msi(struct pci_dev* dev) + * Allocate IRQs for a device with the MSI capability. + * This function returns a negative errno if an error occurs. If it + * is unable to allocate the number of interrupts requested, it returns + * the number of interrupts it might be able to allocate. If it successfully + * allocates at least the number of interrupts requested, it returns 0 and + * updates the @dev's irq member to the lowest new interrupt number; the + * other interrupt numbers allocated to this device are consecutive. + */ +int pci_enable_msi_block(struct pci_dev *dev, unsigned int nvec) { - int status; + int status, pos, maxvec; + u16 msgctl; - status = pci_msi_check_device(dev, 1, PCI_CAP_ID_MSI); + pos = pci_find_capability(dev, PCI_CAP_ID_MSI); + if (!pos) + return -EINVAL; + pci_read_config_word(dev, pos + PCI_MSI_FLAGS, &msgctl); + maxvec = 1 << ((msgctl & PCI_MSI_FLAGS_QMASK) >> 1); + if (nvec > maxvec) + return maxvec; + + status = pci_msi_check_device(dev, nvec, PCI_CAP_ID_MSI); if (status) return status; WARN_ON(!!dev->msi_enabled); - /* Check whether driver already requested for MSI-X irqs */ + /* Check whether driver already requested MSI-X irqs */ if (dev->msix_enabled) { dev_info(&dev->dev, "can't enable MSI " "(MSI-X already enabled)\n"); return -EINVAL; } - status = msi_capability_init(dev); + + status = msi_capability_init(dev, nvec); return status; } -EXPORT_SYMBOL(pci_enable_msi); +EXPORT_SYMBOL(pci_enable_msi_block); void pci_msi_shutdown(struct pci_dev *dev) { @@ -599,8 +632,12 @@ static int msi_free_irqs(struct pci_dev* dev) struct msi_desc *entry, *tmp; list_for_each_entry(entry, &dev->msi_list, list) { - if (entry->irq) - BUG_ON(irq_has_action(entry->irq)); + int i, nvec; + if (!entry->irq) + continue; + nvec = 1 << entry->msi_attrib.multiple; + for (i = 0; i < nvec; i++) + BUG_ON(irq_has_action(entry->irq + i)); } arch_teardown_msi_irqs(dev); diff --git a/drivers/pci/msi.h b/drivers/pci/msi.h index 3898f5237144..71f4df2ef654 100644 --- a/drivers/pci/msi.h +++ b/drivers/pci/msi.h @@ -20,14 +20,8 @@ #define msi_mask_bits_reg(base, is64bit) \ ( (is64bit == 1) ? base+PCI_MSI_MASK_BIT : base+PCI_MSI_MASK_BIT-4) #define msi_disable(control) control &= ~PCI_MSI_FLAGS_ENABLE -#define multi_msi_capable(control) \ - (1 << ((control & PCI_MSI_FLAGS_QMASK) >> 1)) -#define multi_msi_enable(control, num) \ - control |= (((num >> 1) << 4) & PCI_MSI_FLAGS_QSIZE); #define is_64bit_address(control) (!!(control & PCI_MSI_FLAGS_64BIT)) #define is_mask_bit_support(control) (!!(control & PCI_MSI_FLAGS_MASKBIT)) -#define msi_enable(control, num) multi_msi_enable(control, num); \ - control |= PCI_MSI_FLAGS_ENABLE #define msix_table_offset_reg(base) (base + 0x04) #define msix_pba_offset_reg(base) (base + 0x08) diff --git a/include/linux/msi.h b/include/linux/msi.h index 37c1bbe546e5..6991ab5b24d1 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -21,6 +21,7 @@ extern void write_msi_msg(unsigned int irq, struct msi_msg *msg); struct msi_desc { struct { __u8 is_msix : 1; + __u8 multiple: 3; /* log2 number of messages */ __u8 maskbit : 1; /* mask-pending bit supported ? */ __u8 is_64 : 1; /* Address size: 0=32bit 1=64bit */ __u8 pos; /* Location of the msi capability */ diff --git a/include/linux/pci.h b/include/linux/pci.h index 7baf2a5db12a..1f6c5ddaae36 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -789,7 +789,7 @@ struct msix_entry { #ifndef CONFIG_PCI_MSI -static inline int pci_enable_msi(struct pci_dev *dev) +static inline int pci_enable_msi_block(struct pci_dev *dev, unsigned int nvec) { return -1; } @@ -824,7 +824,7 @@ static inline int pci_msi_enabled(void) return 0; } #else -extern int pci_enable_msi(struct pci_dev *dev); +extern int pci_enable_msi_block(struct pci_dev *dev, unsigned int nvec); extern void pci_msi_shutdown(struct pci_dev *dev); extern void pci_disable_msi(struct pci_dev *dev); extern int pci_msix_table_size(struct pci_dev *dev); @@ -846,6 +846,8 @@ static inline int pcie_aspm_enabled(void) extern int pcie_aspm_enabled(void); #endif +#define pci_enable_msi(pdev) pci_enable_msi_block(pdev, 1) + #ifdef CONFIG_HT_IRQ /* The functions a driver should call */ int ht_create_irq(struct pci_dev *dev, int idx); From 32a9a682bef2f6fce7026bd94d1ce20028b0e52d Mon Sep 17 00:00:00 2001 From: Yuji Shimada Date: Mon, 16 Mar 2009 17:13:39 +0900 Subject: [PATCH 114/478] PCI: allow assignment of memory resources with a specified alignment This patch allows memory resources to be assigned with a specified alignment at boot-time or run-time. The patch is useful when we use PCI pass-through, because page-aligned memory resources are required to securely share PCI resources with guest drivers. If you want to assign the resource at boot time, please set "pci=resource_alignment=" boot parameter. This is format of "pci=resource_alignment=" boot parameter: [@][:]:.[; ...] Specifies alignment and device to reassign aligned memory resources. If is not specified, PAGE_SIZE is used as alignment. PCI-PCI bridge can be specified, if resource windows need to be expanded. This is example: pci=resource_alignment=20@07:00.0;18@0f:00.0;00:1d.7 If you want to assign the resource at run-time, please set "/sys/bus/pci/resource_alignment" file, and hot-remove the device and hot-add the device. For this purpose, fakephp or PCI hotplug interfaces can be used. The format of "/sys/bus/pci/resource_alignment" file is the same with boot parameter. You can use "," instead of ";". For example: # cd /sys/bus/pci # echo -n 20@12:00.0 > resource_alignment # echo 1 > devices/0000:12:00.0/remove # echo 1 > rescan Reviewed-by: Alex Chiang Reviewed-by: Yu Zhao Signed-off-by: Yuji Shimada Signed-off-by: Jesse Barnes --- Documentation/kernel-parameters.txt | 9 +++ drivers/pci/pci.c | 120 ++++++++++++++++++++++++++++ drivers/pci/pci.h | 6 ++ drivers/pci/quirks.c | 60 ++++++++++++++ drivers/pci/setup-res.c | 15 ++++ 5 files changed, 210 insertions(+) diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index c7c441e7930e..1754fedc531c 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -1760,6 +1760,15 @@ and is between 256 and 4096 characters. It is defined in the file cbmemsize=nn[KMG] The fixed amount of bus space which is reserved for the CardBus bridge's memory window. The default value is 64 megabytes. + resource_alignment= + Format: + [@][:]:.[; ...] + Specifies alignment and device to reassign + aligned memory resources. + If is not specified, + PAGE_SIZE is used as alignment. + PCI-PCI bridge can be specified, if resource + windows need to be expanded. pcie_aspm= [PCIE] Forcibly enable or disable PCIe Active State Power Management. diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index 8310dc2f943b..a35a8b2ba631 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -20,6 +20,8 @@ #include #include #include /* isa_dma_bridge_buggy */ +#include +#include #include "pci.h" unsigned int pci_pm_d3_delay = PCI_PM_D3_WAIT; @@ -2370,6 +2372,121 @@ int pci_resource_bar(struct pci_dev *dev, int resno, enum pci_bar_type *type) return 0; } +#define RESOURCE_ALIGNMENT_PARAM_SIZE COMMAND_LINE_SIZE +static char resource_alignment_param[RESOURCE_ALIGNMENT_PARAM_SIZE] = {0}; +spinlock_t resource_alignment_lock = SPIN_LOCK_UNLOCKED; + +/** + * pci_specified_resource_alignment - get resource alignment specified by user. + * @dev: the PCI device to get + * + * RETURNS: Resource alignment if it is specified. + * Zero if it is not specified. + */ +resource_size_t pci_specified_resource_alignment(struct pci_dev *dev) +{ + int seg, bus, slot, func, align_order, count; + resource_size_t align = 0; + char *p; + + spin_lock(&resource_alignment_lock); + p = resource_alignment_param; + while (*p) { + count = 0; + if (sscanf(p, "%d%n", &align_order, &count) == 1 && + p[count] == '@') { + p += count + 1; + } else { + align_order = -1; + } + if (sscanf(p, "%x:%x:%x.%x%n", + &seg, &bus, &slot, &func, &count) != 4) { + seg = 0; + if (sscanf(p, "%x:%x.%x%n", + &bus, &slot, &func, &count) != 3) { + /* Invalid format */ + printk(KERN_ERR "PCI: Can't parse resource_alignment parameter: %s\n", + p); + break; + } + } + p += count; + if (seg == pci_domain_nr(dev->bus) && + bus == dev->bus->number && + slot == PCI_SLOT(dev->devfn) && + func == PCI_FUNC(dev->devfn)) { + if (align_order == -1) { + align = PAGE_SIZE; + } else { + align = 1 << align_order; + } + /* Found */ + break; + } + if (*p != ';' && *p != ',') { + /* End of param or invalid format */ + break; + } + p++; + } + spin_unlock(&resource_alignment_lock); + return align; +} + +/** + * pci_is_reassigndev - check if specified PCI is target device to reassign + * @dev: the PCI device to check + * + * RETURNS: non-zero for PCI device is a target device to reassign, + * or zero is not. + */ +int pci_is_reassigndev(struct pci_dev *dev) +{ + return (pci_specified_resource_alignment(dev) != 0); +} + +ssize_t pci_set_resource_alignment_param(const char *buf, size_t count) +{ + if (count > RESOURCE_ALIGNMENT_PARAM_SIZE - 1) + count = RESOURCE_ALIGNMENT_PARAM_SIZE - 1; + spin_lock(&resource_alignment_lock); + strncpy(resource_alignment_param, buf, count); + resource_alignment_param[count] = '\0'; + spin_unlock(&resource_alignment_lock); + return count; +} + +ssize_t pci_get_resource_alignment_param(char *buf, size_t size) +{ + size_t count; + spin_lock(&resource_alignment_lock); + count = snprintf(buf, size, "%s", resource_alignment_param); + spin_unlock(&resource_alignment_lock); + return count; +} + +static ssize_t pci_resource_alignment_show(struct bus_type *bus, char *buf) +{ + return pci_get_resource_alignment_param(buf, PAGE_SIZE); +} + +static ssize_t pci_resource_alignment_store(struct bus_type *bus, + const char *buf, size_t count) +{ + return pci_set_resource_alignment_param(buf, count); +} + +BUS_ATTR(resource_alignment, 0644, pci_resource_alignment_show, + pci_resource_alignment_store); + +static int __init pci_resource_alignment_sysfs_init(void) +{ + return bus_create_file(&pci_bus_type, + &bus_attr_resource_alignment); +} + +late_initcall(pci_resource_alignment_sysfs_init); + static void __devinit pci_no_domains(void) { #ifdef CONFIG_PCI_DOMAINS @@ -2418,6 +2535,9 @@ static int __init pci_setup(char *str) pci_cardbus_io_size = memparse(str + 9, &str); } else if (!strncmp(str, "cbmemsize=", 10)) { pci_cardbus_mem_size = memparse(str + 10, &str); + } else if (!strncmp(str, "resource_alignment=", 19)) { + pci_set_resource_alignment_param(str + 19, + strlen(str + 19)); } else { printk(KERN_ERR "PCI: Unknown option `%s'\n", str); diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index 07c0aa5275e6..2cd1cba7236f 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -195,4 +195,10 @@ static inline int pci_ari_enabled(struct pci_bus *bus) return bus->self && bus->self->ari_enabled; } +#ifdef CONFIG_PCI_QUIRKS +extern int pci_is_reassigndev(struct pci_dev *dev); +resource_size_t pci_specified_resource_alignment(struct pci_dev *dev); +extern void pci_disable_bridge_window(struct pci_dev *dev); +#endif + #endif /* DRIVERS_PCI_H */ diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c index 5aa2afb23ef9..50233818a763 100644 --- a/drivers/pci/quirks.c +++ b/drivers/pci/quirks.c @@ -24,6 +24,7 @@ #include #include #include +#include #include "pci.h" int isa_dma_bridge_buggy; @@ -34,6 +35,65 @@ int pcie_mch_quirk; EXPORT_SYMBOL(pcie_mch_quirk); #ifdef CONFIG_PCI_QUIRKS +/* + * This quirk function disables the device and releases resources + * which is specified by kernel's boot parameter 'pci=resource_alignment='. + * It also rounds up size to specified alignment. + * Later on, the kernel will assign page-aligned memory resource back + * to that device. + */ +static void __devinit quirk_resource_alignment(struct pci_dev *dev) +{ + int i; + struct resource *r; + resource_size_t align, size; + + if (!pci_is_reassigndev(dev)) + return; + + if (dev->hdr_type == PCI_HEADER_TYPE_NORMAL && + (dev->class >> 8) == PCI_CLASS_BRIDGE_HOST) { + dev_warn(&dev->dev, + "Can't reassign resources to host bridge.\n"); + return; + } + + dev_info(&dev->dev, "Disabling device and release resources.\n"); + pci_disable_device(dev); + + align = pci_specified_resource_alignment(dev); + for (i=0; i < PCI_BRIDGE_RESOURCES; i++) { + r = &dev->resource[i]; + if (!(r->flags & IORESOURCE_MEM)) + continue; + size = resource_size(r); + if (size < align) { + size = align; + dev_info(&dev->dev, + "Rounding up size of resource #%d to %#llx.\n", + i, (unsigned long long)size); + } + r->end = size - 1; + r->start = 0; + } + /* Need to disable bridge's resource window, + * to enable the kernel to reassign new resource + * window later on. + */ + if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE && + (dev->class >> 8) == PCI_CLASS_BRIDGE_PCI) { + for (i = PCI_BRIDGE_RESOURCES; i < PCI_NUM_RESOURCES; i++) { + r = &dev->resource[i]; + if (!(r->flags & IORESOURCE_MEM)) + continue; + r->end = resource_size(r) - 1; + r->start = 0; + } + pci_disable_bridge_window(dev); + } +} +DECLARE_PCI_FIXUP_HEADER(PCI_ANY_ID, PCI_ANY_ID, quirk_resource_alignment); + /* The Mellanox Tavor device gives false positive parity errors * Mark this device with a broken_parity_status, to allow * PCI scanning code to "skip" this now blacklisted device. diff --git a/drivers/pci/setup-res.c b/drivers/pci/setup-res.c index 32e8d88a4619..3039fcb86afc 100644 --- a/drivers/pci/setup-res.c +++ b/drivers/pci/setup-res.c @@ -120,6 +120,21 @@ int pci_claim_resource(struct pci_dev *dev, int resource) return err; } +#ifdef CONFIG_PCI_QUIRKS +void pci_disable_bridge_window(struct pci_dev *dev) +{ + dev_dbg(&dev->dev, "Disabling bridge window.\n"); + + /* MMIO Base/Limit */ + pci_write_config_dword(dev, PCI_MEMORY_BASE, 0x0000fff0); + + /* Prefetchable MMIO Base/Limit */ + pci_write_config_dword(dev, PCI_PREF_LIMIT_UPPER32, 0); + pci_write_config_dword(dev, PCI_PREF_MEMORY_BASE, 0x0000fff0); + pci_write_config_dword(dev, PCI_PREF_BASE_UPPER32, 0xffffffff); +} +#endif /* CONFIG_PCI_QUIRKS */ + int pci_assign_resource(struct pci_dev *dev, int resno) { struct pci_bus *bus = dev->bus; From 6a3b3e26803fc823058fbb05abb5e0d92a52e1bd Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Sun, 15 Mar 2009 20:14:37 +0100 Subject: [PATCH 115/478] PCI: Use kzalloc() in pci_create_bus() Signed-off-by: Geert Uytterhoeven Signed-off-by: Jesse Barnes --- drivers/pci/probe.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index 23362e8c696f..9e7d642e66b0 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -1114,7 +1114,7 @@ struct pci_bus * pci_create_bus(struct device *parent, if (!b) return NULL; - dev = kmalloc(sizeof(*dev), GFP_KERNEL); + dev = kzalloc(sizeof(*dev), GFP_KERNEL); if (!dev){ kfree(b); return NULL; @@ -1133,7 +1133,6 @@ struct pci_bus * pci_create_bus(struct device *parent, list_add_tail(&b->node, &pci_root_buses); up_write(&pci_bus_sem); - memset(dev, 0, sizeof(*dev)); dev->parent = parent; dev->release = pci_release_bus_bridge_dev; dev_set_name(dev, "pci%04x:%02x", pci_domain_nr(b), bus); From 9efb5fe1b80a45130ba659ba755f24534c83301b Mon Sep 17 00:00:00 2001 From: Alex Chiang Date: Mon, 9 Mar 2009 12:08:15 -0600 Subject: [PATCH 116/478] PCI: PCIe portdrv: eliminate double kfree in remove path Commit 55633af3 (PCIe portdrv: Use driver data to simplify code) added a kfree of the driver private data in pcie_port_device_remove but forgot to remove the old kfree from pcie_portdrv_remove. Acked-by: Rafael J. Wysocki Signed-off-by: Alex Chiang Signed-off-by: Jesse Barnes --- drivers/pci/pcie/portdrv_pci.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/pci/pcie/portdrv_pci.c b/drivers/pci/pcie/portdrv_pci.c index a61f4930d676..b924e2463f85 100644 --- a/drivers/pci/pcie/portdrv_pci.c +++ b/drivers/pci/pcie/portdrv_pci.c @@ -94,7 +94,6 @@ static void pcie_portdrv_remove (struct pci_dev *dev) { pcie_port_device_remove(dev); pci_disable_device(dev); - kfree(pci_get_drvdata(dev)); } static int error_detected_iter(struct device *device, void *data) From 745be2e700cdddd5da4e402854a484242c3628df Mon Sep 17 00:00:00 2001 From: Alex Chiang Date: Sat, 7 Mar 2009 21:46:49 -0700 Subject: [PATCH 117/478] PCIe: portdrv: call pci_disable_device during remove The PCIe port driver calls pci_enable_device when registering ports, but never calls pci_disable_device during removal. Acked-by: Rafael J. Wysocki Signed-off-by: Alex Chiang Signed-off-by: Jesse Barnes --- drivers/pci/pcie/portdrv_core.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/pci/pcie/portdrv_core.c b/drivers/pci/pcie/portdrv_core.c index 5a5bfe7cdf5f..e39982503863 100644 --- a/drivers/pci/pcie/portdrv_core.c +++ b/drivers/pci/pcie/portdrv_core.c @@ -473,6 +473,7 @@ void pcie_port_device_remove(struct pci_dev *dev) struct pcie_port_data *port_data = pci_get_drvdata(dev); device_for_each_child(&dev->dev, NULL, remove_iter); + pci_disable_device(dev); switch (port_data->port_irq_mode) { case PCIE_PORT_MSIX_MODE: From dfadd9edff498d767008edc6b2a6e86a7a19934d Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sun, 8 Mar 2009 21:35:37 -0700 Subject: [PATCH 118/478] PCI/x86: detect host bridge config space size w/o using quirks Many host bridges support a 4k config space, so check them directy instead of using quirks to add them. We only need to do this extra check for host bridges at this point, because only host bridges are known to have extended address space without also having a PCI-X/PCI-E caps. Other devices with this property could be done with quirks (if there are any). As a bonus, we can remove the quirks for AMD host bridges with family 10h and 11h since they're not needed any more. With this patch, we can get correct pci cfg size of new Intel CPUs/IOHs with host bridges. Signed-off-by: Yinghai Lu Acked-by: H. Peter Anvin Reviewed-by: Matthew Wilcox Cc: Signed-off-by: Jesse Barnes --- arch/x86/pci/fixup.c | 20 -------------------- drivers/pci/probe.c | 9 ++++++++- 2 files changed, 8 insertions(+), 21 deletions(-) diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c index 7d388d5cf548..096b0ed0713e 100644 --- a/arch/x86/pci/fixup.c +++ b/arch/x86/pci/fixup.c @@ -494,26 +494,6 @@ static void __devinit pci_siemens_interrupt_controller(struct pci_dev *dev) DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_SIEMENS, 0x0015, pci_siemens_interrupt_controller); -/* - * Regular PCI devices have 256 bytes, but AMD Family 10h/11h CPUs have - * 4096 bytes configuration space for each function of their processor - * configuration space. - */ -static void amd_cpu_pci_cfg_space_size(struct pci_dev *dev) -{ - dev->cfg_size = pci_cfg_space_size_ext(dev); -} -DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_AMD, 0x1200, amd_cpu_pci_cfg_space_size); -DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_AMD, 0x1201, amd_cpu_pci_cfg_space_size); -DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_AMD, 0x1202, amd_cpu_pci_cfg_space_size); -DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_AMD, 0x1203, amd_cpu_pci_cfg_space_size); -DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_AMD, 0x1204, amd_cpu_pci_cfg_space_size); -DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_AMD, 0x1300, amd_cpu_pci_cfg_space_size); -DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_AMD, 0x1301, amd_cpu_pci_cfg_space_size); -DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_AMD, 0x1302, amd_cpu_pci_cfg_space_size); -DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_AMD, 0x1303, amd_cpu_pci_cfg_space_size); -DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_AMD, 0x1304, amd_cpu_pci_cfg_space_size); - /* * SB600: Disable BAR1 on device 14.0 to avoid HPET resources from * confusing the PCI engine: diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index 9e7d642e66b0..579a56c8181f 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -847,6 +847,11 @@ int pci_cfg_space_size(struct pci_dev *dev) { int pos; u32 status; + u16 class; + + class = dev->class >> 8; + if (class == PCI_CLASS_BRIDGE_HOST) + return pci_cfg_space_size_ext(dev); pos = pci_find_capability(dev, PCI_CAP_ID_EXP); if (!pos) { @@ -936,7 +941,6 @@ static struct pci_dev *pci_scan_device(struct pci_bus *bus, int devfn) dev->multifunction = !!(hdr_type & 0x80); dev->vendor = l & 0xffff; dev->device = (l >> 16) & 0xffff; - dev->cfg_size = pci_cfg_space_size(dev); dev->error_state = pci_channel_io_normal; set_pcie_port_type(dev); @@ -952,6 +956,9 @@ static struct pci_dev *pci_scan_device(struct pci_bus *bus, int devfn) return NULL; } + /* need to have dev->class ready */ + dev->cfg_size = pci_cfg_space_size(dev); + return dev; } From 217f45de3d2f68b6d0b646bb4f2a155a71648396 Mon Sep 17 00:00:00 2001 From: Dave Airlie Date: Wed, 4 Mar 2009 05:57:05 +0000 Subject: [PATCH 119/478] PCI: expose boot VGA device via sysfs. X really would like to know which VGA device was considered the boot device by the system. The x86 PCI fixups have support for discovering this but we provide no way to expose it to userspace. This adds a sysfs file per VGA class device which has the value 0 for non the boot device or unknown, and 1 if the VGA device is the boot device. Acked-by: Greg Kroah-Hartman Signed-off-by: Dave Airlie Signed-off-by: Jesse Barnes --- drivers/pci/pci-sysfs.c | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c index 1c8929801400..ec7a175dcbaf 100644 --- a/drivers/pci/pci-sysfs.c +++ b/drivers/pci/pci-sysfs.c @@ -240,6 +240,17 @@ struct device_attribute pci_dev_attrs[] = { __ATTR_NULL, }; +static ssize_t +boot_vga_show(struct device *dev, struct device_attribute *attr, char *buf) +{ + struct pci_dev *pdev = to_pci_dev(dev); + + return sprintf(buf, "%u\n", + !!(pdev->resource[PCI_ROM_RESOURCE].flags & + IORESOURCE_ROM_SHADOW)); +} +struct device_attribute vga_attr = __ATTR_RO(boot_vga); + static ssize_t pci_read_config(struct kobject *kobj, struct bin_attribute *bin_attr, char *buf, loff_t off, size_t count) @@ -899,18 +910,27 @@ int __must_check pci_create_sysfs_dev_files (struct pci_dev *pdev) pdev->rom_attr = attr; } + if ((pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) { + retval = device_create_file(&pdev->dev, &vga_attr); + if (retval) + goto err_rom_file; + } + /* add platform-specific attributes */ retval = pcibios_add_platform_entries(pdev); if (retval) - goto err_rom_file; + goto err_vga_file; /* add sysfs entries for various capabilities */ retval = pci_create_capabilities_sysfs(pdev); if (retval) - goto err_rom_file; + goto err_vga_file; return 0; +err_vga_file: + if ((pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) + device_remove_file(&pdev->dev, &vga_attr); err_rom_file: if (rom_size) { sysfs_remove_bin_file(&pdev->dev.kobj, pdev->rom_attr); From 8293b0f629095efbe7c7e3f9b437f8c040c19eb5 Mon Sep 17 00:00:00 2001 From: David O'Shea Date: Mon, 2 Mar 2009 09:51:13 +0100 Subject: [PATCH 120/478] PCI: Compaq Evo D510 SMBus quirk using USB instead of VGA On the Compaq Evo D510 SFF/CMT, a PCI quirk activated the SMBus device based on detection of the on-board VGA controller, but the on-board VGA is disabled if an AGP card is inserted, so look for one of the USB controllers instead. Signed-off-by: David O'Shea Signed-off-by: Jean Delvare Signed-off-by: Jesse Barnes --- drivers/pci/quirks.c | 9 +++++++-- include/linux/pci_ids.h | 1 + 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c index 50233818a763..7ddcfc65e790 100644 --- a/drivers/pci/quirks.c +++ b/drivers/pci/quirks.c @@ -1186,10 +1186,15 @@ static void __init asus_hides_smbus_hostbridge(struct pci_dev *dev) * its on-board VGA controller */ asus_hides_smbus = 1; } - else if (dev->device == PCI_DEVICE_ID_INTEL_82845G_IG) + else if (dev->device == PCI_DEVICE_ID_INTEL_82801DB_2) switch(dev->subsystem_device) { case 0x00b8: /* Compaq Evo D510 CMT */ case 0x00b9: /* Compaq Evo D510 SFF */ + /* Motherboard doesn't have Host bridge + * subvendor/subdevice IDs and on-board VGA + * controller is disabled if an AGP card is + * inserted, therefore checking USB UHCI + * Controller #1 */ asus_hides_smbus = 1; } else if (dev->device == PCI_DEVICE_ID_INTEL_82815_CGC) @@ -1214,7 +1219,7 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82855GM_HB, as DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82915GM_HB, asus_hides_smbus_hostbridge); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82810_IG3, asus_hides_smbus_hostbridge); -DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82845G_IG, asus_hides_smbus_hostbridge); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801DB_2, asus_hides_smbus_hostbridge); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82815_CGC, asus_hides_smbus_hostbridge); static void asus_hides_smbus_lpc(struct pci_dev *dev) diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index aca8c458aa8a..3ddf8beabdf8 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -2373,6 +2373,7 @@ #define PCI_DEVICE_ID_INTEL_82801CA_12 0x248c #define PCI_DEVICE_ID_INTEL_82801DB_0 0x24c0 #define PCI_DEVICE_ID_INTEL_82801DB_1 0x24c1 +#define PCI_DEVICE_ID_INTEL_82801DB_2 0x24c2 #define PCI_DEVICE_ID_INTEL_82801DB_3 0x24c3 #define PCI_DEVICE_ID_INTEL_82801DB_5 0x24c5 #define PCI_DEVICE_ID_INTEL_82801DB_6 0x24c6 From d1b054da8f599905f3c18a218961dcf17f9d5f13 Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Fri, 20 Mar 2009 11:25:11 +0800 Subject: [PATCH 121/478] PCI: initialize and release SR-IOV capability If a device has the SR-IOV capability, initialize it (set the ARI Capable Hierarchy in the lowest numbered PF if necessary; calculate the System Page Size for the VF MMIO, probe the VF Offset, Stride and BARs). A lock for the VF bus allocation is also initialized if a PF is the lowest numbered PF. Reviewed-by: Matthew Wilcox Signed-off-by: Yu Zhao Signed-off-by: Jesse Barnes --- drivers/pci/Kconfig | 10 +++ drivers/pci/Makefile | 2 + drivers/pci/iov.c | 182 +++++++++++++++++++++++++++++++++++++++ drivers/pci/pci.c | 7 ++ drivers/pci/pci.h | 37 ++++++++ drivers/pci/probe.c | 4 + include/linux/pci.h | 11 +++ include/linux/pci_regs.h | 33 +++++++ 8 files changed, 286 insertions(+) create mode 100644 drivers/pci/iov.c diff --git a/drivers/pci/Kconfig b/drivers/pci/Kconfig index 2a4501dd2515..fdc864f9cf23 100644 --- a/drivers/pci/Kconfig +++ b/drivers/pci/Kconfig @@ -59,3 +59,13 @@ config HT_IRQ This allows native hypertransport devices to use interrupts. If unsure say Y. + +config PCI_IOV + bool "PCI IOV support" + depends on PCI + help + I/O Virtualization is a PCI feature supported by some devices + which allows them to create virtual devices which share their + physical resources. + + If unsure, say N. diff --git a/drivers/pci/Makefile b/drivers/pci/Makefile index 3d07ce24f6a8..ba6af162fd39 100644 --- a/drivers/pci/Makefile +++ b/drivers/pci/Makefile @@ -29,6 +29,8 @@ obj-$(CONFIG_DMAR) += dmar.o iova.o intel-iommu.o obj-$(CONFIG_INTR_REMAP) += dmar.o intr_remapping.o +obj-$(CONFIG_PCI_IOV) += iov.o + # # Some architectures use the generic PCI setup functions # diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c new file mode 100644 index 000000000000..66cc414ed15f --- /dev/null +++ b/drivers/pci/iov.c @@ -0,0 +1,182 @@ +/* + * drivers/pci/iov.c + * + * Copyright (C) 2009 Intel Corporation, Yu Zhao + * + * PCI Express I/O Virtualization (IOV) support. + * Single Root IOV 1.0 + */ + +#include +#include +#include +#include +#include "pci.h" + + +static int sriov_init(struct pci_dev *dev, int pos) +{ + int i; + int rc; + int nres; + u32 pgsz; + u16 ctrl, total, offset, stride; + struct pci_sriov *iov; + struct resource *res; + struct pci_dev *pdev; + + if (dev->pcie_type != PCI_EXP_TYPE_RC_END && + dev->pcie_type != PCI_EXP_TYPE_ENDPOINT) + return -ENODEV; + + pci_read_config_word(dev, pos + PCI_SRIOV_CTRL, &ctrl); + if (ctrl & PCI_SRIOV_CTRL_VFE) { + pci_write_config_word(dev, pos + PCI_SRIOV_CTRL, 0); + ssleep(1); + } + + pci_read_config_word(dev, pos + PCI_SRIOV_TOTAL_VF, &total); + if (!total) + return 0; + + ctrl = 0; + list_for_each_entry(pdev, &dev->bus->devices, bus_list) + if (pdev->is_physfn) + goto found; + + pdev = NULL; + if (pci_ari_enabled(dev->bus)) + ctrl |= PCI_SRIOV_CTRL_ARI; + +found: + pci_write_config_word(dev, pos + PCI_SRIOV_CTRL, ctrl); + pci_write_config_word(dev, pos + PCI_SRIOV_NUM_VF, total); + pci_read_config_word(dev, pos + PCI_SRIOV_VF_OFFSET, &offset); + pci_read_config_word(dev, pos + PCI_SRIOV_VF_STRIDE, &stride); + if (!offset || (total > 1 && !stride)) + return -EIO; + + pci_read_config_dword(dev, pos + PCI_SRIOV_SUP_PGSIZE, &pgsz); + i = PAGE_SHIFT > 12 ? PAGE_SHIFT - 12 : 0; + pgsz &= ~((1 << i) - 1); + if (!pgsz) + return -EIO; + + pgsz &= ~(pgsz - 1); + pci_write_config_dword(dev, pos + PCI_SRIOV_SYS_PGSIZE, pgsz); + + nres = 0; + for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) { + res = dev->resource + PCI_IOV_RESOURCES + i; + i += __pci_read_base(dev, pci_bar_unknown, res, + pos + PCI_SRIOV_BAR + i * 4); + if (!res->flags) + continue; + if (resource_size(res) & (PAGE_SIZE - 1)) { + rc = -EIO; + goto failed; + } + res->end = res->start + resource_size(res) * total - 1; + nres++; + } + + iov = kzalloc(sizeof(*iov), GFP_KERNEL); + if (!iov) { + rc = -ENOMEM; + goto failed; + } + + iov->pos = pos; + iov->nres = nres; + iov->ctrl = ctrl; + iov->total = total; + iov->offset = offset; + iov->stride = stride; + iov->pgsz = pgsz; + iov->self = dev; + pci_read_config_dword(dev, pos + PCI_SRIOV_CAP, &iov->cap); + pci_read_config_byte(dev, pos + PCI_SRIOV_FUNC_LINK, &iov->link); + + if (pdev) + iov->dev = pci_dev_get(pdev); + else { + iov->dev = dev; + mutex_init(&iov->lock); + } + + dev->sriov = iov; + dev->is_physfn = 1; + + return 0; + +failed: + for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) { + res = dev->resource + PCI_IOV_RESOURCES + i; + res->flags = 0; + } + + return rc; +} + +static void sriov_release(struct pci_dev *dev) +{ + if (dev == dev->sriov->dev) + mutex_destroy(&dev->sriov->lock); + else + pci_dev_put(dev->sriov->dev); + + kfree(dev->sriov); + dev->sriov = NULL; +} + +/** + * pci_iov_init - initialize the IOV capability + * @dev: the PCI device + * + * Returns 0 on success, or negative on failure. + */ +int pci_iov_init(struct pci_dev *dev) +{ + int pos; + + if (!dev->is_pcie) + return -ENODEV; + + pos = pci_find_ext_capability(dev, PCI_EXT_CAP_ID_SRIOV); + if (pos) + return sriov_init(dev, pos); + + return -ENODEV; +} + +/** + * pci_iov_release - release resources used by the IOV capability + * @dev: the PCI device + */ +void pci_iov_release(struct pci_dev *dev) +{ + if (dev->is_physfn) + sriov_release(dev); +} + +/** + * pci_iov_resource_bar - get position of the SR-IOV BAR + * @dev: the PCI device + * @resno: the resource number + * @type: the BAR type to be filled in + * + * Returns position of the BAR encapsulated in the SR-IOV capability. + */ +int pci_iov_resource_bar(struct pci_dev *dev, int resno, + enum pci_bar_type *type) +{ + if (resno < PCI_IOV_RESOURCES || resno > PCI_IOV_RESOURCE_END) + return 0; + + BUG_ON(!dev->is_physfn); + + *type = pci_bar_unknown; + + return dev->sriov->pos + PCI_SRIOV_BAR + + 4 * (resno - PCI_IOV_RESOURCES); +} diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index a35a8b2ba631..2b3201ec2b05 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -2360,12 +2360,19 @@ int pci_select_bars(struct pci_dev *dev, unsigned long flags) */ int pci_resource_bar(struct pci_dev *dev, int resno, enum pci_bar_type *type) { + int reg; + if (resno < PCI_ROM_RESOURCE) { *type = pci_bar_unknown; return PCI_BASE_ADDRESS_0 + 4 * resno; } else if (resno == PCI_ROM_RESOURCE) { *type = pci_bar_mem32; return dev->rom_base_reg; + } else if (resno < PCI_BRIDGE_RESOURCES) { + /* device specific resource */ + reg = pci_iov_resource_bar(dev, resno, type); + if (reg) + return reg; } dev_err(&dev->dev, "BAR: invalid resource #%d\n", resno); diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index 2cd1cba7236f..7d5327c986f5 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -201,4 +201,41 @@ resource_size_t pci_specified_resource_alignment(struct pci_dev *dev); extern void pci_disable_bridge_window(struct pci_dev *dev); #endif +/* Single Root I/O Virtualization */ +struct pci_sriov { + int pos; /* capability position */ + int nres; /* number of resources */ + u32 cap; /* SR-IOV Capabilities */ + u16 ctrl; /* SR-IOV Control */ + u16 total; /* total VFs associated with the PF */ + u16 offset; /* first VF Routing ID offset */ + u16 stride; /* following VF stride */ + u32 pgsz; /* page size for BAR alignment */ + u8 link; /* Function Dependency Link */ + struct pci_dev *dev; /* lowest numbered PF */ + struct pci_dev *self; /* this PF */ + struct mutex lock; /* lock for VF bus */ +}; + +#ifdef CONFIG_PCI_IOV +extern int pci_iov_init(struct pci_dev *dev); +extern void pci_iov_release(struct pci_dev *dev); +extern int pci_iov_resource_bar(struct pci_dev *dev, int resno, + enum pci_bar_type *type); +#else +static inline int pci_iov_init(struct pci_dev *dev) +{ + return -ENODEV; +} +static inline void pci_iov_release(struct pci_dev *dev) + +{ +} +static inline int pci_iov_resource_bar(struct pci_dev *dev, int resno, + enum pci_bar_type *type) +{ + return 0; +} +#endif /* CONFIG_PCI_IOV */ + #endif /* DRIVERS_PCI_H */ diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index 579a56c8181f..0471f6ea1466 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -785,6 +785,7 @@ static int pci_setup_device(struct pci_dev * dev) static void pci_release_capabilities(struct pci_dev *dev) { pci_vpd_release(dev); + pci_iov_release(dev); } /** @@ -979,6 +980,9 @@ static void pci_init_capabilities(struct pci_dev *dev) /* Alternative Routing-ID Forwarding */ pci_enable_ari(dev); + + /* Single Root I/O Virtualization */ + pci_iov_init(dev); } void pci_device_add(struct pci_dev *dev, struct pci_bus *bus) diff --git a/include/linux/pci.h b/include/linux/pci.h index 1f6c5ddaae36..8ce2f2d9ab63 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -93,6 +93,12 @@ enum { /* #6: expansion ROM resource */ PCI_ROM_RESOURCE, + /* device specific resources */ +#ifdef CONFIG_PCI_IOV + PCI_IOV_RESOURCES, + PCI_IOV_RESOURCE_END = PCI_IOV_RESOURCES + PCI_SRIOV_NUM_BARS - 1, +#endif + /* resources assigned to buses behind the bridge */ #define PCI_BRIDGE_RESOURCE_NUM 4 @@ -180,6 +186,7 @@ struct pci_cap_saved_state { struct pcie_link_state; struct pci_vpd; +struct pci_sriov; /* * The pci_dev structure is used to describe PCI devices. @@ -257,6 +264,7 @@ struct pci_dev { unsigned int is_managed:1; unsigned int is_pcie:1; unsigned int state_saved:1; + unsigned int is_physfn:1; pci_dev_flags_t dev_flags; atomic_t enable_cnt; /* pci_enable_device has been called */ @@ -270,6 +278,9 @@ struct pci_dev { struct list_head msi_list; #endif struct pci_vpd *vpd; +#ifdef CONFIG_PCI_IOV + struct pci_sriov *sriov; /* SR-IOV capability related */ +#endif }; extern struct pci_dev *alloc_pci_dev(void); diff --git a/include/linux/pci_regs.h b/include/linux/pci_regs.h index b647a4df59fc..d4e663877f45 100644 --- a/include/linux/pci_regs.h +++ b/include/linux/pci_regs.h @@ -375,6 +375,7 @@ #define PCI_EXP_TYPE_UPSTREAM 0x5 /* Upstream Port */ #define PCI_EXP_TYPE_DOWNSTREAM 0x6 /* Downstream Port */ #define PCI_EXP_TYPE_PCI_BRIDGE 0x7 /* PCI/PCI-X Bridge */ +#define PCI_EXP_TYPE_RC_END 0x9 /* Root Complex Integrated Endpoint */ #define PCI_EXP_FLAGS_SLOT 0x0100 /* Slot implemented */ #define PCI_EXP_FLAGS_IRQ 0x3e00 /* Interrupt message number */ #define PCI_EXP_DEVCAP 4 /* Device capabilities */ @@ -498,6 +499,7 @@ #define PCI_EXT_CAP_ID_DSN 3 #define PCI_EXT_CAP_ID_PWR 4 #define PCI_EXT_CAP_ID_ARI 14 +#define PCI_EXT_CAP_ID_SRIOV 16 /* Advanced Error Reporting */ #define PCI_ERR_UNCOR_STATUS 4 /* Uncorrectable Error Status */ @@ -615,4 +617,35 @@ #define PCI_ARI_CTRL_ACS 0x0002 /* ACS Function Groups Enable */ #define PCI_ARI_CTRL_FG(x) (((x) >> 4) & 7) /* Function Group */ +/* Single Root I/O Virtualization */ +#define PCI_SRIOV_CAP 0x04 /* SR-IOV Capabilities */ +#define PCI_SRIOV_CAP_VFM 0x01 /* VF Migration Capable */ +#define PCI_SRIOV_CAP_INTR(x) ((x) >> 21) /* Interrupt Message Number */ +#define PCI_SRIOV_CTRL 0x08 /* SR-IOV Control */ +#define PCI_SRIOV_CTRL_VFE 0x01 /* VF Enable */ +#define PCI_SRIOV_CTRL_VFM 0x02 /* VF Migration Enable */ +#define PCI_SRIOV_CTRL_INTR 0x04 /* VF Migration Interrupt Enable */ +#define PCI_SRIOV_CTRL_MSE 0x08 /* VF Memory Space Enable */ +#define PCI_SRIOV_CTRL_ARI 0x10 /* ARI Capable Hierarchy */ +#define PCI_SRIOV_STATUS 0x0a /* SR-IOV Status */ +#define PCI_SRIOV_STATUS_VFM 0x01 /* VF Migration Status */ +#define PCI_SRIOV_INITIAL_VF 0x0c /* Initial VFs */ +#define PCI_SRIOV_TOTAL_VF 0x0e /* Total VFs */ +#define PCI_SRIOV_NUM_VF 0x10 /* Number of VFs */ +#define PCI_SRIOV_FUNC_LINK 0x12 /* Function Dependency Link */ +#define PCI_SRIOV_VF_OFFSET 0x14 /* First VF Offset */ +#define PCI_SRIOV_VF_STRIDE 0x16 /* Following VF Stride */ +#define PCI_SRIOV_VF_DID 0x1a /* VF Device ID */ +#define PCI_SRIOV_SUP_PGSIZE 0x1c /* Supported Page Sizes */ +#define PCI_SRIOV_SYS_PGSIZE 0x20 /* System Page Size */ +#define PCI_SRIOV_BAR 0x24 /* VF BAR0 */ +#define PCI_SRIOV_NUM_BARS 6 /* Number of VF BARs */ +#define PCI_SRIOV_VFM 0x3c /* VF Migration State Array Offset*/ +#define PCI_SRIOV_VFM_BIR(x) ((x) & 7) /* State BIR */ +#define PCI_SRIOV_VFM_OFFSET(x) ((x) & ~7) /* State Offset */ +#define PCI_SRIOV_VFM_UA 0x0 /* Inactive.Unavailable */ +#define PCI_SRIOV_VFM_MI 0x1 /* Dormant.MigrateIn */ +#define PCI_SRIOV_VFM_MO 0x2 /* Active.MigrateOut */ +#define PCI_SRIOV_VFM_AV 0x3 /* Active.Available */ + #endif /* LINUX_PCI_REGS_H */ From 8c5cdb6adc6688b9b8fd82ea4a5cf4674dabad79 Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Fri, 20 Mar 2009 11:25:12 +0800 Subject: [PATCH 122/478] PCI: restore saved SR-IOV state Restore the volatile registers in the SR-IOV capability after the D3->D0 transition. Reviewed-by: Matthew Wilcox Signed-off-by: Yu Zhao Signed-off-by: Jesse Barnes --- drivers/pci/iov.c | 29 +++++++++++++++++++++++++++++ drivers/pci/pci.c | 1 + drivers/pci/pci.h | 4 ++++ 3 files changed, 34 insertions(+) diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c index 66cc414ed15f..b121e47402fa 100644 --- a/drivers/pci/iov.c +++ b/drivers/pci/iov.c @@ -129,6 +129,25 @@ static void sriov_release(struct pci_dev *dev) dev->sriov = NULL; } +static void sriov_restore_state(struct pci_dev *dev) +{ + int i; + u16 ctrl; + struct pci_sriov *iov = dev->sriov; + + pci_read_config_word(dev, iov->pos + PCI_SRIOV_CTRL, &ctrl); + if (ctrl & PCI_SRIOV_CTRL_VFE) + return; + + for (i = PCI_IOV_RESOURCES; i <= PCI_IOV_RESOURCE_END; i++) + pci_update_resource(dev, i); + + pci_write_config_dword(dev, iov->pos + PCI_SRIOV_SYS_PGSIZE, iov->pgsz); + pci_write_config_word(dev, iov->pos + PCI_SRIOV_CTRL, iov->ctrl); + if (iov->ctrl & PCI_SRIOV_CTRL_VFE) + msleep(100); +} + /** * pci_iov_init - initialize the IOV capability * @dev: the PCI device @@ -180,3 +199,13 @@ int pci_iov_resource_bar(struct pci_dev *dev, int resno, return dev->sriov->pos + PCI_SRIOV_BAR + 4 * (resno - PCI_IOV_RESOURCES); } + +/** + * pci_restore_iov_state - restore the state of the IOV capability + * @dev: the PCI device + */ +void pci_restore_iov_state(struct pci_dev *dev) +{ + if (dev->is_physfn) + sriov_restore_state(dev); +} diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index 2b3201ec2b05..676bbcbc272b 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -775,6 +775,7 @@ pci_restore_state(struct pci_dev *dev) } pci_restore_pcix_state(dev); pci_restore_msi_state(dev); + pci_restore_iov_state(dev); return 0; } diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index 7d5327c986f5..fd5ea4d445e8 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -222,6 +222,7 @@ extern int pci_iov_init(struct pci_dev *dev); extern void pci_iov_release(struct pci_dev *dev); extern int pci_iov_resource_bar(struct pci_dev *dev, int resno, enum pci_bar_type *type); +extern void pci_restore_iov_state(struct pci_dev *dev); #else static inline int pci_iov_init(struct pci_dev *dev) { @@ -236,6 +237,9 @@ static inline int pci_iov_resource_bar(struct pci_dev *dev, int resno, { return 0; } +static inline void pci_restore_iov_state(struct pci_dev *dev) +{ +} #endif /* CONFIG_PCI_IOV */ #endif /* DRIVERS_PCI_H */ From a28724b0fb909d247229a70761c90bb37b13366a Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Fri, 20 Mar 2009 11:25:13 +0800 Subject: [PATCH 123/478] PCI: reserve bus range for SR-IOV device Reserve the bus number range used by the Virtual Function when pcibios_assign_all_busses() returns true. Reviewed-by: Matthew Wilcox Signed-off-by: Yu Zhao Signed-off-by: Jesse Barnes --- drivers/pci/iov.c | 36 ++++++++++++++++++++++++++++++++++++ drivers/pci/pci.h | 5 +++++ drivers/pci/probe.c | 3 +++ 3 files changed, 44 insertions(+) diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c index b121e47402fa..5ddfc09a8d3f 100644 --- a/drivers/pci/iov.c +++ b/drivers/pci/iov.c @@ -14,6 +14,18 @@ #include "pci.h" +static inline u8 virtfn_bus(struct pci_dev *dev, int id) +{ + return dev->bus->number + ((dev->devfn + dev->sriov->offset + + dev->sriov->stride * id) >> 8); +} + +static inline u8 virtfn_devfn(struct pci_dev *dev, int id) +{ + return (dev->devfn + dev->sriov->offset + + dev->sriov->stride * id) & 0xff; +} + static int sriov_init(struct pci_dev *dev, int pos) { int i; @@ -209,3 +221,27 @@ void pci_restore_iov_state(struct pci_dev *dev) if (dev->is_physfn) sriov_restore_state(dev); } + +/** + * pci_iov_bus_range - find bus range used by Virtual Function + * @bus: the PCI bus + * + * Returns max number of buses (exclude current one) used by Virtual + * Functions. + */ +int pci_iov_bus_range(struct pci_bus *bus) +{ + int max = 0; + u8 busnr; + struct pci_dev *dev; + + list_for_each_entry(dev, &bus->devices, bus_list) { + if (!dev->is_physfn) + continue; + busnr = virtfn_bus(dev, dev->sriov->total - 1); + if (busnr > max) + max = busnr; + } + + return max ? max - bus->number : 0; +} diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index fd5ea4d445e8..5c29cb2b8e63 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -223,6 +223,7 @@ extern void pci_iov_release(struct pci_dev *dev); extern int pci_iov_resource_bar(struct pci_dev *dev, int resno, enum pci_bar_type *type); extern void pci_restore_iov_state(struct pci_dev *dev); +extern int pci_iov_bus_range(struct pci_bus *bus); #else static inline int pci_iov_init(struct pci_dev *dev) { @@ -240,6 +241,10 @@ static inline int pci_iov_resource_bar(struct pci_dev *dev, int resno, static inline void pci_restore_iov_state(struct pci_dev *dev) { } +static inline int pci_iov_bus_range(struct pci_bus *bus) +{ + return 0; +} #endif /* CONFIG_PCI_IOV */ #endif /* DRIVERS_PCI_H */ diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index 0471f6ea1466..0ecdaea3915f 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -1085,6 +1085,9 @@ unsigned int __devinit pci_scan_child_bus(struct pci_bus *bus) for (devfn = 0; devfn < 0x100; devfn += 8) pci_scan_slot(bus, devfn); + /* Reserve buses for SR-IOV capability. */ + max += pci_iov_bus_range(bus); + /* * After performing arch-dependent fixup of the bus, look behind * all PCI-to-PCI bridges on this bus. From 480b93b7837fb3cf0579a42f4953ac463a5b9e1e Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Fri, 20 Mar 2009 11:25:14 +0800 Subject: [PATCH 124/478] PCI: centralize device setup code Move the device setup stuff into pci_setup_device() which will be used to setup the Virtual Function later. Reviewed-by: Matthew Wilcox Signed-off-by: Yu Zhao Signed-off-by: Jesse Barnes --- drivers/pci/pci.h | 1 + drivers/pci/probe.c | 78 +++++++++++++++++++++++---------------------- 2 files changed, 41 insertions(+), 38 deletions(-) diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index 5c29cb2b8e63..f4fc10fc5872 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -178,6 +178,7 @@ enum pci_bar_type { pci_bar_mem64, /* A 64-bit memory BAR */ }; +extern int pci_setup_device(struct pci_dev *dev); extern int __pci_read_base(struct pci_dev *dev, enum pci_bar_type type, struct resource *res, unsigned int reg); extern int pci_resource_bar(struct pci_dev *dev, int resno, diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index 0ecdaea3915f..943c49a7842c 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -674,6 +674,19 @@ static void pci_read_irq(struct pci_dev *dev) dev->irq = irq; } +static void set_pcie_port_type(struct pci_dev *pdev) +{ + int pos; + u16 reg16; + + pos = pci_find_capability(pdev, PCI_CAP_ID_EXP); + if (!pos) + return; + pdev->is_pcie = 1; + pci_read_config_word(pdev, pos + PCI_EXP_FLAGS, ®16); + pdev->pcie_type = (reg16 & PCI_EXP_FLAGS_TYPE) >> 4; +} + #define LEGACY_IO_RESOURCE (IORESOURCE_IO | IORESOURCE_PCI_FIXED) /** @@ -683,12 +696,34 @@ static void pci_read_irq(struct pci_dev *dev) * Initialize the device structure with information about the device's * vendor,class,memory and IO-space addresses,IRQ lines etc. * Called at initialisation of the PCI subsystem and by CardBus services. - * Returns 0 on success and -1 if unknown type of device (not normal, bridge - * or CardBus). + * Returns 0 on success and negative if unknown type of device (not normal, + * bridge or CardBus). */ -static int pci_setup_device(struct pci_dev * dev) +int pci_setup_device(struct pci_dev *dev) { u32 class; + u8 hdr_type; + struct pci_slot *slot; + + if (pci_read_config_byte(dev, PCI_HEADER_TYPE, &hdr_type)) + return -EIO; + + dev->sysdata = dev->bus->sysdata; + dev->dev.parent = dev->bus->bridge; + dev->dev.bus = &pci_bus_type; + dev->hdr_type = hdr_type & 0x7f; + dev->multifunction = !!(hdr_type & 0x80); + dev->cfg_size = pci_cfg_space_size(dev); + dev->error_state = pci_channel_io_normal; + set_pcie_port_type(dev); + + list_for_each_entry(slot, &dev->bus->slots, list) + if (PCI_SLOT(dev->devfn) == slot->number) + dev->slot = slot; + + /* Assume 32-bit PCI; let 64-bit PCI cards (which are far rarer) + set this higher, assuming the system even supports it. */ + dev->dma_mask = 0xffffffff; dev_set_name(&dev->dev, "%04x:%02x:%02x.%d", pci_domain_nr(dev->bus), dev->bus->number, PCI_SLOT(dev->devfn), @@ -708,7 +743,6 @@ static int pci_setup_device(struct pci_dev * dev) /* Early fixups, before probing the BARs */ pci_fixup_device(pci_fixup_early, dev); - class = dev->class >> 8; switch (dev->hdr_type) { /* header type */ case PCI_HEADER_TYPE_NORMAL: /* standard header */ @@ -770,7 +804,7 @@ static int pci_setup_device(struct pci_dev * dev) default: /* unknown header */ dev_err(&dev->dev, "unknown header type %02x, " "ignoring device\n", dev->hdr_type); - return -1; + return -EIO; bad: dev_err(&dev->dev, "ignoring class %02x (doesn't match header " @@ -804,19 +838,6 @@ static void pci_release_dev(struct device *dev) kfree(pci_dev); } -static void set_pcie_port_type(struct pci_dev *pdev) -{ - int pos; - u16 reg16; - - pos = pci_find_capability(pdev, PCI_CAP_ID_EXP); - if (!pos) - return; - pdev->is_pcie = 1; - pci_read_config_word(pdev, pos + PCI_EXP_FLAGS, ®16); - pdev->pcie_type = (reg16 & PCI_EXP_FLAGS_TYPE) >> 4; -} - /** * pci_cfg_space_size - get the configuration space size of the PCI device. * @dev: PCI device @@ -897,9 +918,7 @@ EXPORT_SYMBOL(alloc_pci_dev); static struct pci_dev *pci_scan_device(struct pci_bus *bus, int devfn) { struct pci_dev *dev; - struct pci_slot *slot; u32 l; - u8 hdr_type; int delay = 1; if (pci_bus_read_config_dword(bus, devfn, PCI_VENDOR_ID, &l)) @@ -926,33 +945,16 @@ static struct pci_dev *pci_scan_device(struct pci_bus *bus, int devfn) } } - if (pci_bus_read_config_byte(bus, devfn, PCI_HEADER_TYPE, &hdr_type)) - return NULL; - dev = alloc_pci_dev(); if (!dev) return NULL; dev->bus = bus; - dev->sysdata = bus->sysdata; - dev->dev.parent = bus->bridge; - dev->dev.bus = &pci_bus_type; dev->devfn = devfn; - dev->hdr_type = hdr_type & 0x7f; - dev->multifunction = !!(hdr_type & 0x80); dev->vendor = l & 0xffff; dev->device = (l >> 16) & 0xffff; - dev->error_state = pci_channel_io_normal; - set_pcie_port_type(dev); - list_for_each_entry(slot, &bus->slots, list) - if (PCI_SLOT(devfn) == slot->number) - dev->slot = slot; - - /* Assume 32-bit PCI; let 64-bit PCI cards (which are far rarer) - set this higher, assuming the system even supports it. */ - dev->dma_mask = 0xffffffff; - if (pci_setup_device(dev) < 0) { + if (pci_setup_device(dev)) { kfree(dev); return NULL; } From dd7cc44d0bcec5e9c42fe52e88dc254ae62eac8d Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Fri, 20 Mar 2009 11:25:15 +0800 Subject: [PATCH 125/478] PCI: add SR-IOV API for Physical Function driver Add or remove the Virtual Function when the SR-IOV is enabled or disabled by the device driver. This can happen anytime rather than only at the device probe stage. Reviewed-by: Matthew Wilcox Signed-off-by: Yu Zhao Signed-off-by: Jesse Barnes --- drivers/pci/iov.c | 314 ++++++++++++++++++++++++++++++++++++++++++++ drivers/pci/pci.h | 2 + include/linux/pci.h | 19 ++- 3 files changed, 334 insertions(+), 1 deletion(-) diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c index 5ddfc09a8d3f..d0ff8ad8f7ba 100644 --- a/drivers/pci/iov.c +++ b/drivers/pci/iov.c @@ -13,6 +13,7 @@ #include #include "pci.h" +#define VIRTFN_ID_LEN 16 static inline u8 virtfn_bus(struct pci_dev *dev, int id) { @@ -26,6 +27,284 @@ static inline u8 virtfn_devfn(struct pci_dev *dev, int id) dev->sriov->stride * id) & 0xff; } +static struct pci_bus *virtfn_add_bus(struct pci_bus *bus, int busnr) +{ + int rc; + struct pci_bus *child; + + if (bus->number == busnr) + return bus; + + child = pci_find_bus(pci_domain_nr(bus), busnr); + if (child) + return child; + + child = pci_add_new_bus(bus, NULL, busnr); + if (!child) + return NULL; + + child->subordinate = busnr; + child->dev.parent = bus->bridge; + rc = pci_bus_add_child(child); + if (rc) { + pci_remove_bus(child); + return NULL; + } + + return child; +} + +static void virtfn_remove_bus(struct pci_bus *bus, int busnr) +{ + struct pci_bus *child; + + if (bus->number == busnr) + return; + + child = pci_find_bus(pci_domain_nr(bus), busnr); + BUG_ON(!child); + + if (list_empty(&child->devices)) + pci_remove_bus(child); +} + +static int virtfn_add(struct pci_dev *dev, int id, int reset) +{ + int i; + int rc; + u64 size; + char buf[VIRTFN_ID_LEN]; + struct pci_dev *virtfn; + struct resource *res; + struct pci_sriov *iov = dev->sriov; + + virtfn = alloc_pci_dev(); + if (!virtfn) + return -ENOMEM; + + mutex_lock(&iov->dev->sriov->lock); + virtfn->bus = virtfn_add_bus(dev->bus, virtfn_bus(dev, id)); + if (!virtfn->bus) { + kfree(virtfn); + mutex_unlock(&iov->dev->sriov->lock); + return -ENOMEM; + } + virtfn->devfn = virtfn_devfn(dev, id); + virtfn->vendor = dev->vendor; + pci_read_config_word(dev, iov->pos + PCI_SRIOV_VF_DID, &virtfn->device); + pci_setup_device(virtfn); + virtfn->dev.parent = dev->dev.parent; + + for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) { + res = dev->resource + PCI_IOV_RESOURCES + i; + if (!res->parent) + continue; + virtfn->resource[i].name = pci_name(virtfn); + virtfn->resource[i].flags = res->flags; + size = resource_size(res); + do_div(size, iov->total); + virtfn->resource[i].start = res->start + size * id; + virtfn->resource[i].end = virtfn->resource[i].start + size - 1; + rc = request_resource(res, &virtfn->resource[i]); + BUG_ON(rc); + } + + if (reset) + pci_execute_reset_function(virtfn); + + pci_device_add(virtfn, virtfn->bus); + mutex_unlock(&iov->dev->sriov->lock); + + virtfn->physfn = pci_dev_get(dev); + virtfn->is_virtfn = 1; + + rc = pci_bus_add_device(virtfn); + if (rc) + goto failed1; + sprintf(buf, "virtfn%u", id); + rc = sysfs_create_link(&dev->dev.kobj, &virtfn->dev.kobj, buf); + if (rc) + goto failed1; + rc = sysfs_create_link(&virtfn->dev.kobj, &dev->dev.kobj, "physfn"); + if (rc) + goto failed2; + + kobject_uevent(&virtfn->dev.kobj, KOBJ_CHANGE); + + return 0; + +failed2: + sysfs_remove_link(&dev->dev.kobj, buf); +failed1: + pci_dev_put(dev); + mutex_lock(&iov->dev->sriov->lock); + pci_remove_bus_device(virtfn); + virtfn_remove_bus(dev->bus, virtfn_bus(dev, id)); + mutex_unlock(&iov->dev->sriov->lock); + + return rc; +} + +static void virtfn_remove(struct pci_dev *dev, int id, int reset) +{ + char buf[VIRTFN_ID_LEN]; + struct pci_bus *bus; + struct pci_dev *virtfn; + struct pci_sriov *iov = dev->sriov; + + bus = pci_find_bus(pci_domain_nr(dev->bus), virtfn_bus(dev, id)); + if (!bus) + return; + + virtfn = pci_get_slot(bus, virtfn_devfn(dev, id)); + if (!virtfn) + return; + + pci_dev_put(virtfn); + + if (reset) { + device_release_driver(&virtfn->dev); + pci_execute_reset_function(virtfn); + } + + sprintf(buf, "virtfn%u", id); + sysfs_remove_link(&dev->dev.kobj, buf); + sysfs_remove_link(&virtfn->dev.kobj, "physfn"); + + mutex_lock(&iov->dev->sriov->lock); + pci_remove_bus_device(virtfn); + virtfn_remove_bus(dev->bus, virtfn_bus(dev, id)); + mutex_unlock(&iov->dev->sriov->lock); + + pci_dev_put(dev); +} + +static int sriov_enable(struct pci_dev *dev, int nr_virtfn) +{ + int rc; + int i, j; + int nres; + u16 offset, stride, initial; + struct resource *res; + struct pci_dev *pdev; + struct pci_sriov *iov = dev->sriov; + + if (!nr_virtfn) + return 0; + + if (iov->nr_virtfn) + return -EINVAL; + + pci_read_config_word(dev, iov->pos + PCI_SRIOV_INITIAL_VF, &initial); + if (initial > iov->total || + (!(iov->cap & PCI_SRIOV_CAP_VFM) && (initial != iov->total))) + return -EIO; + + if (nr_virtfn < 0 || nr_virtfn > iov->total || + (!(iov->cap & PCI_SRIOV_CAP_VFM) && (nr_virtfn > initial))) + return -EINVAL; + + pci_write_config_word(dev, iov->pos + PCI_SRIOV_NUM_VF, nr_virtfn); + pci_read_config_word(dev, iov->pos + PCI_SRIOV_VF_OFFSET, &offset); + pci_read_config_word(dev, iov->pos + PCI_SRIOV_VF_STRIDE, &stride); + if (!offset || (nr_virtfn > 1 && !stride)) + return -EIO; + + nres = 0; + for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) { + res = dev->resource + PCI_IOV_RESOURCES + i; + if (res->parent) + nres++; + } + if (nres != iov->nres) { + dev_err(&dev->dev, "not enough MMIO resources for SR-IOV\n"); + return -ENOMEM; + } + + iov->offset = offset; + iov->stride = stride; + + if (virtfn_bus(dev, nr_virtfn - 1) > dev->bus->subordinate) { + dev_err(&dev->dev, "SR-IOV: bus number out of range\n"); + return -ENOMEM; + } + + if (iov->link != dev->devfn) { + pdev = pci_get_slot(dev->bus, iov->link); + if (!pdev) + return -ENODEV; + + pci_dev_put(pdev); + + if (!pdev->is_physfn) + return -ENODEV; + + rc = sysfs_create_link(&dev->dev.kobj, + &pdev->dev.kobj, "dep_link"); + if (rc) + return rc; + } + + iov->ctrl |= PCI_SRIOV_CTRL_VFE | PCI_SRIOV_CTRL_MSE; + pci_block_user_cfg_access(dev); + pci_write_config_word(dev, iov->pos + PCI_SRIOV_CTRL, iov->ctrl); + msleep(100); + pci_unblock_user_cfg_access(dev); + + iov->initial = initial; + if (nr_virtfn < initial) + initial = nr_virtfn; + + for (i = 0; i < initial; i++) { + rc = virtfn_add(dev, i, 0); + if (rc) + goto failed; + } + + kobject_uevent(&dev->dev.kobj, KOBJ_CHANGE); + iov->nr_virtfn = nr_virtfn; + + return 0; + +failed: + for (j = 0; j < i; j++) + virtfn_remove(dev, j, 0); + + iov->ctrl &= ~(PCI_SRIOV_CTRL_VFE | PCI_SRIOV_CTRL_MSE); + pci_block_user_cfg_access(dev); + pci_write_config_word(dev, iov->pos + PCI_SRIOV_CTRL, iov->ctrl); + ssleep(1); + pci_unblock_user_cfg_access(dev); + + if (iov->link != dev->devfn) + sysfs_remove_link(&dev->dev.kobj, "dep_link"); + + return rc; +} + +static void sriov_disable(struct pci_dev *dev) +{ + int i; + struct pci_sriov *iov = dev->sriov; + + if (!iov->nr_virtfn) + return; + + for (i = 0; i < iov->nr_virtfn; i++) + virtfn_remove(dev, i, 0); + + iov->ctrl &= ~(PCI_SRIOV_CTRL_VFE | PCI_SRIOV_CTRL_MSE); + pci_block_user_cfg_access(dev); + pci_write_config_word(dev, iov->pos + PCI_SRIOV_CTRL, iov->ctrl); + ssleep(1); + pci_unblock_user_cfg_access(dev); + + if (iov->link != dev->devfn) + sysfs_remove_link(&dev->dev.kobj, "dep_link"); + + iov->nr_virtfn = 0; +} + static int sriov_init(struct pci_dev *dev, int pos) { int i; @@ -132,6 +411,8 @@ failed: static void sriov_release(struct pci_dev *dev) { + BUG_ON(dev->sriov->nr_virtfn); + if (dev == dev->sriov->dev) mutex_destroy(&dev->sriov->lock); else @@ -155,6 +436,7 @@ static void sriov_restore_state(struct pci_dev *dev) pci_update_resource(dev, i); pci_write_config_dword(dev, iov->pos + PCI_SRIOV_SYS_PGSIZE, iov->pgsz); + pci_write_config_word(dev, iov->pos + PCI_SRIOV_NUM_VF, iov->nr_virtfn); pci_write_config_word(dev, iov->pos + PCI_SRIOV_CTRL, iov->ctrl); if (iov->ctrl & PCI_SRIOV_CTRL_VFE) msleep(100); @@ -245,3 +527,35 @@ int pci_iov_bus_range(struct pci_bus *bus) return max ? max - bus->number : 0; } + +/** + * pci_enable_sriov - enable the SR-IOV capability + * @dev: the PCI device + * + * Returns 0 on success, or negative on failure. + */ +int pci_enable_sriov(struct pci_dev *dev, int nr_virtfn) +{ + might_sleep(); + + if (!dev->is_physfn) + return -ENODEV; + + return sriov_enable(dev, nr_virtfn); +} +EXPORT_SYMBOL_GPL(pci_enable_sriov); + +/** + * pci_disable_sriov - disable the SR-IOV capability + * @dev: the PCI device + */ +void pci_disable_sriov(struct pci_dev *dev) +{ + might_sleep(); + + if (!dev->is_physfn) + return; + + sriov_disable(dev); +} +EXPORT_SYMBOL_GPL(pci_disable_sriov); diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index f4fc10fc5872..0f1c7d103509 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -209,6 +209,8 @@ struct pci_sriov { u32 cap; /* SR-IOV Capabilities */ u16 ctrl; /* SR-IOV Control */ u16 total; /* total VFs associated with the PF */ + u16 initial; /* initial VFs associated with the PF */ + u16 nr_virtfn; /* number of VFs available */ u16 offset; /* first VF Routing ID offset */ u16 stride; /* following VF stride */ u32 pgsz; /* page size for BAR alignment */ diff --git a/include/linux/pci.h b/include/linux/pci.h index 8ce2f2d9ab63..c2e491e04063 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -265,6 +265,7 @@ struct pci_dev { unsigned int is_pcie:1; unsigned int state_saved:1; unsigned int is_physfn:1; + unsigned int is_virtfn:1; pci_dev_flags_t dev_flags; atomic_t enable_cnt; /* pci_enable_device has been called */ @@ -279,7 +280,10 @@ struct pci_dev { #endif struct pci_vpd *vpd; #ifdef CONFIG_PCI_IOV - struct pci_sriov *sriov; /* SR-IOV capability related */ + union { + struct pci_sriov *sriov; /* SR-IOV capability related */ + struct pci_dev *physfn; /* the PF this VF is associated with */ + }; #endif }; @@ -1212,5 +1216,18 @@ int pci_ext_cfg_avail(struct pci_dev *dev); void __iomem *pci_ioremap_bar(struct pci_dev *pdev, int bar); +#ifdef CONFIG_PCI_IOV +extern int pci_enable_sriov(struct pci_dev *dev, int nr_virtfn); +extern void pci_disable_sriov(struct pci_dev *dev); +#else +static inline int pci_enable_sriov(struct pci_dev *dev, int nr_virtfn) +{ + return -ENODEV; +} +static inline void pci_disable_sriov(struct pci_dev *dev) +{ +} +#endif + #endif /* __KERNEL__ */ #endif /* LINUX_PCI_H */ From 74bb1bcc7dbbc9ddef773bf3395d7ff92aaaad2e Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Fri, 20 Mar 2009 11:25:16 +0800 Subject: [PATCH 126/478] PCI: handle SR-IOV Virtual Function Migration Add or remove a Virtual Function after receiving a Migrate In or Out Request. Reviewed-by: Matthew Wilcox Signed-off-by: Yu Zhao Signed-off-by: Jesse Barnes --- drivers/pci/iov.c | 119 ++++++++++++++++++++++++++++++++++++++++++++ drivers/pci/pci.h | 4 ++ include/linux/pci.h | 6 +++ 3 files changed, 129 insertions(+) diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c index d0ff8ad8f7ba..7227efc760db 100644 --- a/drivers/pci/iov.c +++ b/drivers/pci/iov.c @@ -179,6 +179,97 @@ static void virtfn_remove(struct pci_dev *dev, int id, int reset) pci_dev_put(dev); } +static int sriov_migration(struct pci_dev *dev) +{ + u16 status; + struct pci_sriov *iov = dev->sriov; + + if (!iov->nr_virtfn) + return 0; + + if (!(iov->cap & PCI_SRIOV_CAP_VFM)) + return 0; + + pci_read_config_word(dev, iov->pos + PCI_SRIOV_STATUS, &status); + if (!(status & PCI_SRIOV_STATUS_VFM)) + return 0; + + schedule_work(&iov->mtask); + + return 1; +} + +static void sriov_migration_task(struct work_struct *work) +{ + int i; + u8 state; + u16 status; + struct pci_sriov *iov = container_of(work, struct pci_sriov, mtask); + + for (i = iov->initial; i < iov->nr_virtfn; i++) { + state = readb(iov->mstate + i); + if (state == PCI_SRIOV_VFM_MI) { + writeb(PCI_SRIOV_VFM_AV, iov->mstate + i); + state = readb(iov->mstate + i); + if (state == PCI_SRIOV_VFM_AV) + virtfn_add(iov->self, i, 1); + } else if (state == PCI_SRIOV_VFM_MO) { + virtfn_remove(iov->self, i, 1); + writeb(PCI_SRIOV_VFM_UA, iov->mstate + i); + state = readb(iov->mstate + i); + if (state == PCI_SRIOV_VFM_AV) + virtfn_add(iov->self, i, 0); + } + } + + pci_read_config_word(iov->self, iov->pos + PCI_SRIOV_STATUS, &status); + status &= ~PCI_SRIOV_STATUS_VFM; + pci_write_config_word(iov->self, iov->pos + PCI_SRIOV_STATUS, status); +} + +static int sriov_enable_migration(struct pci_dev *dev, int nr_virtfn) +{ + int bir; + u32 table; + resource_size_t pa; + struct pci_sriov *iov = dev->sriov; + + if (nr_virtfn <= iov->initial) + return 0; + + pci_read_config_dword(dev, iov->pos + PCI_SRIOV_VFM, &table); + bir = PCI_SRIOV_VFM_BIR(table); + if (bir > PCI_STD_RESOURCE_END) + return -EIO; + + table = PCI_SRIOV_VFM_OFFSET(table); + if (table + nr_virtfn > pci_resource_len(dev, bir)) + return -EIO; + + pa = pci_resource_start(dev, bir) + table; + iov->mstate = ioremap(pa, nr_virtfn); + if (!iov->mstate) + return -ENOMEM; + + INIT_WORK(&iov->mtask, sriov_migration_task); + + iov->ctrl |= PCI_SRIOV_CTRL_VFM | PCI_SRIOV_CTRL_INTR; + pci_write_config_word(dev, iov->pos + PCI_SRIOV_CTRL, iov->ctrl); + + return 0; +} + +static void sriov_disable_migration(struct pci_dev *dev) +{ + struct pci_sriov *iov = dev->sriov; + + iov->ctrl &= ~(PCI_SRIOV_CTRL_VFM | PCI_SRIOV_CTRL_INTR); + pci_write_config_word(dev, iov->pos + PCI_SRIOV_CTRL, iov->ctrl); + + cancel_work_sync(&iov->mtask); + iounmap(iov->mstate); +} + static int sriov_enable(struct pci_dev *dev, int nr_virtfn) { int rc; @@ -261,6 +352,12 @@ static int sriov_enable(struct pci_dev *dev, int nr_virtfn) goto failed; } + if (iov->cap & PCI_SRIOV_CAP_VFM) { + rc = sriov_enable_migration(dev, nr_virtfn); + if (rc) + goto failed; + } + kobject_uevent(&dev->dev.kobj, KOBJ_CHANGE); iov->nr_virtfn = nr_virtfn; @@ -290,6 +387,9 @@ static void sriov_disable(struct pci_dev *dev) if (!iov->nr_virtfn) return; + if (iov->cap & PCI_SRIOV_CAP_VFM) + sriov_disable_migration(dev); + for (i = 0; i < iov->nr_virtfn; i++) virtfn_remove(dev, i, 0); @@ -559,3 +659,22 @@ void pci_disable_sriov(struct pci_dev *dev) sriov_disable(dev); } EXPORT_SYMBOL_GPL(pci_disable_sriov); + +/** + * pci_sriov_migration - notify SR-IOV core of Virtual Function Migration + * @dev: the PCI device + * + * Returns IRQ_HANDLED if the IRQ is handled, or IRQ_NONE if not. + * + * Physical Function driver is responsible to register IRQ handler using + * VF Migration Interrupt Message Number, and call this function when the + * interrupt is generated by the hardware. + */ +irqreturn_t pci_sriov_migration(struct pci_dev *dev) +{ + if (!dev->is_physfn) + return IRQ_NONE; + + return sriov_migration(dev) ? IRQ_HANDLED : IRQ_NONE; +} +EXPORT_SYMBOL_GPL(pci_sriov_migration); diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index 0f1c7d103509..22dcfdb75d91 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -1,6 +1,8 @@ #ifndef DRIVERS_PCI_H #define DRIVERS_PCI_H +#include + #define PCI_CFG_SPACE_SIZE 256 #define PCI_CFG_SPACE_EXP_SIZE 4096 @@ -218,6 +220,8 @@ struct pci_sriov { struct pci_dev *dev; /* lowest numbered PF */ struct pci_dev *self; /* this PF */ struct mutex lock; /* lock for VF bus */ + struct work_struct mtask; /* VF Migration task */ + u8 __iomem *mstate; /* VF Migration State Array */ }; #ifdef CONFIG_PCI_IOV diff --git a/include/linux/pci.h b/include/linux/pci.h index c2e491e04063..1216843412da 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -52,6 +52,7 @@ #include #include #include +#include /* Include the ID list */ #include @@ -1219,6 +1220,7 @@ void __iomem *pci_ioremap_bar(struct pci_dev *pdev, int bar); #ifdef CONFIG_PCI_IOV extern int pci_enable_sriov(struct pci_dev *dev, int nr_virtfn); extern void pci_disable_sriov(struct pci_dev *dev); +extern irqreturn_t pci_sriov_migration(struct pci_dev *dev); #else static inline int pci_enable_sriov(struct pci_dev *dev, int nr_virtfn) { @@ -1227,6 +1229,10 @@ static inline int pci_enable_sriov(struct pci_dev *dev, int nr_virtfn) static inline void pci_disable_sriov(struct pci_dev *dev) { } +static inline irqreturn_t pci_sriov_migration(struct pci_dev *dev) +{ + return IRQ_NONE; +} #endif #endif /* __KERNEL__ */ From 01db4957179c92fda7d9a06e49b7ae56fb7c925b Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Fri, 20 Mar 2009 11:25:17 +0800 Subject: [PATCH 127/478] PCI: document SR-IOV sysfs entries Reviewed-by: Randy Dunlap Reviewed-by: Matthew Wilcox Signed-off-by: Yu Zhao Signed-off-by: Jesse Barnes --- Documentation/ABI/testing/sysfs-bus-pci | 27 +++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/Documentation/ABI/testing/sysfs-bus-pci b/Documentation/ABI/testing/sysfs-bus-pci index 3d29793a0ea2..d175a2a5fac2 100644 --- a/Documentation/ABI/testing/sysfs-bus-pci +++ b/Documentation/ABI/testing/sysfs-bus-pci @@ -68,3 +68,30 @@ Description: that some devices may have malformatted data. If the underlying VPD has a writable section then the corresponding section of this file will be writable. + +What: /sys/bus/pci/devices/.../virtfnN +Date: March 2009 +Contact: Yu Zhao +Description: + This symbolic link appears when hardware supports the SR-IOV + capability and the Physical Function driver has enabled it. + The symbolic link points to the PCI device sysfs entry of the + Virtual Function whose index is N (0...MaxVFs-1). + +What: /sys/bus/pci/devices/.../dep_link +Date: March 2009 +Contact: Yu Zhao +Description: + This symbolic link appears when hardware supports the SR-IOV + capability and the Physical Function driver has enabled it, + and this device has vendor specific dependencies with others. + The symbolic link points to the PCI device sysfs entry of + Physical Function this device depends on. + +What: /sys/bus/pci/devices/.../physfn +Date: March 2009 +Contact: Yu Zhao +Description: + This symbolic link appears when a device is a Virtual Function. + The symbolic link points to the PCI device sysfs entry of the + Physical Function this device associates with. From 15b49bee3a2b228370194f1b3ebc3db427cc9c94 Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Fri, 20 Mar 2009 11:25:18 +0800 Subject: [PATCH 128/478] PCI: manual for SR-IOV user and driver developer Reviewed-by: Randy Dunlap Reviewed-by: Matthew Wilcox Signed-off-by: Yu Zhao Signed-off-by: Jesse Barnes --- Documentation/DocBook/kernel-api.tmpl | 1 + Documentation/PCI/pci-iov-howto.txt | 99 +++++++++++++++++++++++++++ 2 files changed, 100 insertions(+) create mode 100644 Documentation/PCI/pci-iov-howto.txt diff --git a/Documentation/DocBook/kernel-api.tmpl b/Documentation/DocBook/kernel-api.tmpl index bc962cda6504..58c194572c76 100644 --- a/Documentation/DocBook/kernel-api.tmpl +++ b/Documentation/DocBook/kernel-api.tmpl @@ -199,6 +199,7 @@ X!Edrivers/pci/hotplug.c --> !Edrivers/pci/probe.c !Edrivers/pci/rom.c +!Edrivers/pci/iov.c PCI Hotplug Support Library !Edrivers/pci/hotplug/pci_hotplug_core.c diff --git a/Documentation/PCI/pci-iov-howto.txt b/Documentation/PCI/pci-iov-howto.txt new file mode 100644 index 000000000000..fc73ef5d65b8 --- /dev/null +++ b/Documentation/PCI/pci-iov-howto.txt @@ -0,0 +1,99 @@ + PCI Express I/O Virtualization Howto + Copyright (C) 2009 Intel Corporation + Yu Zhao + + +1. Overview + +1.1 What is SR-IOV + +Single Root I/O Virtualization (SR-IOV) is a PCI Express Extended +capability which makes one physical device appear as multiple virtual +devices. The physical device is referred to as Physical Function (PF) +while the virtual devices are referred to as Virtual Functions (VF). +Allocation of the VF can be dynamically controlled by the PF via +registers encapsulated in the capability. By default, this feature is +not enabled and the PF behaves as traditional PCIe device. Once it's +turned on, each VF's PCI configuration space can be accessed by its own +Bus, Device and Function Number (Routing ID). And each VF also has PCI +Memory Space, which is used to map its register set. VF device driver +operates on the register set so it can be functional and appear as a +real existing PCI device. + +2. User Guide + +2.1 How can I enable SR-IOV capability + +The device driver (PF driver) will control the enabling and disabling +of the capability via API provided by SR-IOV core. If the hardware +has SR-IOV capability, loading its PF driver would enable it and all +VFs associated with the PF. + +2.2 How can I use the Virtual Functions + +The VF is treated as hot-plugged PCI devices in the kernel, so they +should be able to work in the same way as real PCI devices. The VF +requires device driver that is same as a normal PCI device's. + +3. Developer Guide + +3.1 SR-IOV API + +To enable SR-IOV capability: + int pci_enable_sriov(struct pci_dev *dev, int nr_virtfn); + 'nr_virtfn' is number of VFs to be enabled. + +To disable SR-IOV capability: + void pci_disable_sriov(struct pci_dev *dev); + +To notify SR-IOV core of Virtual Function Migration: + irqreturn_t pci_sriov_migration(struct pci_dev *dev); + +3.2 Usage example + +Following piece of code illustrates the usage of the SR-IOV API. + +static int __devinit dev_probe(struct pci_dev *dev, const struct pci_device_id *id) +{ + pci_enable_sriov(dev, NR_VIRTFN); + + ... + + return 0; +} + +static void __devexit dev_remove(struct pci_dev *dev) +{ + pci_disable_sriov(dev); + + ... +} + +static int dev_suspend(struct pci_dev *dev, pm_message_t state) +{ + ... + + return 0; +} + +static int dev_resume(struct pci_dev *dev) +{ + ... + + return 0; +} + +static void dev_shutdown(struct pci_dev *dev) +{ + ... +} + +static struct pci_driver dev_driver = { + .name = "SR-IOV Physical Function driver", + .id_table = dev_id_table, + .probe = dev_probe, + .remove = __devexit_p(dev_remove), + .suspend = dev_suspend, + .resume = dev_resume, + .shutdown = dev_shutdown, +}; From 5546d6f56807115a035d140f7364ce5807dbcc87 Mon Sep 17 00:00:00 2001 From: Ed Swierk Date: Thu, 19 Mar 2009 20:57:56 -0700 Subject: [PATCH 129/478] x86/PCI: Detect mmconfig on nVidia MCP55 Detect and enable memory-mapped PCI configuration space on the nVidia MCP55 southbridge. Tested against 2.6.27.4 on an Arista Networks development board with one MCP55, Coreboot firmware, no ACPI. Signed-off-by: Ed Swierk Signed-off-by: Yinghai Lu Signed-off-by: Jesse Barnes --- arch/x86/pci/mmconfig-shared.c | 64 ++++++++++++++++++++++++++++++++++ 1 file changed, 64 insertions(+) diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c index 89bf9242c80a..d68dc1bb01b2 100644 --- a/arch/x86/pci/mmconfig-shared.c +++ b/arch/x86/pci/mmconfig-shared.c @@ -154,6 +154,68 @@ static const char __init *pci_mmcfg_amd_fam10h(void) return "AMD Family 10h NB"; } +static bool __initdata mcp55_checked; +static const char __init *pci_mmcfg_nvidia_mcp55(void) +{ + int bus; + int mcp55_mmconf_found = 0; + + static const u32 extcfg_regnum = 0x90; + static const u32 extcfg_regsize = 4; + static const u32 extcfg_enable_mask = 1<<31; + static const u32 extcfg_start_mask = 0xff<<16; + static const int extcfg_start_shift = 16; + static const u32 extcfg_size_mask = 0x3<<28; + static const int extcfg_size_shift = 28; + static const int extcfg_sizebus[] = {0x100, 0x80, 0x40, 0x20}; + static const u32 extcfg_base_mask[] = {0x7ff8, 0x7ffc, 0x7ffe, 0x7fff}; + static const int extcfg_base_lshift = 25; + + /* + * do check if amd fam10h already took over + */ + if (!acpi_disabled || pci_mmcfg_config_num || mcp55_checked) + return NULL; + + mcp55_checked = true; + for (bus = 0; bus < 256; bus++) { + u64 base; + u32 l, extcfg; + u16 vendor, device; + int start, size_index, end; + + raw_pci_ops->read(0, bus, PCI_DEVFN(0, 0), 0, 4, &l); + vendor = l & 0xffff; + device = (l >> 16) & 0xffff; + + if (PCI_VENDOR_ID_NVIDIA != vendor || 0x0369 != device) + continue; + + raw_pci_ops->read(0, bus, PCI_DEVFN(0, 0), extcfg_regnum, + extcfg_regsize, &extcfg); + + if (!(extcfg & extcfg_enable_mask)) + continue; + + if (extend_mmcfg(1) == -1) + continue; + + size_index = (extcfg & extcfg_size_mask) >> extcfg_size_shift; + base = extcfg & extcfg_base_mask[size_index]; + /* base could > 4G */ + base <<= extcfg_base_lshift; + start = (extcfg & extcfg_start_mask) >> extcfg_start_shift; + end = start + extcfg_sizebus[size_index] - 1; + fill_one_mmcfg(base, 0, start, end); + mcp55_mmconf_found++; + } + + if (!mcp55_mmconf_found) + return NULL; + + return "nVidia MCP55"; +} + struct pci_mmcfg_hostbridge_probe { u32 bus; u32 devfn; @@ -171,6 +233,8 @@ static struct pci_mmcfg_hostbridge_probe pci_mmcfg_probes[] __initdata = { 0x1200, pci_mmcfg_amd_fam10h }, { 0xff, PCI_DEVFN(0, 0), PCI_VENDOR_ID_AMD, 0x1200, pci_mmcfg_amd_fam10h }, + { 0, PCI_DEVFN(0, 0), PCI_VENDOR_ID_NVIDIA, + 0x0369, pci_mmcfg_nvidia_mcp55 }, }; static int __init pci_mmcfg_check_hostbridge(void) From fafad5bf06c3a3bb8b24b28b6f065367e7411872 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Fri, 20 Mar 2009 15:22:12 +1100 Subject: [PATCH 130/478] PCI MSI: Add example request loop to MSI-HOWTO.txt Encourage driver writers to think about supporting a variable number of MSI-X interrupts, and give an example of how to do such a request. Acked-by: Matthew Wilcox Signed-off-by: Michael Ellerman Signed-off-by: Jesse Barnes --- Documentation/PCI/MSI-HOWTO.txt | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/Documentation/PCI/MSI-HOWTO.txt b/Documentation/PCI/MSI-HOWTO.txt index 9494f6dc38eb..dcf7acc720e1 100644 --- a/Documentation/PCI/MSI-HOWTO.txt +++ b/Documentation/PCI/MSI-HOWTO.txt @@ -176,7 +176,8 @@ request_irq() for each 'vector' that it decides to use. If this function returns a negative number, it indicates an error and the driver should not attempt to allocate any more MSI-X interrupts for this device. If it returns a positive number, it indicates the maximum -number of interrupt vectors that could have been allocated. +number of interrupt vectors that could have been allocated. See example +below. This function, in contrast with pci_enable_msi(), does not adjust dev->irq. The device will not generate interrupts for this interrupt @@ -187,6 +188,26 @@ free them again later. Device drivers should normally call this function once per device during the initialization phase. +It is ideal if drivers can cope with a variable number of MSI-X interrupts, +there are many reasons why the platform may not be able to provide the +exact number a driver asks for. + +A request loop to achieve that might look like: + +static int foo_driver_enable_msix(struct foo_adapter *adapter, int nvec) +{ + while (nvec >= FOO_DRIVER_MINIMUM_NVEC) { + rc = pci_enable_msix(adapter->pdev, + adapter->msix_entries, nvec); + if (rc > 0) + nvec = rc; + else + return rc; + } + + return -ENOSPC; +} + 4.3.2 pci_disable_msix void pci_disable_msix(struct pci_dev *dev) From 068258bc15439c11a966e873f931cc8e513dca61 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 19 Mar 2009 20:55:35 -0700 Subject: [PATCH 131/478] x86/PCI: host mmconfig detect clean up Fix mmconfig detection to not assume a single mmconfig space in the northbridge, paving the way for AMD fam10h + mcp55 CPUs. On those, the MSR has some range, but the mcp55 pci config will have another one. Also helps the mcp55 + io55 case, where every one will have one range. If it is mcp55, exclude the range that is used by CPU MSR, in other words , if the CPU claims busses 0-255, the range in mcp55 is dropped, because CPU HW will not route those ranges to mcp55 mmconfig to handle it. Signed-off-by: Yinghai Lu Signed-off-by: Jesse Barnes --- arch/x86/pci/mmconfig-shared.c | 161 ++++++++++++++++++++++----------- arch/x86/pci/mmconfig_64.c | 15 ++- 2 files changed, 118 insertions(+), 58 deletions(-) diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c index d68dc1bb01b2..905bb526b133 100644 --- a/arch/x86/pci/mmconfig-shared.c +++ b/arch/x86/pci/mmconfig-shared.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -24,24 +25,49 @@ /* Indicate if the mmcfg resources have been placed into the resource table. */ static int __initdata pci_mmcfg_resources_inserted; +static __init int extend_mmcfg(int num) +{ + struct acpi_mcfg_allocation *new; + int new_num = pci_mmcfg_config_num + num; + + new = kzalloc(sizeof(pci_mmcfg_config[0]) * new_num, GFP_KERNEL); + if (!new) + return -1; + + if (pci_mmcfg_config) { + memcpy(new, pci_mmcfg_config, + sizeof(pci_mmcfg_config[0]) * new_num); + kfree(pci_mmcfg_config); + } + pci_mmcfg_config = new; + + return 0; +} + +static __init void fill_one_mmcfg(u64 addr, int segment, int start, int end) +{ + int i = pci_mmcfg_config_num; + + pci_mmcfg_config_num++; + pci_mmcfg_config[i].address = addr; + pci_mmcfg_config[i].pci_segment = segment; + pci_mmcfg_config[i].start_bus_number = start; + pci_mmcfg_config[i].end_bus_number = end; +} + static const char __init *pci_mmcfg_e7520(void) { u32 win; raw_pci_ops->read(0, 0, PCI_DEVFN(0, 0), 0xce, 2, &win); win = win & 0xf000; - if(win == 0x0000 || win == 0xf000) - pci_mmcfg_config_num = 0; - else { - pci_mmcfg_config_num = 1; - pci_mmcfg_config = kzalloc(sizeof(pci_mmcfg_config[0]), GFP_KERNEL); - if (!pci_mmcfg_config) - return NULL; - pci_mmcfg_config[0].address = win << 16; - pci_mmcfg_config[0].pci_segment = 0; - pci_mmcfg_config[0].start_bus_number = 0; - pci_mmcfg_config[0].end_bus_number = 255; - } + if (win == 0x0000 || win == 0xf000) + return NULL; + + if (extend_mmcfg(1) == -1) + return NULL; + + fill_one_mmcfg(win << 16, 0, 0, 255); return "Intel Corporation E7520 Memory Controller Hub"; } @@ -50,13 +76,11 @@ static const char __init *pci_mmcfg_intel_945(void) { u32 pciexbar, mask = 0, len = 0; - pci_mmcfg_config_num = 1; - raw_pci_ops->read(0, 0, PCI_DEVFN(0, 0), 0x48, 4, &pciexbar); /* Enable bit */ if (!(pciexbar & 1)) - pci_mmcfg_config_num = 0; + return NULL; /* Size bits */ switch ((pciexbar >> 1) & 3) { @@ -73,28 +97,23 @@ static const char __init *pci_mmcfg_intel_945(void) len = 0x04000000U; break; default: - pci_mmcfg_config_num = 0; + return NULL; } /* Errata #2, things break when not aligned on a 256Mb boundary */ /* Can only happen in 64M/128M mode */ if ((pciexbar & mask) & 0x0fffffffU) - pci_mmcfg_config_num = 0; + return NULL; /* Don't hit the APIC registers and their friends */ if ((pciexbar & mask) >= 0xf0000000U) - pci_mmcfg_config_num = 0; + return NULL; - if (pci_mmcfg_config_num) { - pci_mmcfg_config = kzalloc(sizeof(pci_mmcfg_config[0]), GFP_KERNEL); - if (!pci_mmcfg_config) - return NULL; - pci_mmcfg_config[0].address = pciexbar & mask; - pci_mmcfg_config[0].pci_segment = 0; - pci_mmcfg_config[0].start_bus_number = 0; - pci_mmcfg_config[0].end_bus_number = (len >> 20) - 1; - } + if (extend_mmcfg(1) == -1) + return NULL; + + fill_one_mmcfg(pciexbar & mask, 0, 0, (len >> 20) - 1); return "Intel Corporation 945G/GZ/P/PL Express Memory Controller Hub"; } @@ -138,18 +157,11 @@ static const char __init *pci_mmcfg_amd_fam10h(void) busnbits = 8; } - pci_mmcfg_config_num = (1 << segnbits); - pci_mmcfg_config = kzalloc(sizeof(pci_mmcfg_config[0]) * - pci_mmcfg_config_num, GFP_KERNEL); - if (!pci_mmcfg_config) + if (extend_mmcfg(1 << segnbits) == -1) return NULL; - for (i = 0; i < (1 << segnbits); i++) { - pci_mmcfg_config[i].address = base + (1<<28) * i; - pci_mmcfg_config[i].pci_segment = i; - pci_mmcfg_config[i].start_bus_number = 0; - pci_mmcfg_config[i].end_bus_number = (1 << busnbits) - 1; - } + for (i = 0; i < (1 << segnbits); i++) + fill_one_mmcfg(base + (1<<28) * i, i, 0, (1 << busnbits) - 1); return "AMD Family 10h NB"; } @@ -237,6 +249,48 @@ static struct pci_mmcfg_hostbridge_probe pci_mmcfg_probes[] __initdata = { 0x0369, pci_mmcfg_nvidia_mcp55 }, }; +static int __init cmp_mmcfg(const void *x1, const void *x2) +{ + const typeof(pci_mmcfg_config[0]) *m1 = x1; + const typeof(pci_mmcfg_config[0]) *m2 = x2; + int start1, start2; + + start1 = m1->start_bus_number; + start2 = m2->start_bus_number; + + return start1 - start2; +} + +static void __init pci_mmcfg_check_end_bus_number(void) +{ + int i; + typeof(pci_mmcfg_config[0]) *cfg, *cfgx; + + /* sort them at first */ + sort(pci_mmcfg_config, pci_mmcfg_config_num, + sizeof(pci_mmcfg_config[0]), cmp_mmcfg, NULL); + + /* last one*/ + if (pci_mmcfg_config_num > 0) { + i = pci_mmcfg_config_num - 1; + cfg = &pci_mmcfg_config[i]; + if (cfg->end_bus_number < cfg->start_bus_number) + cfg->end_bus_number = 255; + } + + /* don't overlap please */ + for (i = 0; i < pci_mmcfg_config_num - 1; i++) { + cfg = &pci_mmcfg_config[i]; + cfgx = &pci_mmcfg_config[i+1]; + + if (cfg->end_bus_number < cfg->start_bus_number) + cfg->end_bus_number = 255; + + if (cfg->end_bus_number >= cfgx->start_bus_number) + cfg->end_bus_number = cfgx->start_bus_number - 1; + } +} + static int __init pci_mmcfg_check_hostbridge(void) { u32 l; @@ -250,31 +304,33 @@ static int __init pci_mmcfg_check_hostbridge(void) pci_mmcfg_config_num = 0; pci_mmcfg_config = NULL; - name = NULL; - for (i = 0; !name && i < ARRAY_SIZE(pci_mmcfg_probes); i++) { + for (i = 0; i < ARRAY_SIZE(pci_mmcfg_probes); i++) { bus = pci_mmcfg_probes[i].bus; devfn = pci_mmcfg_probes[i].devfn; raw_pci_ops->read(0, bus, devfn, 0, 4, &l); vendor = l & 0xffff; device = (l >> 16) & 0xffff; + name = NULL; if (pci_mmcfg_probes[i].vendor == vendor && pci_mmcfg_probes[i].device == device) name = pci_mmcfg_probes[i].probe(); + + if (name) + printk(KERN_INFO "PCI: Found %s with MMCONFIG support.\n", + name); } - if (name) { - printk(KERN_INFO "PCI: Found %s %s MMCONFIG support.\n", - name, pci_mmcfg_config_num ? "with" : "without"); - } + /* some end_bus_number is crazy, fix it */ + pci_mmcfg_check_end_bus_number(); - return name != NULL; + return pci_mmcfg_config_num != 0; } static void __init pci_mmcfg_insert_resources(void) { -#define PCI_MMCFG_RESOURCE_NAME_LEN 19 +#define PCI_MMCFG_RESOURCE_NAME_LEN 24 int i; struct resource *res; char *names; @@ -292,9 +348,10 @@ static void __init pci_mmcfg_insert_resources(void) struct acpi_mcfg_allocation *cfg = &pci_mmcfg_config[i]; num_buses = cfg->end_bus_number - cfg->start_bus_number + 1; res->name = names; - snprintf(names, PCI_MMCFG_RESOURCE_NAME_LEN, "PCI MMCONFIG %u", - cfg->pci_segment); - res->start = cfg->address; + snprintf(names, PCI_MMCFG_RESOURCE_NAME_LEN, + "PCI MMCONFIG %u [%02x-%02x]", cfg->pci_segment, + cfg->start_bus_number, cfg->end_bus_number); + res->start = cfg->address + (cfg->start_bus_number << 20); res->end = res->start + (num_buses << 20) - 1; res->flags = IORESOURCE_MEM | IORESOURCE_BUSY; insert_resource(&iomem_resource, res); @@ -418,8 +475,6 @@ static void __init pci_mmcfg_reject_broken(int early) (pci_mmcfg_config[0].address == 0)) return; - cfg = &pci_mmcfg_config[0]; - for (i = 0; i < pci_mmcfg_config_num; i++) { int valid = 0; u64 addr, size; @@ -487,10 +542,10 @@ static void __init __pci_mmcfg_init(int early) known_bridge = 1; } - if (!known_bridge) { + if (!known_bridge) acpi_table_parse(ACPI_SIG_MCFG, acpi_parse_mcfg); - pci_mmcfg_reject_broken(early); - } + + pci_mmcfg_reject_broken(early); if ((pci_mmcfg_config_num == 0) || (pci_mmcfg_config == NULL) || diff --git a/arch/x86/pci/mmconfig_64.c b/arch/x86/pci/mmconfig_64.c index 30007ffc8e11..94349f8b2f96 100644 --- a/arch/x86/pci/mmconfig_64.c +++ b/arch/x86/pci/mmconfig_64.c @@ -112,13 +112,18 @@ static struct pci_raw_ops pci_mmcfg = { static void __iomem * __init mcfg_ioremap(struct acpi_mcfg_allocation *cfg) { void __iomem *addr; - u32 size; + u64 start, size; - size = (cfg->end_bus_number + 1) << 20; - addr = ioremap_nocache(cfg->address, size); + start = cfg->start_bus_number; + start <<= 20; + start += cfg->address; + size = cfg->end_bus_number + 1 - cfg->start_bus_number; + size <<= 20; + addr = ioremap_nocache(start, size); if (addr) { printk(KERN_INFO "PCI: Using MMCONFIG at %Lx - %Lx\n", - cfg->address, cfg->address + size - 1); + start, start + size - 1); + addr -= cfg->start_bus_number << 20; } return addr; } @@ -157,7 +162,7 @@ void __init pci_mmcfg_arch_free(void) for (i = 0; i < pci_mmcfg_config_num; ++i) { if (pci_mmcfg_virt[i].virt) { - iounmap(pci_mmcfg_virt[i].virt); + iounmap(pci_mmcfg_virt[i].virt + (pci_mmcfg_virt[i].cfg->start_bus_number << 20)); pci_mmcfg_virt[i].virt = NULL; pci_mmcfg_virt[i].cfg = NULL; } From 79af72d716cf1bb13b175429cf181a6c4d063ee8 Mon Sep 17 00:00:00 2001 From: Kenji Kaneshige Date: Fri, 20 Mar 2009 14:55:55 -0600 Subject: [PATCH 132/478] PCI: pci_is_root_bus helper Introduce pci_is_root_bus helper function. This will help make code more consistent, as well as prevent incorrect assumptions (such as pci_bus->self == NULL on a root bus, which is not always true). Signed-off-by: Kenji Kaneshige Signed-off-by: Alex Chiang Signed-off-by: Jesse Barnes --- include/linux/pci.h | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/include/linux/pci.h b/include/linux/pci.h index 1216843412da..50d94388e87c 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -357,6 +357,15 @@ struct pci_bus { #define pci_bus_b(n) list_entry(n, struct pci_bus, node) #define to_pci_bus(n) container_of(n, struct pci_bus, dev) +/* + * Returns true if the pci bus is root (behind host-pci bridge), + * false otherwise + */ +static inline bool pci_is_root_bus(struct pci_bus *pbus) +{ + return !(pbus->parent); +} + #ifdef CONFIG_PCI_MSI static inline bool pci_dev_msi_enabled(struct pci_dev *pci_dev) { From 90bdb3117f4209baa6d712b126f0e7791b24dc3f Mon Sep 17 00:00:00 2001 From: Trent Piepho Date: Fri, 20 Mar 2009 14:56:00 -0600 Subject: [PATCH 133/478] PCI: don't scan existing devices pci_scan_single_device is supposed to add newly discovered devices to pci_bus->devices, but doesn't check to see if the device has already been added. This can cause problems if we ever want to use this interface to rescan the PCI bus. If the device is already added, just return it. Signed-off-by: Trent Piepho Signed-off-by: Alex Chiang Signed-off-by: Jesse Barnes --- drivers/pci/probe.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index 943c49a7842c..140b9deb482c 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -1019,6 +1019,12 @@ struct pci_dev *__ref pci_scan_single_device(struct pci_bus *bus, int devfn) { struct pci_dev *dev; + dev = pci_get_slot(bus, devfn); + if (dev) { + pci_dev_put(dev); + return dev; + } + dev = pci_scan_device(bus, devfn); if (!dev) return NULL; From 1b69dfc649e6658fc38499cf704750d74cabc73d Mon Sep 17 00:00:00 2001 From: Trent Piepho Date: Fri, 20 Mar 2009 14:56:05 -0600 Subject: [PATCH 134/478] PCI: pci_scan_slot() returns newly found devices pci_scan_slot() has been rewritten to be less complex and will now return the number of *new* devices found. Existing callers need not worry because they already assume that they can't call pci_scan_slot() on an already-scanned slot. Thus, there is no semantic change for existing callers: returning newly found devices (this patch) is exactly equal to returning all found devices (before this patch). This patch adds some more groundwork to allow us to rescan the PCI bus during runtime to discover newly added devices. Signed-off-by: Trent Piepho Reviewed-by: Alex Chiang Signed-off-by: Alex Chiang Signed-off-by: Jesse Barnes --- drivers/pci/probe.c | 38 +++++++++++++++----------------------- 1 file changed, 15 insertions(+), 23 deletions(-) diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index 140b9deb482c..f3aabdf28f84 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -1043,35 +1043,27 @@ EXPORT_SYMBOL(pci_scan_single_device); * Scan a PCI slot on the specified PCI bus for devices, adding * discovered devices to the @bus->devices list. New devices * will not have is_added set. + * + * Returns the number of new devices found. */ int pci_scan_slot(struct pci_bus *bus, int devfn) { - int func, nr = 0; - int scan_all_fns; + int fn, nr = 0; + struct pci_dev *dev; - scan_all_fns = pcibios_scan_all_fns(bus, devfn); + dev = pci_scan_single_device(bus, devfn); + if (dev && !dev->is_added) /* new device? */ + nr++; - for (func = 0; func < 8; func++, devfn++) { - struct pci_dev *dev; - - dev = pci_scan_single_device(bus, devfn); - if (dev) { - nr++; - - /* - * If this is a single function device, - * don't scan past the first function. - */ - if (!dev->multifunction) { - if (func > 0) { - dev->multifunction = 1; - } else { - break; - } + if ((dev && dev->multifunction) || + (!dev && pcibios_scan_all_fns(bus, devfn))) { + for (fn = 1; fn < 8; fn++) { + dev = pci_scan_single_device(bus, devfn + fn); + if (dev) { + if (!dev->is_added) + nr++; + dev->multifunction = 1; } - } else { - if (func == 0 && !scan_all_fns) - break; } } From 74710ded8e16fc8dacbb702a5bac1a493d88549a Mon Sep 17 00:00:00 2001 From: Alex Chiang Date: Fri, 20 Mar 2009 14:56:10 -0600 Subject: [PATCH 135/478] PCI: always scan child buses While scanning bridges, we stop our scan if we encounter a bus that we've seen before, to work around some buggy chipsets. This is a good idea, but prevents us from fully scanning the PCI bus at a future time (to find newly hot-added devices, for example). Change the logic so that we skip _re-adding_ an existing bus that we've seen before, but also allow the scan to descend to all child buses. Now that we're potentially scanning our child buses again, we also need to be sure not to attempt re-initializing their BARs so we avoid that. This patch lays the groundwork to allow the user to issue a rescan of the PCI bus at any time. Signed-off-by: Alex Chiang Signed-off-by: Jesse Barnes --- drivers/pci/probe.c | 34 ++++++++++++++++++++-------------- 1 file changed, 20 insertions(+), 14 deletions(-) diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index f3aabdf28f84..f69256c63b2b 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -511,21 +511,21 @@ int __devinit pci_scan_bridge(struct pci_bus *bus, struct pci_dev *dev, int max, /* * If we already got to this bus through a different bridge, - * ignore it. This can happen with the i450NX chipset. + * don't re-add it. This can happen with the i450NX chipset. + * + * However, we continue to descend down the hierarchy and + * scan remaining child buses. */ - if (pci_find_bus(pci_domain_nr(bus), busnr)) { - dev_info(&dev->dev, "bus %04x:%02x already known\n", - pci_domain_nr(bus), busnr); - goto out; + child = pci_find_bus(pci_domain_nr(bus), busnr); + if (!child) { + child = pci_add_new_bus(bus, dev, busnr); + if (!child) + goto out; + child->primary = buses & 0xFF; + child->subordinate = (buses >> 16) & 0xFF; + child->bridge_ctl = bctl; } - child = pci_add_new_bus(bus, dev, busnr); - if (!child) - goto out; - child->primary = buses & 0xFF; - child->subordinate = (buses >> 16) & 0xFF; - child->bridge_ctl = bctl; - cmax = pci_scan_child_bus(child); if (cmax > max) max = cmax; @@ -1092,8 +1092,14 @@ unsigned int __devinit pci_scan_child_bus(struct pci_bus *bus) * After performing arch-dependent fixup of the bus, look behind * all PCI-to-PCI bridges on this bus. */ - pr_debug("PCI: Fixups for bus %04x:%02x\n", pci_domain_nr(bus), bus->number); - pcibios_fixup_bus(bus); + if (!bus->is_added) { + pr_debug("PCI: Fixups for bus %04x:%02x\n", + pci_domain_nr(bus), bus->number); + pcibios_fixup_bus(bus); + if (pci_is_root_bus(bus)) + bus->is_added = 1; + } + for (pass=0; pass < 2; pass++) list_for_each_entry(dev, &bus->devices, bus_list) { if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE || From b73e97d95c168cbc19bd1208c894077f25931ba1 Mon Sep 17 00:00:00 2001 From: Alex Chiang Date: Fri, 20 Mar 2009 14:56:15 -0600 Subject: [PATCH 136/478] PCI: do not initialize bridges more than once In preparation for PCI core hotplug, we need to ensure that we do not attempt to re-initialize bridges that have already been initialized. We only need to worry about non-root buses, since we will not allow root bus removal. Reported-by: Kenji Kaneshige Signed-off-by: Alex Chiang Signed-off-by: Jesse Barnes --- drivers/pci/setup-bus.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c index 170a3eda9dd3..334285a8e237 100644 --- a/drivers/pci/setup-bus.c +++ b/drivers/pci/setup-bus.c @@ -144,6 +144,9 @@ static void pci_setup_bridge(struct pci_bus *bus) struct pci_bus_region region; u32 l, bu, lu, io_upper16; + if (!pci_is_root_bus(bus) && bus->is_added) + return; + dev_info(&bridge->dev, "PCI bridge, secondary bus %04x:%02x\n", pci_domain_nr(bus), bus->number); From 9dd90cafa7a712d283e2e0c625b022e19f746762 Mon Sep 17 00:00:00 2001 From: Alex Chiang Date: Fri, 20 Mar 2009 14:56:20 -0600 Subject: [PATCH 137/478] PCI: do not enable bridges more than once In preparation for PCI core hotplug, we need to ensure that we do not attempt to re-enable bridges that have already been enabled. Reported-by: Kenji Kaneshige Signed-off-by: Alex Chiang Signed-off-by: Jesse Barnes --- drivers/pci/bus.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/pci/bus.c b/drivers/pci/bus.c index 118c77778d29..68f91a252595 100644 --- a/drivers/pci/bus.c +++ b/drivers/pci/bus.c @@ -184,8 +184,10 @@ void pci_enable_bridges(struct pci_bus *bus) list_for_each_entry(dev, &bus->devices, bus_list) { if (dev->subordinate) { - retval = pci_enable_device(dev); - pci_set_master(dev); + if (atomic_read(&dev->enable_cnt) == 0) { + retval = pci_enable_device(dev); + pci_set_master(dev); + } pci_enable_bridges(dev->subordinate); } } From 3ed4fd96b3188406ac5357d9290bcffa08c65cf6 Mon Sep 17 00:00:00 2001 From: Alex Chiang Date: Fri, 20 Mar 2009 14:56:25 -0600 Subject: [PATCH 138/478] PCI: Introduce pci_rescan_bus() This API is used by the PCI core to rescan a bus and rediscover newly added devices. Over time, it is expected that the various PCI hotplug drivers will migrate to this interface and away from the old pci_do_scan_bus() interface. Signed-off-by: Alex Chiang Signed-off-by: Jesse Barnes --- drivers/pci/hotplug/fakephp.c | 6 +++--- drivers/pci/probe.c | 32 ++++++++++++++++++++++++++++++++ include/linux/pci.h | 3 +++ 3 files changed, 38 insertions(+), 3 deletions(-) diff --git a/drivers/pci/hotplug/fakephp.c b/drivers/pci/hotplug/fakephp.c index d8649e127298..16063745766e 100644 --- a/drivers/pci/hotplug/fakephp.c +++ b/drivers/pci/hotplug/fakephp.c @@ -245,12 +245,12 @@ static int pci_rescan_slot(struct pci_dev *temp) /** - * pci_rescan_bus - Rescan PCI bus + * pci_rescan_bus_local - fakephp version of rescan PCI bus * @bus: the PCI bus to rescan * * Call pci_rescan_slot for each possible function of the bus. */ -static void pci_rescan_bus(const struct pci_bus *bus) +static void pci_rescan_bus_local(const struct pci_bus *bus) { unsigned int devfn; struct pci_dev *dev; @@ -291,7 +291,7 @@ static void pci_rescan_buses(const struct list_head *list) const struct list_head *l; list_for_each(l,list) { const struct pci_bus *b = pci_bus_b(l); - pci_rescan_bus(b); + pci_rescan_bus_local(b); pci_rescan_buses(&b->children); } } diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index f69256c63b2b..60a8e5fec6c5 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -1212,6 +1212,38 @@ struct pci_bus * __devinit pci_scan_bus_parented(struct device *parent, EXPORT_SYMBOL(pci_scan_bus_parented); #ifdef CONFIG_HOTPLUG +/** + * pci_rescan_bus - scan a PCI bus for devices. + * @bus: PCI bus to scan + * + * Scan a PCI bus and child buses for new devices, adds them, + * and enables them. + * + * Returns the max number of subordinate bus discovered. + */ +unsigned int __devinit pci_rescan_bus(struct pci_bus *bus) +{ + unsigned int max; + struct pci_dev *dev; + + max = pci_scan_child_bus(bus); + + up_read(&pci_bus_sem); + list_for_each_entry(dev, &bus->devices, bus_list) + if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE || + dev->hdr_type == PCI_HEADER_TYPE_CARDBUS) + if (dev->subordinate) + pci_bus_size_bridges(dev->subordinate); + down_read(&pci_bus_sem); + + pci_bus_assign_resources(bus); + pci_enable_bridges(bus); + pci_bus_add_devices(bus); + + return max; +} +EXPORT_SYMBOL_GPL(pci_rescan_bus); + EXPORT_SYMBOL(pci_add_new_bus); EXPORT_SYMBOL(pci_scan_slot); EXPORT_SYMBOL(pci_scan_bridge); diff --git a/include/linux/pci.h b/include/linux/pci.h index 50d94388e87c..6fb335b0d74f 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -726,6 +726,9 @@ int pci_back_from_sleep(struct pci_dev *dev); /* Functions for PCI Hotplug drivers to use */ int pci_bus_find_capability(struct pci_bus *bus, unsigned int devfn, int cap); +#ifdef CONFIG_HOTPLUG +unsigned int pci_rescan_bus(struct pci_bus *bus); +#endif /* Vital product data routines */ ssize_t pci_read_vpd(struct pci_dev *dev, loff_t pos, size_t count, void *buf); From 705b1aaa823e800490f157cd9366ad8cff385f5f Mon Sep 17 00:00:00 2001 From: Alex Chiang Date: Fri, 20 Mar 2009 14:56:31 -0600 Subject: [PATCH 139/478] PCI: Introduce /sys/bus/pci/rescan This interface allows the user to force a rescan of all PCI buses in system, and rediscover devices that have been removed earlier. pci_bus_attrs implementation from Trent Piepho. Thanks to Vegard Nossum for discovering locking issues with the sysfs interface. Cc: Trent Piepho Signed-off-by: Alex Chiang Signed-off-by: Jesse Barnes --- Documentation/ABI/testing/sysfs-bus-pci | 9 +++++++++ drivers/pci/pci-driver.c | 1 + drivers/pci/pci-sysfs.c | 26 +++++++++++++++++++++++++ drivers/pci/pci.h | 6 ++++++ drivers/pci/probe.c | 4 ++-- 5 files changed, 44 insertions(+), 2 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-bus-pci b/Documentation/ABI/testing/sysfs-bus-pci index d175a2a5fac2..e6ad047cb3c2 100644 --- a/Documentation/ABI/testing/sysfs-bus-pci +++ b/Documentation/ABI/testing/sysfs-bus-pci @@ -57,6 +57,15 @@ Description: match the driver to the device. For example: # echo "8086 10f5" > /sys/bus/pci/drivers/foo/remove_id +What: /sys/bus/pci/rescan +Date: January 2009 +Contact: Linux PCI developers +Description: + Writing a non-zero value to this attribute will + force a rescan of all PCI buses in the system, and + re-discover previously removed devices. + Depends on CONFIG_HOTPLUG. + What: /sys/bus/pci/devices/.../vpd Date: February 2008 Contact: Ben Hutchings diff --git a/drivers/pci/pci-driver.c b/drivers/pci/pci-driver.c index 87a5ddbb324f..95d198570290 100644 --- a/drivers/pci/pci-driver.c +++ b/drivers/pci/pci-driver.c @@ -1049,6 +1049,7 @@ struct bus_type pci_bus_type = { .remove = pci_device_remove, .shutdown = pci_device_shutdown, .dev_attrs = pci_dev_attrs, + .bus_attrs = pci_bus_attrs, .pm = PCI_PM_OPS_PTR, }; diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c index ec7a175dcbaf..be7468a5eb72 100644 --- a/drivers/pci/pci-sysfs.c +++ b/drivers/pci/pci-sysfs.c @@ -219,6 +219,32 @@ msi_bus_store(struct device *dev, struct device_attribute *attr, return count; } +#ifdef CONFIG_HOTPLUG +static DEFINE_MUTEX(pci_remove_rescan_mutex); +static ssize_t bus_rescan_store(struct bus_type *bus, const char *buf, + size_t count) +{ + unsigned long val; + struct pci_bus *b = NULL; + + if (strict_strtoul(buf, 0, &val) < 0) + return -EINVAL; + + if (val) { + mutex_lock(&pci_remove_rescan_mutex); + while ((b = pci_find_next_bus(b)) != NULL) + pci_rescan_bus(b); + mutex_unlock(&pci_remove_rescan_mutex); + } + return count; +} + +struct bus_attribute pci_bus_attrs[] = { + __ATTR(rescan, (S_IWUSR|S_IWGRP), NULL, bus_rescan_store), + __ATTR_NULL +}; +#endif + struct device_attribute pci_dev_attrs[] = { __ATTR_RO(resource), __ATTR_RO(vendor), diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index 22dcfdb75d91..45833a5bca61 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -138,6 +138,12 @@ extern int pcie_mch_quirk; extern struct device_attribute pci_dev_attrs[]; extern struct device_attribute dev_attr_cpuaffinity; extern struct device_attribute dev_attr_cpulistaffinity; +#ifdef CONFIG_HOTPLUG +extern struct bus_attribute pci_bus_attrs[]; +#else +#define pci_bus_attrs NULL +#endif + /** * pci_match_one_device - Tell if a PCI device structure has a matching diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index 60a8e5fec6c5..56c71e585f3d 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -1228,13 +1228,13 @@ unsigned int __devinit pci_rescan_bus(struct pci_bus *bus) max = pci_scan_child_bus(bus); - up_read(&pci_bus_sem); + down_read(&pci_bus_sem); list_for_each_entry(dev, &bus->devices, bus_list) if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE || dev->hdr_type == PCI_HEADER_TYPE_CARDBUS) if (dev->subordinate) pci_bus_size_bridges(dev->subordinate); - down_read(&pci_bus_sem); + up_read(&pci_bus_sem); pci_bus_assign_resources(bus); pci_enable_bridges(bus); From 77c27c7b49d69d45ccb94e481653f024f1ac6650 Mon Sep 17 00:00:00 2001 From: Alex Chiang Date: Fri, 20 Mar 2009 14:56:36 -0600 Subject: [PATCH 140/478] PCI: Introduce /sys/bus/pci/devices/.../remove This patch adds an attribute named "remove" to a PCI device's sysfs directory. Writing a non-zero value to this attribute will remove the PCI device and any children of it. Trent Piepho wrote the original implementation and documentation. Thanks to Vegard Nossum for testing under kmemcheck and finding locking issues with the sysfs interface. Cc: Trent Piepho Tested-by: Vegard Nossum Signed-off-by: Alex Chiang Signed-off-by: Jesse Barnes --- Documentation/ABI/testing/sysfs-bus-pci | 8 ++++++ Documentation/filesystems/sysfs-pci.txt | 10 +++++++ drivers/pci/pci-sysfs.c | 36 +++++++++++++++++++++++++ 3 files changed, 54 insertions(+) diff --git a/Documentation/ABI/testing/sysfs-bus-pci b/Documentation/ABI/testing/sysfs-bus-pci index e6ad047cb3c2..daa791a9e85e 100644 --- a/Documentation/ABI/testing/sysfs-bus-pci +++ b/Documentation/ABI/testing/sysfs-bus-pci @@ -66,6 +66,14 @@ Description: re-discover previously removed devices. Depends on CONFIG_HOTPLUG. +What: /sys/bus/pci/devices/.../remove +Date: January 2009 +Contact: Linux PCI developers +Description: + Writing a non-zero value to this attribute will + hot-remove the PCI device and any of its children. + Depends on CONFIG_HOTPLUG. + What: /sys/bus/pci/devices/.../vpd Date: February 2008 Contact: Ben Hutchings diff --git a/Documentation/filesystems/sysfs-pci.txt b/Documentation/filesystems/sysfs-pci.txt index 9f8740ca3f3b..26e4b8bc53ee 100644 --- a/Documentation/filesystems/sysfs-pci.txt +++ b/Documentation/filesystems/sysfs-pci.txt @@ -12,6 +12,7 @@ that support it. For example, a given bus might look like this: | |-- enable | |-- irq | |-- local_cpus + | |-- remove | |-- resource | |-- resource0 | |-- resource1 @@ -36,6 +37,7 @@ files, each with their own function. enable Whether the device is enabled (ascii, rw) irq IRQ number (ascii, ro) local_cpus nearby CPU mask (cpumask, ro) + remove remove device from kernel's list (ascii, wo) resource PCI resource host addresses (ascii, ro) resource0..N PCI resource N, if present (binary, mmap) resource0_wc..N_wc PCI WC map resource N, if prefetchable (binary, mmap) @@ -46,6 +48,7 @@ files, each with their own function. ro - read only file rw - file is readable and writable + wo - write only file mmap - file is mmapable ascii - file contains ascii text binary - file contains binary data @@ -73,6 +76,13 @@ that the device must be enabled for a rom read to return data succesfully. In the event a driver is not bound to the device, it can be enabled using the 'enable' file, documented above. +The 'remove' file is used to remove the PCI device, by writing a non-zero +integer to the file. This does not involve any kind of hot-plug functionality, +e.g. powering off the device. The device is removed from the kernel's list of +PCI devices, the sysfs directory for it is removed, and the device will be +removed from any drivers attached to it. Removal of PCI root buses is +disallowed. + Accessing legacy resources through sysfs ---------------------------------------- diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c index be7468a5eb72..e16990ecc024 100644 --- a/drivers/pci/pci-sysfs.c +++ b/drivers/pci/pci-sysfs.c @@ -243,6 +243,39 @@ struct bus_attribute pci_bus_attrs[] = { __ATTR(rescan, (S_IWUSR|S_IWGRP), NULL, bus_rescan_store), __ATTR_NULL }; + +static void remove_callback(struct device *dev) +{ + struct pci_dev *pdev = to_pci_dev(dev); + + mutex_lock(&pci_remove_rescan_mutex); + pci_remove_bus_device(pdev); + mutex_unlock(&pci_remove_rescan_mutex); +} + +static ssize_t +remove_store(struct device *dev, struct device_attribute *dummy, + const char *buf, size_t count) +{ + int ret = 0; + unsigned long val; + struct pci_dev *pdev = to_pci_dev(dev); + + if (strict_strtoul(buf, 0, &val) < 0) + return -EINVAL; + + if (pci_is_root_bus(pdev->bus)) + return -EBUSY; + + /* An attribute cannot be unregistered by one of its own methods, + * so we have to use this roundabout approach. + */ + if (val) + ret = device_schedule_callback(dev, remove_callback); + if (ret) + count = ret; + return count; +} #endif struct device_attribute pci_dev_attrs[] = { @@ -263,6 +296,9 @@ struct device_attribute pci_dev_attrs[] = { __ATTR(broken_parity_status,(S_IRUGO|S_IWUSR), broken_parity_status_show,broken_parity_status_store), __ATTR(msi_bus, 0644, msi_bus_show, msi_bus_store), +#ifdef CONFIG_HOTPLUG + __ATTR(remove, (S_IWUSR|S_IWGRP), NULL, remove_store), +#endif __ATTR_NULL, }; From 738a6396c223b486304dda778119dbbca563f019 Mon Sep 17 00:00:00 2001 From: Alex Chiang Date: Fri, 20 Mar 2009 14:56:41 -0600 Subject: [PATCH 141/478] PCI: Introduce /sys/bus/pci/devices/.../rescan This interface allows the user to force a rescan of the device's parent bus and all subordinate buses, and rediscover devices removed earlier from this part of the device tree. Cc: Trent Piepho Signed-off-by: Alex Chiang Signed-off-by: Jesse Barnes --- Documentation/ABI/testing/sysfs-bus-pci | 10 ++++++++++ drivers/pci/pci-sysfs.c | 19 +++++++++++++++++++ 2 files changed, 29 insertions(+) diff --git a/Documentation/ABI/testing/sysfs-bus-pci b/Documentation/ABI/testing/sysfs-bus-pci index daa791a9e85e..97ad190e13af 100644 --- a/Documentation/ABI/testing/sysfs-bus-pci +++ b/Documentation/ABI/testing/sysfs-bus-pci @@ -74,6 +74,16 @@ Description: hot-remove the PCI device and any of its children. Depends on CONFIG_HOTPLUG. +What: /sys/bus/pci/devices/.../rescan +Date: January 2009 +Contact: Linux PCI developers +Description: + Writing a non-zero value to this attribute will + force a rescan of the device's parent bus and all + child buses, and re-discover devices removed earlier + from this part of the device tree. + Depends on CONFIG_HOTPLUG. + What: /sys/bus/pci/devices/.../vpd Date: February 2008 Contact: Ben Hutchings diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c index e16990ecc024..e9a8706a6401 100644 --- a/drivers/pci/pci-sysfs.c +++ b/drivers/pci/pci-sysfs.c @@ -244,6 +244,24 @@ struct bus_attribute pci_bus_attrs[] = { __ATTR_NULL }; +static ssize_t +dev_rescan_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + unsigned long val; + struct pci_dev *pdev = to_pci_dev(dev); + + if (strict_strtoul(buf, 0, &val) < 0) + return -EINVAL; + + if (val) { + mutex_lock(&pci_remove_rescan_mutex); + pci_rescan_bus(pdev->bus); + mutex_unlock(&pci_remove_rescan_mutex); + } + return count; +} + static void remove_callback(struct device *dev) { struct pci_dev *pdev = to_pci_dev(dev); @@ -298,6 +316,7 @@ struct device_attribute pci_dev_attrs[] = { __ATTR(msi_bus, 0644, msi_bus_show, msi_bus_store), #ifdef CONFIG_HOTPLUG __ATTR(remove, (S_IWUSR|S_IWGRP), NULL, remove_store), + __ATTR(rescan, (S_IWUSR|S_IWGRP), NULL, dev_rescan_store), #endif __ATTR_NULL, }; From 83dbf66f04b96e65c6c18436c16d40f9cf8630aa Mon Sep 17 00:00:00 2001 From: Trent Piepho Date: Fri, 20 Mar 2009 14:56:46 -0600 Subject: [PATCH 142/478] PCI Hotplug: restore fakephp interface with complete reimplementation A complete re-implementation of fakephp is necessary if it is to present its former interface (pre-2.6.27, when it broke). The reason is that PCI hotplug drivers call pci_hp_register(), which enforces the rule that only one /sys/bus/pci/slots/ file may be created per physical slot. The change breaks the old fakephp's assumption that it could create a file per function. So we re-implement fakephp to avoid using the standard PCI hotplug API so that we can restore the old fakephp user interface. It puts entries in /sys/bus/pci/slots with the names of all PCI devices/functions, exactly symmetrical to what is shown in /sys/bus/pci/devices. Each slots/ entry has a "power" attribute, which works the same way as the fakephp driver's power attribute has worked. There are a few improvements over old fakephp, which couldn't handle PCI devices being added or removed via a means outside of fakephp's knowledge. If a device was added another way, old fakephp didn't notice and didn't create the fake slot for it. If a device was removed another way, old fakephp didn't delete the fake slot for it (and accessing the stale slot caused an oops). The new implementation overcomes these limitations. As a consequence, removing a bridge with other devices behind it now works as well, which is something else old fakephp couldn't do previously. This duplicates a tiny bit of the code in the PCI core that does this same function. Re-using that code ends up being more complex than duplicating it, and it makes code in the PCI core more ugly just to support this legacy fakephp interface compatibility layer. Reviewed-by: James Cameron Signed-off-by: Trent Piepho Signed-off-by: Alex Chiang Signed-off-by: Jesse Barnes --- drivers/pci/hotplug/Makefile | 2 +- drivers/pci/hotplug/fakephp.c | 395 --------------------------- drivers/pci/hotplug/legacy_fakephp.c | 162 +++++++++++ 3 files changed, 163 insertions(+), 396 deletions(-) delete mode 100644 drivers/pci/hotplug/fakephp.c create mode 100644 drivers/pci/hotplug/legacy_fakephp.c diff --git a/drivers/pci/hotplug/Makefile b/drivers/pci/hotplug/Makefile index 2aa117c8cd87..3314fe881509 100644 --- a/drivers/pci/hotplug/Makefile +++ b/drivers/pci/hotplug/Makefile @@ -20,7 +20,7 @@ obj-$(CONFIG_HOTPLUG_PCI_RPA_DLPAR) += rpadlpar_io.o obj-$(CONFIG_HOTPLUG_PCI_SGI) += sgi_hotplug.o # Link this last so it doesn't claim devices that have a real hotplug driver -obj-$(CONFIG_HOTPLUG_PCI_FAKE) += fakephp.o +obj-$(CONFIG_HOTPLUG_PCI_FAKE) += legacy_fakephp.o pci_hotplug-objs := pci_hotplug_core.o diff --git a/drivers/pci/hotplug/fakephp.c b/drivers/pci/hotplug/fakephp.c deleted file mode 100644 index 16063745766e..000000000000 --- a/drivers/pci/hotplug/fakephp.c +++ /dev/null @@ -1,395 +0,0 @@ -/* - * Fake PCI Hot Plug Controller Driver - * - * Copyright (C) 2003 Greg Kroah-Hartman - * Copyright (C) 2003 IBM Corp. - * Copyright (C) 2003 Rolf Eike Beer - * - * Based on ideas and code from: - * Vladimir Kondratiev - * Rolf Eike Beer - * - * All rights reserved. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, version 2 of the License. - * - * Send feedback to - */ - -/* - * - * This driver will "emulate" removing PCI devices from the system. If - * the "power" file is written to with "0" then the specified PCI device - * will be completely removed from the kernel. - * - * WARNING, this does NOT turn off the power to the PCI device. This is - * a "logical" removal, not a physical or electrical removal. - * - * Use this module at your own risk, you have been warned! - * - * Enabling PCI devices is left as an exercise for the reader... - * - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include "../pci.h" - -#if !defined(MODULE) - #define MY_NAME "fakephp" -#else - #define MY_NAME THIS_MODULE->name -#endif - -#define dbg(format, arg...) \ - do { \ - if (debug) \ - printk(KERN_DEBUG "%s: " format, \ - MY_NAME , ## arg); \ - } while (0) -#define err(format, arg...) printk(KERN_ERR "%s: " format, MY_NAME , ## arg) -#define info(format, arg...) printk(KERN_INFO "%s: " format, MY_NAME , ## arg) - -#define DRIVER_AUTHOR "Greg Kroah-Hartman " -#define DRIVER_DESC "Fake PCI Hot Plug Controller Driver" - -struct dummy_slot { - struct list_head node; - struct hotplug_slot *slot; - struct pci_dev *dev; - struct work_struct remove_work; - unsigned long removed; -}; - -static int debug; -static int dup_slots; -static LIST_HEAD(slot_list); -static struct workqueue_struct *dummyphp_wq; - -static void pci_rescan_worker(struct work_struct *work); -static DECLARE_WORK(pci_rescan_work, pci_rescan_worker); - -static int enable_slot (struct hotplug_slot *slot); -static int disable_slot (struct hotplug_slot *slot); - -static struct hotplug_slot_ops dummy_hotplug_slot_ops = { - .owner = THIS_MODULE, - .enable_slot = enable_slot, - .disable_slot = disable_slot, -}; - -static void dummy_release(struct hotplug_slot *slot) -{ - struct dummy_slot *dslot = slot->private; - - list_del(&dslot->node); - kfree(dslot->slot->info); - kfree(dslot->slot); - pci_dev_put(dslot->dev); - kfree(dslot); -} - -#define SLOT_NAME_SIZE 8 - -static int add_slot(struct pci_dev *dev) -{ - struct dummy_slot *dslot; - struct hotplug_slot *slot; - char name[SLOT_NAME_SIZE]; - int retval = -ENOMEM; - static int count = 1; - - slot = kzalloc(sizeof(struct hotplug_slot), GFP_KERNEL); - if (!slot) - goto error; - - slot->info = kzalloc(sizeof(struct hotplug_slot_info), GFP_KERNEL); - if (!slot->info) - goto error_slot; - - slot->info->power_status = 1; - slot->info->max_bus_speed = PCI_SPEED_UNKNOWN; - slot->info->cur_bus_speed = PCI_SPEED_UNKNOWN; - - dslot = kzalloc(sizeof(struct dummy_slot), GFP_KERNEL); - if (!dslot) - goto error_info; - - if (dup_slots) - snprintf(name, SLOT_NAME_SIZE, "fake"); - else - snprintf(name, SLOT_NAME_SIZE, "fake%d", count++); - dbg("slot->name = %s\n", name); - slot->ops = &dummy_hotplug_slot_ops; - slot->release = &dummy_release; - slot->private = dslot; - - retval = pci_hp_register(slot, dev->bus, PCI_SLOT(dev->devfn), name); - if (retval) { - err("pci_hp_register failed with error %d\n", retval); - goto error_dslot; - } - - dbg("slot->name = %s\n", hotplug_slot_name(slot)); - dslot->slot = slot; - dslot->dev = pci_dev_get(dev); - list_add (&dslot->node, &slot_list); - return retval; - -error_dslot: - kfree(dslot); -error_info: - kfree(slot->info); -error_slot: - kfree(slot); -error: - return retval; -} - -static int __init pci_scan_buses(void) -{ - struct pci_dev *dev = NULL; - int lastslot = 0; - - while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) { - if (PCI_FUNC(dev->devfn) > 0 && - lastslot == PCI_SLOT(dev->devfn)) - continue; - lastslot = PCI_SLOT(dev->devfn); - add_slot(dev); - } - - return 0; -} - -static void remove_slot(struct dummy_slot *dslot) -{ - int retval; - - dbg("removing slot %s\n", hotplug_slot_name(dslot->slot)); - retval = pci_hp_deregister(dslot->slot); - if (retval) - err("Problem unregistering a slot %s\n", - hotplug_slot_name(dslot->slot)); -} - -/* called from the single-threaded workqueue handler to remove a slot */ -static void remove_slot_worker(struct work_struct *work) -{ - struct dummy_slot *dslot = - container_of(work, struct dummy_slot, remove_work); - remove_slot(dslot); -} - -/** - * pci_rescan_slot - Rescan slot - * @temp: Device template. Should be set: bus and devfn. - * - * Tries hard not to re-enable already existing devices; - * also handles scanning of subfunctions. - */ -static int pci_rescan_slot(struct pci_dev *temp) -{ - struct pci_bus *bus = temp->bus; - struct pci_dev *dev; - int func; - u8 hdr_type; - int count = 0; - - if (!pci_read_config_byte(temp, PCI_HEADER_TYPE, &hdr_type)) { - temp->hdr_type = hdr_type & 0x7f; - if ((dev = pci_get_slot(bus, temp->devfn)) != NULL) - pci_dev_put(dev); - else { - dev = pci_scan_single_device(bus, temp->devfn); - if (dev) { - dbg("New device on %s function %x:%x\n", - bus->name, temp->devfn >> 3, - temp->devfn & 7); - count++; - } - } - /* multifunction device? */ - if (!(hdr_type & 0x80)) - return count; - - /* continue scanning for other functions */ - for (func = 1, temp->devfn++; func < 8; func++, temp->devfn++) { - if (pci_read_config_byte(temp, PCI_HEADER_TYPE, &hdr_type)) - continue; - temp->hdr_type = hdr_type & 0x7f; - - if ((dev = pci_get_slot(bus, temp->devfn)) != NULL) - pci_dev_put(dev); - else { - dev = pci_scan_single_device(bus, temp->devfn); - if (dev) { - dbg("New device on %s function %x:%x\n", - bus->name, temp->devfn >> 3, - temp->devfn & 7); - count++; - } - } - } - } - - return count; -} - - -/** - * pci_rescan_bus_local - fakephp version of rescan PCI bus - * @bus: the PCI bus to rescan - * - * Call pci_rescan_slot for each possible function of the bus. - */ -static void pci_rescan_bus_local(const struct pci_bus *bus) -{ - unsigned int devfn; - struct pci_dev *dev; - int retval; - int found = 0; - dev = alloc_pci_dev(); - if (!dev) - return; - - dev->bus = (struct pci_bus*)bus; - dev->sysdata = bus->sysdata; - for (devfn = 0; devfn < 0x100; devfn += 8) { - dev->devfn = devfn; - found += pci_rescan_slot(dev); - } - - if (found) { - pci_bus_assign_resources(bus); - list_for_each_entry(dev, &bus->devices, bus_list) { - /* Skip already-added devices */ - if (dev->is_added) - continue; - retval = pci_bus_add_device(dev); - if (retval) - dev_err(&dev->dev, - "Error adding device, continuing\n"); - else - add_slot(dev); - } - pci_bus_add_devices(bus); - } - kfree(dev); -} - -/* recursively scan all buses */ -static void pci_rescan_buses(const struct list_head *list) -{ - const struct list_head *l; - list_for_each(l,list) { - const struct pci_bus *b = pci_bus_b(l); - pci_rescan_bus_local(b); - pci_rescan_buses(&b->children); - } -} - -/* initiate rescan of all pci buses */ -static inline void pci_rescan(void) { - pci_rescan_buses(&pci_root_buses); -} - -/* called from the single-threaded workqueue handler to rescan all pci buses */ -static void pci_rescan_worker(struct work_struct *work) -{ - pci_rescan(); -} - -static int enable_slot(struct hotplug_slot *hotplug_slot) -{ - /* mis-use enable_slot for rescanning of the pci bus */ - cancel_work_sync(&pci_rescan_work); - queue_work(dummyphp_wq, &pci_rescan_work); - return 0; -} - -static int disable_slot(struct hotplug_slot *slot) -{ - struct dummy_slot *dslot; - struct pci_dev *dev; - int func; - - if (!slot) - return -ENODEV; - dslot = slot->private; - - dbg("%s - physical_slot = %s\n", __func__, hotplug_slot_name(slot)); - - for (func = 7; func >= 0; func--) { - dev = pci_get_slot(dslot->dev->bus, dslot->dev->devfn + func); - if (!dev) - continue; - - if (test_and_set_bit(0, &dslot->removed)) { - dbg("Slot already scheduled for removal\n"); - pci_dev_put(dev); - return -ENODEV; - } - - /* remove the device from the pci core */ - pci_remove_bus_device(dev); - - /* queue work item to blow away this sysfs entry and other - * parts. - */ - INIT_WORK(&dslot->remove_work, remove_slot_worker); - queue_work(dummyphp_wq, &dslot->remove_work); - - pci_dev_put(dev); - } - return 0; -} - -static void cleanup_slots (void) -{ - struct list_head *tmp; - struct list_head *next; - struct dummy_slot *dslot; - - destroy_workqueue(dummyphp_wq); - list_for_each_safe (tmp, next, &slot_list) { - dslot = list_entry (tmp, struct dummy_slot, node); - remove_slot(dslot); - } - -} - -static int __init dummyphp_init(void) -{ - info(DRIVER_DESC "\n"); - - dummyphp_wq = create_singlethread_workqueue(MY_NAME); - if (!dummyphp_wq) - return -ENOMEM; - - return pci_scan_buses(); -} - - -static void __exit dummyphp_exit(void) -{ - cleanup_slots(); -} - -module_init(dummyphp_init); -module_exit(dummyphp_exit); - -MODULE_AUTHOR(DRIVER_AUTHOR); -MODULE_DESCRIPTION(DRIVER_DESC); -MODULE_LICENSE("GPL"); -module_param(debug, bool, S_IRUGO | S_IWUSR); -MODULE_PARM_DESC(debug, "Debugging mode enabled or not"); -module_param(dup_slots, bool, S_IRUGO | S_IWUSR); -MODULE_PARM_DESC(dup_slots, "Force duplicate slot names for debugging"); diff --git a/drivers/pci/hotplug/legacy_fakephp.c b/drivers/pci/hotplug/legacy_fakephp.c new file mode 100644 index 000000000000..2dc7828df480 --- /dev/null +++ b/drivers/pci/hotplug/legacy_fakephp.c @@ -0,0 +1,162 @@ +/* Works like the fakephp driver used to, except a little better. + * + * - It's possible to remove devices with subordinate busses. + * - New PCI devices that appear via any method, not just a fakephp triggered + * rescan, will be noticed. + * - Devices that are removed via any method, not just a fakephp triggered + * removal, will also be noticed. + * + * Uses nothing from the pci-hotplug subsystem. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "../pci.h" + +struct legacy_slot { + struct kobject kobj; + struct pci_dev *dev; + struct list_head list; +}; + +static LIST_HEAD(legacy_list); + +static ssize_t legacy_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct legacy_slot *slot = container_of(kobj, typeof(*slot), kobj); + strcpy(buf, "1\n"); + return 2; +} + +static void remove_callback(void *data) +{ + pci_remove_bus_device((struct pci_dev *)data); +} + +static ssize_t legacy_store(struct kobject *kobj, struct attribute *attr, + const char *buf, size_t len) +{ + struct legacy_slot *slot = container_of(kobj, typeof(*slot), kobj); + unsigned long val; + + if (strict_strtoul(buf, 0, &val) < 0) + return -EINVAL; + + if (val) + pci_rescan_bus(slot->dev->bus); + else + sysfs_schedule_callback(&slot->dev->dev.kobj, remove_callback, + slot->dev, THIS_MODULE); + return len; +} + +static struct attribute *legacy_attrs[] = { + &(struct attribute){ .name = "power", .mode = 0644 }, + NULL, +}; + +static void legacy_release(struct kobject *kobj) +{ + struct legacy_slot *slot = container_of(kobj, typeof(*slot), kobj); + + pci_dev_put(slot->dev); + kfree(slot); +} + +static struct kobj_type legacy_ktype = { + .sysfs_ops = &(struct sysfs_ops){ + .store = legacy_store, .show = legacy_show + }, + .release = &legacy_release, + .default_attrs = legacy_attrs, +}; + +static int legacy_add_slot(struct pci_dev *pdev) +{ + struct legacy_slot *slot = kzalloc(sizeof(*slot), GFP_KERNEL); + + if (!slot) + return -ENOMEM; + + if (kobject_init_and_add(&slot->kobj, &legacy_ktype, + &pci_slots_kset->kobj, "%s", + pdev->dev.bus_id)) { + dev_warn(&pdev->dev, "Failed to created legacy fake slot\n"); + return -EINVAL; + } + slot->dev = pci_dev_get(pdev); + + list_add(&slot->list, &legacy_list); + + return 0; +} + +static int legacy_notify(struct notifier_block *nb, + unsigned long action, void *data) +{ + struct pci_dev *pdev = to_pci_dev(data); + + if (action == BUS_NOTIFY_ADD_DEVICE) { + legacy_add_slot(pdev); + } else if (action == BUS_NOTIFY_DEL_DEVICE) { + struct legacy_slot *slot; + + list_for_each_entry(slot, &legacy_list, list) + if (slot->dev == pdev) + goto found; + + dev_warn(&pdev->dev, "Missing legacy fake slot?"); + return -ENODEV; +found: + kobject_del(&slot->kobj); + list_del(&slot->list); + kobject_put(&slot->kobj); + } + + return 0; +} + +static struct notifier_block legacy_notifier = { + .notifier_call = legacy_notify +}; + +static int __init init_legacy(void) +{ + struct pci_dev *pdev = NULL; + + /* Add existing devices */ + while ((pdev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, pdev))) + legacy_add_slot(pdev); + + /* Be alerted of any new ones */ + bus_register_notifier(&pci_bus_type, &legacy_notifier); + return 0; +} +module_init(init_legacy); + +static void __exit remove_legacy(void) +{ + struct legacy_slot *slot, *tmp; + + bus_unregister_notifier(&pci_bus_type, &legacy_notifier); + + list_for_each_entry_safe(slot, tmp, &legacy_list, list) { + list_del(&slot->list); + kobject_del(&slot->kobj); + kobject_put(&slot->kobj); + } +} +module_exit(remove_legacy); + + +MODULE_AUTHOR("Trent Piepho "); +MODULE_DESCRIPTION("Legacy version of the fakephp interface"); +MODULE_LICENSE("GPL"); From 8ffd25454738fb9ed76ee18cc0f180fb0b360401 Mon Sep 17 00:00:00 2001 From: Alex Chiang Date: Fri, 20 Mar 2009 14:56:51 -0600 Subject: [PATCH 143/478] PCI Hotplug: rename legacy_fakephp to fakephp We wanted to replace fakephp wholesale, so rename legacy_fakephp back to fakephp. Yes, this is a silly commit, but it produces a much easier patch to read and review. Signed-off-by: Alex Chiang Signed-off-by: Jesse Barnes --- drivers/pci/hotplug/Makefile | 2 +- drivers/pci/hotplug/{legacy_fakephp.c => fakephp.c} | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename drivers/pci/hotplug/{legacy_fakephp.c => fakephp.c} (100%) diff --git a/drivers/pci/hotplug/Makefile b/drivers/pci/hotplug/Makefile index 3314fe881509..2aa117c8cd87 100644 --- a/drivers/pci/hotplug/Makefile +++ b/drivers/pci/hotplug/Makefile @@ -20,7 +20,7 @@ obj-$(CONFIG_HOTPLUG_PCI_RPA_DLPAR) += rpadlpar_io.o obj-$(CONFIG_HOTPLUG_PCI_SGI) += sgi_hotplug.o # Link this last so it doesn't claim devices that have a real hotplug driver -obj-$(CONFIG_HOTPLUG_PCI_FAKE) += legacy_fakephp.o +obj-$(CONFIG_HOTPLUG_PCI_FAKE) += fakephp.o pci_hotplug-objs := pci_hotplug_core.o diff --git a/drivers/pci/hotplug/legacy_fakephp.c b/drivers/pci/hotplug/fakephp.c similarity index 100% rename from drivers/pci/hotplug/legacy_fakephp.c rename to drivers/pci/hotplug/fakephp.c From f110ca489c9b7cf3f6c9656e383e787f3aee217f Mon Sep 17 00:00:00 2001 From: Alex Chiang Date: Fri, 20 Mar 2009 14:56:56 -0600 Subject: [PATCH 144/478] PCI Hotplug: schedule fakephp for feature removal Now that the PCI core is capable of function-level remove and rescan as well as bus-level rescan, there's no functional need to keep fakephp anymore. We keep it around for userspace compatibility reasons, schedule removal in three years. Signed-off-by: Alex Chiang Signed-off-by: Jesse Barnes --- Documentation/feature-removal-schedule.txt | 33 ++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt index 20d3b94703a4..8851eea06a02 100644 --- a/Documentation/feature-removal-schedule.txt +++ b/Documentation/feature-removal-schedule.txt @@ -335,6 +335,7 @@ Why: In 2.6.18 the Secmark concept was introduced to replace the "compat_net" Secmark, it is time to deprecate the older mechanism and start the process of removing the old code. Who: Paul Moore + --------------------------- What: sysfs ui for changing p4-clockmod parameters @@ -344,3 +345,35 @@ Why: See commits 129f8ae9b1b5be94517da76009ea956e89104ce8 and Removal is subject to fixing any remaining bugs in ACPI which may cause the thermal throttling not to happen at the right time. Who: Dave Jones , Matthew Garrett + +--------------------------- + +What: fakephp and associated sysfs files in /sys/bus/pci/slots/ +When: 2011 +Why: In 2.6.27, the semantics of /sys/bus/pci/slots was redefined to + represent a machine's physical PCI slots. The change in semantics + had userspace implications, as the hotplug core no longer allowed + drivers to create multiple sysfs files per physical slot (required + for multi-function devices, e.g.). fakephp was seen as a developer's + tool only, and its interface changed. Too late, we learned that + there were some users of the fakephp interface. + + In 2.6.30, the original fakephp interface was restored. At the same + time, the PCI core gained the ability that fakephp provided, namely + function-level hot-remove and hot-add. + + Since the PCI core now provides the same functionality, exposed in: + + /sys/bus/pci/rescan + /sys/bus/pci/devices/.../remove + /sys/bus/pci/devices/.../rescan + + there is no functional reason to maintain fakephp as well. + + We will keep the existing module so that 'modprobe fakephp' will + present the old /sys/bus/pci/slots/... interface for compatibility, + but users are urged to migrate their applications to the API above. + + After a reasonable transition period, we will remove the legacy + fakephp interface. +Who: Alex Chiang From 9fa8cfe706f9c20067c042a064999d5825a35330 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Fri, 13 Mar 2009 10:24:59 -0400 Subject: [PATCH 145/478] Btrfs: don't preallocate metadata blocks during btrfs_search_slot In order to avoid doing expensive extent management with tree locks held, btrfs_search_slot will preallocate tree blocks for use by COW without any tree locks held. A later commit moves all of the extent allocation work for COW into a delayed update mechanism, and this preallocation will no longer be required. Signed-off-by: Chris Mason --- fs/btrfs/ctree.c | 100 ++++++++--------------------------------- fs/btrfs/ctree.h | 2 +- fs/btrfs/transaction.c | 4 +- 3 files changed, 21 insertions(+), 85 deletions(-) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 37f31b5529aa..87c90387283b 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -254,18 +254,13 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, * empty_size -- a hint that you plan on doing more cow. This is the size in * bytes the allocator should try to find free next to the block it returns. * This is just a hint and may be ignored by the allocator. - * - * prealloc_dest -- if you have already reserved a destination for the cow, - * this uses that block instead of allocating a new one. - * btrfs_alloc_reserved_extent is used to finish the allocation. */ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, struct extent_buffer *parent, int parent_slot, struct extent_buffer **cow_ret, - u64 search_start, u64 empty_size, - u64 prealloc_dest) + u64 search_start, u64 empty_size) { u64 parent_start; struct extent_buffer *cow; @@ -291,26 +286,10 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, level = btrfs_header_level(buf); nritems = btrfs_header_nritems(buf); - if (prealloc_dest) { - struct btrfs_key ins; - - ins.objectid = prealloc_dest; - ins.offset = buf->len; - ins.type = BTRFS_EXTENT_ITEM_KEY; - - ret = btrfs_alloc_reserved_extent(trans, root, parent_start, - root->root_key.objectid, - trans->transid, level, &ins); - BUG_ON(ret); - cow = btrfs_init_new_buffer(trans, root, prealloc_dest, - buf->len, level); - } else { - cow = btrfs_alloc_free_block(trans, root, buf->len, - parent_start, - root->root_key.objectid, - trans->transid, level, - search_start, empty_size); - } + cow = btrfs_alloc_free_block(trans, root, buf->len, + parent_start, root->root_key.objectid, + trans->transid, level, + search_start, empty_size); if (IS_ERR(cow)) return PTR_ERR(cow); @@ -413,7 +392,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, noinline int btrfs_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, struct extent_buffer *parent, int parent_slot, - struct extent_buffer **cow_ret, u64 prealloc_dest) + struct extent_buffer **cow_ret) { u64 search_start; int ret; @@ -436,7 +415,6 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans, btrfs_header_owner(buf) == root->root_key.objectid && !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { *cow_ret = buf; - WARN_ON(prealloc_dest); return 0; } @@ -447,8 +425,7 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans, btrfs_set_lock_blocking(buf); ret = __btrfs_cow_block(trans, root, buf, parent, - parent_slot, cow_ret, search_start, 0, - prealloc_dest); + parent_slot, cow_ret, search_start, 0); return ret; } @@ -617,7 +594,7 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans, err = __btrfs_cow_block(trans, root, cur, parent, i, &cur, search_start, min(16 * blocksize, - (end_slot - i) * blocksize), 0); + (end_slot - i) * blocksize)); if (err) { btrfs_tree_unlock(cur); free_extent_buffer(cur); @@ -937,7 +914,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, BUG_ON(!child); btrfs_tree_lock(child); btrfs_set_lock_blocking(child); - ret = btrfs_cow_block(trans, root, child, mid, 0, &child, 0); + ret = btrfs_cow_block(trans, root, child, mid, 0, &child); BUG_ON(ret); spin_lock(&root->node_lock); @@ -979,7 +956,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, btrfs_tree_lock(left); btrfs_set_lock_blocking(left); wret = btrfs_cow_block(trans, root, left, - parent, pslot - 1, &left, 0); + parent, pslot - 1, &left); if (wret) { ret = wret; goto enospc; @@ -990,7 +967,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, btrfs_tree_lock(right); btrfs_set_lock_blocking(right); wret = btrfs_cow_block(trans, root, right, - parent, pslot + 1, &right, 0); + parent, pslot + 1, &right); if (wret) { ret = wret; goto enospc; @@ -1171,7 +1148,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, wret = 1; } else { ret = btrfs_cow_block(trans, root, left, parent, - pslot - 1, &left, 0); + pslot - 1, &left); if (ret) wret = 1; else { @@ -1222,7 +1199,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, } else { ret = btrfs_cow_block(trans, root, right, parent, pslot + 1, - &right, 0); + &right); if (ret) wret = 1; else { @@ -1492,7 +1469,6 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root u8 lowest_level = 0; u64 blocknr; u64 gen; - struct btrfs_key prealloc_block; lowest_level = p->lowest_level; WARN_ON(lowest_level && ins_len > 0); @@ -1501,8 +1477,6 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root if (ins_len < 0) lowest_unlock = 2; - prealloc_block.objectid = 0; - again: if (p->skip_locking) b = btrfs_root_node(root); @@ -1529,44 +1503,11 @@ again: !btrfs_header_flag(b, BTRFS_HEADER_FLAG_WRITTEN)) { goto cow_done; } - - /* ok, we have to cow, is our old prealloc the right - * size? - */ - if (prealloc_block.objectid && - prealloc_block.offset != b->len) { - btrfs_release_path(root, p); - btrfs_free_reserved_extent(root, - prealloc_block.objectid, - prealloc_block.offset); - prealloc_block.objectid = 0; - goto again; - } - - /* - * for higher level blocks, try not to allocate blocks - * with the block and the parent locks held. - */ - if (level > 0 && !prealloc_block.objectid) { - u32 size = b->len; - u64 hint = b->start; - - btrfs_release_path(root, p); - ret = btrfs_reserve_extent(trans, root, - size, size, 0, - hint, (u64)-1, - &prealloc_block, 0); - BUG_ON(ret); - goto again; - } - btrfs_set_path_blocking(p); wret = btrfs_cow_block(trans, root, b, p->nodes[level + 1], - p->slots[level + 1], - &b, prealloc_block.objectid); - prealloc_block.objectid = 0; + p->slots[level + 1], &b); if (wret) { free_extent_buffer(b); ret = wret; @@ -1743,11 +1684,6 @@ done: * from here on, so for now just mark it as blocking */ btrfs_set_path_blocking(p); - if (prealloc_block.objectid) { - btrfs_free_reserved_extent(root, - prealloc_block.objectid, - prealloc_block.offset); - } return ret; } @@ -1768,7 +1704,7 @@ int btrfs_merge_path(struct btrfs_trans_handle *trans, int ret; eb = btrfs_lock_root_node(root); - ret = btrfs_cow_block(trans, root, eb, NULL, 0, &eb, 0); + ret = btrfs_cow_block(trans, root, eb, NULL, 0, &eb); BUG_ON(ret); btrfs_set_lock_blocking(eb); @@ -1826,7 +1762,7 @@ int btrfs_merge_path(struct btrfs_trans_handle *trans, } ret = btrfs_cow_block(trans, root, eb, parent, slot, - &eb, 0); + &eb); BUG_ON(ret); if (root->root_key.objectid == @@ -2377,7 +2313,7 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root /* cow and double check */ ret = btrfs_cow_block(trans, root, right, upper, - slot + 1, &right, 0); + slot + 1, &right); if (ret) goto out_unlock; @@ -2576,7 +2512,7 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root /* cow and double check */ ret = btrfs_cow_block(trans, root, left, - path->nodes[1], slot - 1, &left, 0); + path->nodes[1], slot - 1, &left); if (ret) { /* we hit -ENOSPC, but it isn't fatal here */ ret = 1; diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 5e1d4e30e9d8..3a37ba7a8d65 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1838,7 +1838,7 @@ int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key, int btrfs_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, struct extent_buffer *parent, int parent_slot, - struct extent_buffer **cow_ret, u64 prealloc_dest); + struct extent_buffer **cow_ret); int btrfs_copy_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 4112d53d4f4d..d638c54d39e9 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -463,7 +463,7 @@ int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans, btrfs_extent_post_op(trans, fs_info->tree_root); eb = btrfs_lock_root_node(fs_info->tree_root); - btrfs_cow_block(trans, fs_info->tree_root, eb, NULL, 0, &eb, 0); + btrfs_cow_block(trans, fs_info->tree_root, eb, NULL, 0, &eb); btrfs_tree_unlock(eb); free_extent_buffer(eb); @@ -766,7 +766,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); old = btrfs_lock_root_node(root); - btrfs_cow_block(trans, root, old, NULL, 0, &old, 0); + btrfs_cow_block(trans, root, old, NULL, 0, &old); btrfs_copy_root(trans, root, old, &tmp, objectid); btrfs_tree_unlock(old); From 56bec294dea971335d4466b30f2d959f28f6e36d Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Fri, 13 Mar 2009 10:10:06 -0400 Subject: [PATCH 146/478] Btrfs: do extent allocation and reference count updates in the background The extent allocation tree maintains a reference count and full back reference information for every extent allocated in the filesystem. For subvolume and snapshot trees, every time a block goes through COW, the new copy of the block adds a reference on every block it points to. If a btree node points to 150 leaves, then the COW code needs to go and add backrefs on 150 different extents, which might be spread all over the extent allocation tree. These updates currently happen during btrfs_cow_block, and most COWs happen during btrfs_search_slot. btrfs_search_slot has locks held on both the parent and the node we are COWing, and so we really want to avoid IO during the COW if we can. This commit adds an rbtree of pending reference count updates and extent allocations. The tree is ordered by byte number of the extent and byte number of the parent for the back reference. The tree allows us to: 1) Modify back references in something close to disk order, reducing seeks 2) Significantly reduce the number of modifications made as block pointers are balanced around 3) Do all of the extent insertion and back reference modifications outside of the performance critical btrfs_search_slot code. #3 has the added benefit of greatly reducing the btrfs stack footprint. The extent allocation tree modifications are done without the deep (and somewhat recursive) call chains used in the past. These delayed back reference updates must be done before the transaction commits, and so the rbtree is tied to the transaction. Throttling is implemented to help keep the queue of backrefs at a reasonable size. Since there was a similar mechanism in place for the extent tree extents, that is removed and replaced by the delayed reference tree. Yan Zheng helped review and fixup this code. Signed-off-by: Chris Mason --- fs/btrfs/Makefile | 2 +- fs/btrfs/ctree.c | 3 +- fs/btrfs/ctree.h | 12 +- fs/btrfs/delayed-ref.c | 585 ++++++++++++++++ fs/btrfs/delayed-ref.h | 182 +++++ fs/btrfs/disk-io.c | 29 +- fs/btrfs/extent-tree.c | 1500 +++++++++++----------------------------- fs/btrfs/file.c | 6 +- fs/btrfs/transaction.c | 54 +- fs/btrfs/transaction.h | 3 + fs/btrfs/tree-defrag.c | 2 - 11 files changed, 1236 insertions(+), 1142 deletions(-) create mode 100644 fs/btrfs/delayed-ref.c create mode 100644 fs/btrfs/delayed-ref.h diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index d2cf5a54a4b8..9adf5e4f7e96 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -8,7 +8,7 @@ btrfs-y := super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \ extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \ ref-cache.o export.o tree-log.o acl.o free-space-cache.o zlib.o \ - compression.o + compression.o delayed-ref.o else # Normal Makefile diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 87c90387283b..bebc9fd16665 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -922,6 +922,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, spin_unlock(&root->node_lock); ret = btrfs_update_extent_ref(trans, root, child->start, + child->len, mid->start, child->start, root->root_key.objectid, trans->transid, level - 1); @@ -2075,7 +2076,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans, spin_unlock(&root->node_lock); ret = btrfs_update_extent_ref(trans, root, lower->start, - lower->start, c->start, + lower->len, lower->start, c->start, root->root_key.objectid, trans->transid, level - 1); BUG_ON(ret); diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 3a37ba7a8d65..ced5fd85dc36 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -688,8 +688,6 @@ struct btrfs_fs_info { struct rb_root block_group_cache_tree; struct extent_io_tree pinned_extents; - struct extent_io_tree pending_del; - struct extent_io_tree extent_ins; /* logical->physical extent mapping */ struct btrfs_mapping_tree mapping_tree; @@ -717,7 +715,6 @@ struct btrfs_fs_info { struct mutex tree_log_mutex; struct mutex transaction_kthread_mutex; struct mutex cleaner_mutex; - struct mutex extent_ins_mutex; struct mutex pinned_mutex; struct mutex chunk_mutex; struct mutex drop_mutex; @@ -1704,18 +1701,15 @@ static inline struct dentry *fdentry(struct file *file) } /* extent-tree.c */ +int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, + struct btrfs_root *root, unsigned long count); int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len); -int btrfs_lookup_extent_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 bytenr, - u64 num_bytes, u32 *refs); int btrfs_update_pinned_extents(struct btrfs_root *root, u64 bytenr, u64 num, int pin); int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *leaf); int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 objectid, u64 bytenr); -int btrfs_extent_post_op(struct btrfs_trans_handle *trans, - struct btrfs_root *root); int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy); struct btrfs_block_group_cache *btrfs_lookup_block_group( struct btrfs_fs_info *info, @@ -1777,7 +1771,7 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, u64 root_objectid, u64 ref_generation, u64 owner_objectid); int btrfs_update_extent_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 bytenr, + struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 orig_parent, u64 parent, u64 root_objectid, u64 ref_generation, u64 owner_objectid); diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c new file mode 100644 index 000000000000..874565a1f634 --- /dev/null +++ b/fs/btrfs/delayed-ref.c @@ -0,0 +1,585 @@ +/* + * Copyright (C) 2009 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include +#include +#include +#include "ctree.h" +#include "delayed-ref.h" +#include "transaction.h" + +/* + * delayed back reference update tracking. For subvolume trees + * we queue up extent allocations and backref maintenance for + * delayed processing. This avoids deep call chains where we + * add extents in the middle of btrfs_search_slot, and it allows + * us to buffer up frequently modified backrefs in an rb tree instead + * of hammering updates on the extent allocation tree. + * + * Right now this code is only used for reference counted trees, but + * the long term goal is to get rid of the similar code for delayed + * extent tree modifications. + */ + +/* + * entries in the rb tree are ordered by the byte number of the extent + * and by the byte number of the parent block. + */ +static int comp_entry(struct btrfs_delayed_ref_node *ref, + u64 bytenr, u64 parent) +{ + if (bytenr < ref->bytenr) + return -1; + if (bytenr > ref->bytenr) + return 1; + if (parent < ref->parent) + return -1; + if (parent > ref->parent) + return 1; + return 0; +} + +/* + * insert a new ref into the rbtree. This returns any existing refs + * for the same (bytenr,parent) tuple, or NULL if the new node was properly + * inserted. + */ +static struct btrfs_delayed_ref_node *tree_insert(struct rb_root *root, + u64 bytenr, u64 parent, + struct rb_node *node) +{ + struct rb_node **p = &root->rb_node; + struct rb_node *parent_node = NULL; + struct btrfs_delayed_ref_node *entry; + int cmp; + + while (*p) { + parent_node = *p; + entry = rb_entry(parent_node, struct btrfs_delayed_ref_node, + rb_node); + + cmp = comp_entry(entry, bytenr, parent); + if (cmp < 0) + p = &(*p)->rb_left; + else if (cmp > 0) + p = &(*p)->rb_right; + else + return entry; + } + + entry = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); + rb_link_node(node, parent_node, p); + rb_insert_color(node, root); + return NULL; +} + +/* + * find an entry based on (bytenr,parent). This returns the delayed + * ref if it was able to find one, or NULL if nothing was in that spot + */ +static struct btrfs_delayed_ref_node *tree_search(struct rb_root *root, + u64 bytenr, u64 parent) +{ + struct rb_node *n = root->rb_node; + struct btrfs_delayed_ref_node *entry; + int cmp; + + while (n) { + entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node); + WARN_ON(!entry->in_tree); + + cmp = comp_entry(entry, bytenr, parent); + if (cmp < 0) + n = n->rb_left; + else if (cmp > 0) + n = n->rb_right; + else + return entry; + } + return NULL; +} + +/* + * Locking on delayed refs is done by taking a lock on the head node, + * which has the (impossible) parent id of (u64)-1. Once a lock is held + * on the head node, you're allowed (and required) to process all the + * delayed refs for a given byte number in the tree. + * + * This will walk forward in the rbtree until it finds a head node it + * is able to lock. It might not lock the delayed ref you asked for, + * and so it will return the one it did lock in next_ret and return 0. + * + * If no locks are taken, next_ret is set to null and 1 is returned. This + * means there are no more unlocked head nodes in the rbtree. + */ +int btrfs_lock_delayed_ref(struct btrfs_trans_handle *trans, + struct btrfs_delayed_ref_node *ref, + struct btrfs_delayed_ref_head **next_ret) +{ + struct rb_node *node; + struct btrfs_delayed_ref_head *head; + int ret = 0; + + while (1) { + if (btrfs_delayed_ref_is_head(ref)) { + head = btrfs_delayed_node_to_head(ref); + if (mutex_trylock(&head->mutex)) { + *next_ret = head; + ret = 0; + break; + } + } + node = rb_next(&ref->rb_node); + if (!node) { + ret = 1; + *next_ret = NULL; + break; + } + ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); + } + return ret; +} + +/* + * This checks to see if there are any delayed refs in the + * btree for a given bytenr. It returns one if it finds any + * and zero otherwise. + * + * If it only finds a head node, it returns 0. + * + * The idea is to use this when deciding if you can safely delete an + * extent from the extent allocation tree. There may be a pending + * ref in the rbtree that adds or removes references, so as long as this + * returns one you need to leave the BTRFS_EXTENT_ITEM in the extent + * allocation tree. + */ +int btrfs_delayed_ref_pending(struct btrfs_trans_handle *trans, u64 bytenr) +{ + struct btrfs_delayed_ref_node *ref; + struct btrfs_delayed_ref_root *delayed_refs; + struct rb_node *prev_node; + int ret = 0; + + delayed_refs = &trans->transaction->delayed_refs; + spin_lock(&delayed_refs->lock); + + ref = tree_search(&delayed_refs->root, bytenr, (u64)-1); + if (ref) { + prev_node = rb_prev(&ref->rb_node); + if (!prev_node) + goto out; + ref = rb_entry(prev_node, struct btrfs_delayed_ref_node, + rb_node); + if (ref->bytenr == bytenr) + ret = 1; + } +out: + spin_unlock(&delayed_refs->lock); + return ret; +} + +/* + * helper function to lookup reference count + * + * the head node for delayed ref is used to store the sum of all the + * reference count modifications queued up in the rbtree. This way you + * can check to see what the reference count would be if all of the + * delayed refs are processed. + */ +int btrfs_lookup_extent_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 bytenr, + u64 num_bytes, u32 *refs) +{ + struct btrfs_delayed_ref_node *ref; + struct btrfs_delayed_ref_head *head; + struct btrfs_delayed_ref_root *delayed_refs; + struct btrfs_path *path; + struct extent_buffer *leaf; + struct btrfs_extent_item *ei; + struct btrfs_key key; + u32 num_refs; + int ret; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = bytenr; + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = num_bytes; + delayed_refs = &trans->transaction->delayed_refs; +again: + ret = btrfs_search_slot(trans, root->fs_info->extent_root, + &key, path, 0, 0); + if (ret < 0) + goto out; + + if (ret == 0) { + leaf = path->nodes[0]; + ei = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_extent_item); + num_refs = btrfs_extent_refs(leaf, ei); + } else { + num_refs = 0; + ret = 0; + } + + spin_lock(&delayed_refs->lock); + ref = tree_search(&delayed_refs->root, bytenr, (u64)-1); + if (ref) { + head = btrfs_delayed_node_to_head(ref); + if (mutex_trylock(&head->mutex)) { + num_refs += ref->ref_mod; + mutex_unlock(&head->mutex); + *refs = num_refs; + goto out; + } + + atomic_inc(&ref->refs); + spin_unlock(&delayed_refs->lock); + + btrfs_release_path(root->fs_info->extent_root, path); + + mutex_lock(&head->mutex); + mutex_unlock(&head->mutex); + btrfs_put_delayed_ref(ref); + goto again; + } else { + *refs = num_refs; + } +out: + spin_unlock(&delayed_refs->lock); + btrfs_free_path(path); + return ret; +} + +/* + * helper function to update an extent delayed ref in the + * rbtree. existing and update must both have the same + * bytenr and parent + * + * This may free existing if the update cancels out whatever + * operation it was doing. + */ +static noinline void +update_existing_ref(struct btrfs_trans_handle *trans, + struct btrfs_delayed_ref_root *delayed_refs, + struct btrfs_delayed_ref_node *existing, + struct btrfs_delayed_ref_node *update) +{ + struct btrfs_delayed_ref *existing_ref; + struct btrfs_delayed_ref *ref; + + existing_ref = btrfs_delayed_node_to_ref(existing); + ref = btrfs_delayed_node_to_ref(update); + + if (ref->pin) + existing_ref->pin = 1; + + if (ref->action != existing_ref->action) { + /* + * this is effectively undoing either an add or a + * drop. We decrement the ref_mod, and if it goes + * down to zero we just delete the entry without + * every changing the extent allocation tree. + */ + existing->ref_mod--; + if (existing->ref_mod == 0) { + rb_erase(&existing->rb_node, + &delayed_refs->root); + existing->in_tree = 0; + btrfs_put_delayed_ref(existing); + delayed_refs->num_entries--; + if (trans->delayed_ref_updates) + trans->delayed_ref_updates--; + } + } else { + if (existing_ref->action == BTRFS_ADD_DELAYED_REF) { + /* if we're adding refs, make sure all the + * details match up. The extent could + * have been totally freed and reallocated + * by a different owner before the delayed + * ref entries were removed. + */ + existing_ref->owner_objectid = ref->owner_objectid; + existing_ref->generation = ref->generation; + existing_ref->root = ref->root; + existing->num_bytes = update->num_bytes; + } + /* + * the action on the existing ref matches + * the action on the ref we're trying to add. + * Bump the ref_mod by one so the backref that + * is eventually added/removed has the correct + * reference count + */ + existing->ref_mod += update->ref_mod; + } +} + +/* + * helper function to update the accounting in the head ref + * existing and update must have the same bytenr + */ +static noinline void +update_existing_head_ref(struct btrfs_delayed_ref_node *existing, + struct btrfs_delayed_ref_node *update) +{ + struct btrfs_delayed_ref_head *existing_ref; + struct btrfs_delayed_ref_head *ref; + + existing_ref = btrfs_delayed_node_to_head(existing); + ref = btrfs_delayed_node_to_head(update); + + if (ref->must_insert_reserved) { + /* if the extent was freed and then + * reallocated before the delayed ref + * entries were processed, we can end up + * with an existing head ref without + * the must_insert_reserved flag set. + * Set it again here + */ + existing_ref->must_insert_reserved = ref->must_insert_reserved; + + /* + * update the num_bytes so we make sure the accounting + * is done correctly + */ + existing->num_bytes = update->num_bytes; + + } + + /* + * update the reference mod on the head to reflect this new operation + */ + existing->ref_mod += update->ref_mod; +} + +/* + * helper function to actually insert a delayed ref into the rbtree. + * this does all the dirty work in terms of maintaining the correct + * overall modification count in the head node and properly dealing + * with updating existing nodes as new modifications are queued. + */ +static noinline int __btrfs_add_delayed_ref(struct btrfs_trans_handle *trans, + struct btrfs_delayed_ref_node *ref, + u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root, + u64 ref_generation, u64 owner_objectid, int action, + int pin) +{ + struct btrfs_delayed_ref_node *existing; + struct btrfs_delayed_ref *full_ref; + struct btrfs_delayed_ref_head *head_ref; + struct btrfs_delayed_ref_root *delayed_refs; + int count_mod = 1; + int must_insert_reserved = 0; + + /* + * the head node stores the sum of all the mods, so dropping a ref + * should drop the sum in the head node by one. + */ + if (parent == (u64)-1 && action == BTRFS_DROP_DELAYED_REF) + count_mod = -1; + + /* + * BTRFS_ADD_DELAYED_EXTENT means that we need to update + * the reserved accounting when the extent is finally added, or + * if a later modification deletes the delayed ref without ever + * inserting the extent into the extent allocation tree. + * ref->must_insert_reserved is the flag used to record + * that accounting mods are required. + * + * Once we record must_insert_reserved, switch the action to + * BTRFS_ADD_DELAYED_REF because other special casing is not required. + */ + if (action == BTRFS_ADD_DELAYED_EXTENT) { + must_insert_reserved = 1; + action = BTRFS_ADD_DELAYED_REF; + } else { + must_insert_reserved = 0; + } + + + delayed_refs = &trans->transaction->delayed_refs; + + /* first set the basic ref node struct up */ + atomic_set(&ref->refs, 1); + ref->bytenr = bytenr; + ref->parent = parent; + ref->ref_mod = count_mod; + ref->in_tree = 1; + ref->num_bytes = num_bytes; + + if (btrfs_delayed_ref_is_head(ref)) { + head_ref = btrfs_delayed_node_to_head(ref); + head_ref->must_insert_reserved = must_insert_reserved; + mutex_init(&head_ref->mutex); + } else { + full_ref = btrfs_delayed_node_to_ref(ref); + full_ref->root = ref_root; + full_ref->generation = ref_generation; + full_ref->owner_objectid = owner_objectid; + full_ref->pin = pin; + full_ref->action = action; + } + + existing = tree_insert(&delayed_refs->root, bytenr, + parent, &ref->rb_node); + + if (existing) { + if (btrfs_delayed_ref_is_head(ref)) + update_existing_head_ref(existing, ref); + else + update_existing_ref(trans, delayed_refs, existing, ref); + + /* + * we've updated the existing ref, free the newly + * allocated ref + */ + kfree(ref); + } else { + delayed_refs->num_entries++; + trans->delayed_ref_updates++; + } + return 0; +} + +/* + * add a delayed ref to the tree. This does all of the accounting required + * to make sure the delayed ref is eventually processed before this + * transaction commits. + */ +int btrfs_add_delayed_ref(struct btrfs_trans_handle *trans, + u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root, + u64 ref_generation, u64 owner_objectid, int action, + int pin) +{ + struct btrfs_delayed_ref *ref; + struct btrfs_delayed_ref_head *head_ref; + struct btrfs_delayed_ref_root *delayed_refs; + int ret; + + ref = kmalloc(sizeof(*ref), GFP_NOFS); + if (!ref) + return -ENOMEM; + + /* + * the parent = 0 case comes from cases where we don't actually + * know the parent yet. It will get updated later via a add/drop + * pair. + */ + if (parent == 0) + parent = bytenr; + + head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS); + if (!head_ref) { + kfree(ref); + return -ENOMEM; + } + delayed_refs = &trans->transaction->delayed_refs; + spin_lock(&delayed_refs->lock); + + /* + * insert both the head node and the new ref without dropping + * the spin lock + */ + ret = __btrfs_add_delayed_ref(trans, &head_ref->node, bytenr, num_bytes, + (u64)-1, 0, 0, 0, action, pin); + BUG_ON(ret); + + ret = __btrfs_add_delayed_ref(trans, &ref->node, bytenr, num_bytes, + parent, ref_root, ref_generation, + owner_objectid, action, pin); + BUG_ON(ret); + spin_unlock(&delayed_refs->lock); + return 0; +} + +/* + * add a delayed ref to the tree. This does all of the accounting required + * to make sure the delayed ref is eventually processed before this + * transaction commits. + * + * The main point of this call is to add and remove a backreference in a single + * shot, taking the lock only once, and only searching for the head node once. + * + * It is the same as doing a ref add and delete in two separate calls. + */ +int btrfs_update_delayed_ref(struct btrfs_trans_handle *trans, + u64 bytenr, u64 num_bytes, u64 orig_parent, + u64 parent, u64 orig_ref_root, u64 ref_root, + u64 orig_ref_generation, u64 ref_generation, + u64 owner_objectid, int pin) +{ + struct btrfs_delayed_ref *ref; + struct btrfs_delayed_ref *old_ref; + struct btrfs_delayed_ref_head *head_ref; + struct btrfs_delayed_ref_root *delayed_refs; + int ret; + + ref = kmalloc(sizeof(*ref), GFP_NOFS); + if (!ref) + return -ENOMEM; + + old_ref = kmalloc(sizeof(*old_ref), GFP_NOFS); + if (!old_ref) { + kfree(ref); + return -ENOMEM; + } + + /* + * the parent = 0 case comes from cases where we don't actually + * know the parent yet. It will get updated later via a add/drop + * pair. + */ + if (parent == 0) + parent = bytenr; + if (orig_parent == 0) + orig_parent = bytenr; + + head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS); + if (!head_ref) { + kfree(ref); + kfree(old_ref); + return -ENOMEM; + } + delayed_refs = &trans->transaction->delayed_refs; + spin_lock(&delayed_refs->lock); + + /* + * insert both the head node and the new ref without dropping + * the spin lock + */ + ret = __btrfs_add_delayed_ref(trans, &head_ref->node, bytenr, num_bytes, + (u64)-1, 0, 0, 0, + BTRFS_ADD_DELAYED_REF, 0); + BUG_ON(ret); + + ret = __btrfs_add_delayed_ref(trans, &ref->node, bytenr, num_bytes, + parent, ref_root, ref_generation, + owner_objectid, BTRFS_ADD_DELAYED_REF, 0); + BUG_ON(ret); + + ret = __btrfs_add_delayed_ref(trans, &old_ref->node, bytenr, num_bytes, + orig_parent, orig_ref_root, + orig_ref_generation, owner_objectid, + BTRFS_DROP_DELAYED_REF, pin); + BUG_ON(ret); + spin_unlock(&delayed_refs->lock); + return 0; +} diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h new file mode 100644 index 000000000000..37919e5c007f --- /dev/null +++ b/fs/btrfs/delayed-ref.h @@ -0,0 +1,182 @@ +/* + * Copyright (C) 2008 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ +#ifndef __DELAYED_REF__ +#define __DELAYED_REF__ + +/* these are the possible values of struct btrfs_delayed_ref->action */ +#define BTRFS_ADD_DELAYED_REF 1 /* add one backref to the tree */ +#define BTRFS_DROP_DELAYED_REF 2 /* delete one backref from the tree */ +#define BTRFS_ADD_DELAYED_EXTENT 3 /* record a full extent allocation */ + +struct btrfs_delayed_ref_node { + struct rb_node rb_node; + + /* the starting bytenr of the extent */ + u64 bytenr; + + /* the parent our backref will point to */ + u64 parent; + + /* the size of the extent */ + u64 num_bytes; + + /* ref count on this data structure */ + atomic_t refs; + + /* + * how many refs is this entry adding or deleting. For + * head refs, this may be a negative number because it is keeping + * track of the total mods done to the reference count. + * For individual refs, this will always be a positive number + * + * It may be more than one, since it is possible for a single + * parent to have more than one ref on an extent + */ + int ref_mod; + + /* is this node still in the rbtree? */ + unsigned int in_tree:1; +}; + +/* + * the head refs are used to hold a lock on a given extent, which allows us + * to make sure that only one process is running the delayed refs + * at a time for a single extent. They also store the sum of all the + * reference count modifications we've queued up. + */ +struct btrfs_delayed_ref_head { + struct btrfs_delayed_ref_node node; + + /* + * the mutex is held while running the refs, and it is also + * held when checking the sum of reference modifications. + */ + struct mutex mutex; + + /* + * when a new extent is allocated, it is just reserved in memory + * The actual extent isn't inserted into the extent allocation tree + * until the delayed ref is processed. must_insert_reserved is + * used to flag a delayed ref so the accounting can be updated + * when a full insert is done. + * + * It is possible the extent will be freed before it is ever + * inserted into the extent allocation tree. In this case + * we need to update the in ram accounting to properly reflect + * the free has happened. + */ + unsigned int must_insert_reserved:1; +}; + +struct btrfs_delayed_ref { + struct btrfs_delayed_ref_node node; + + /* the root objectid our ref will point to */ + u64 root; + + /* the generation for the backref */ + u64 generation; + + /* owner_objectid of the backref */ + u64 owner_objectid; + + /* operation done by this entry in the rbtree */ + u8 action; + + /* if pin == 1, when the extent is freed it will be pinned until + * transaction commit + */ + unsigned int pin:1; +}; + +struct btrfs_delayed_ref_root { + struct rb_root root; + + /* this spin lock protects the rbtree and the entries inside */ + spinlock_t lock; + + /* how many delayed ref updates we've queued, used by the + * throttling code + */ + unsigned long num_entries; + + /* + * set when the tree is flushing before a transaction commit, + * used by the throttling code to decide if new updates need + * to be run right away + */ + int flushing; +}; + +static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref) +{ + WARN_ON(atomic_read(&ref->refs) == 0); + if (atomic_dec_and_test(&ref->refs)) { + WARN_ON(ref->in_tree); + kfree(ref); + } +} + +int btrfs_add_delayed_ref(struct btrfs_trans_handle *trans, + u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root, + u64 ref_generation, u64 owner_objectid, int action, + int pin); + +struct btrfs_delayed_ref * +btrfs_find_delayed_ref(struct btrfs_trans_handle *trans, u64 bytenr, + u64 parent); +int btrfs_delayed_ref_pending(struct btrfs_trans_handle *trans, u64 bytenr); +int btrfs_lock_delayed_ref(struct btrfs_trans_handle *trans, + struct btrfs_delayed_ref_node *ref, + struct btrfs_delayed_ref_head **next_ret); +int btrfs_lookup_extent_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 bytenr, + u64 num_bytes, u32 *refs); +int btrfs_update_delayed_ref(struct btrfs_trans_handle *trans, + u64 bytenr, u64 num_bytes, u64 orig_parent, + u64 parent, u64 orig_ref_root, u64 ref_root, + u64 orig_ref_generation, u64 ref_generation, + u64 owner_objectid, int pin); +/* + * a node might live in a head or a regular ref, this lets you + * test for the proper type to use. + */ +static int btrfs_delayed_ref_is_head(struct btrfs_delayed_ref_node *node) +{ + return node->parent == (u64)-1; +} + +/* + * helper functions to cast a node into its container + */ +static inline struct btrfs_delayed_ref * +btrfs_delayed_node_to_ref(struct btrfs_delayed_ref_node *node) +{ + WARN_ON(btrfs_delayed_ref_is_head(node)); + return container_of(node, struct btrfs_delayed_ref, node); + +} + +static inline struct btrfs_delayed_ref_head * +btrfs_delayed_node_to_head(struct btrfs_delayed_ref_node *node) +{ + WARN_ON(!btrfs_delayed_ref_is_head(node)); + return container_of(node, struct btrfs_delayed_ref_head, node); + +} +#endif diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 3e18175248e0..4f43e227a297 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1458,6 +1458,7 @@ static int transaction_kthread(void *arg) struct btrfs_root *root = arg; struct btrfs_trans_handle *trans; struct btrfs_transaction *cur; + struct btrfs_fs_info *info = root->fs_info; unsigned long now; unsigned long delay; int ret; @@ -1471,12 +1472,6 @@ static int transaction_kthread(void *arg) vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE); mutex_lock(&root->fs_info->transaction_kthread_mutex); - if (root->fs_info->total_ref_cache_size > 20 * 1024 * 1024) { - printk(KERN_INFO "btrfs: total reference cache " - "size %llu\n", - root->fs_info->total_ref_cache_size); - } - mutex_lock(&root->fs_info->trans_mutex); cur = root->fs_info->running_transaction; if (!cur) { @@ -1486,13 +1481,30 @@ static int transaction_kthread(void *arg) now = get_seconds(); if (now < cur->start_time || now - cur->start_time < 30) { + unsigned long num_delayed; + num_delayed = cur->delayed_refs.num_entries; mutex_unlock(&root->fs_info->trans_mutex); delay = HZ * 5; + + /* + * we may have been woken up early to start + * processing the delayed extent ref updates + * If so, run some of them and then loop around again + * to see if we need to force a commit + */ + if (num_delayed > 64) { + mutex_unlock(&info->transaction_kthread_mutex); + trans = btrfs_start_transaction(root, 1); + btrfs_run_delayed_refs(trans, root, 256); + btrfs_end_transaction(trans, root); + continue; + } goto sleep; } mutex_unlock(&root->fs_info->trans_mutex); trans = btrfs_start_transaction(root, 1); ret = btrfs_commit_transaction(trans, root); + sleep: wake_up_process(root->fs_info->cleaner_kthread); mutex_unlock(&root->fs_info->transaction_kthread_mutex); @@ -1611,10 +1623,6 @@ struct btrfs_root *open_ctree(struct super_block *sb, extent_io_tree_init(&fs_info->pinned_extents, fs_info->btree_inode->i_mapping, GFP_NOFS); - extent_io_tree_init(&fs_info->pending_del, - fs_info->btree_inode->i_mapping, GFP_NOFS); - extent_io_tree_init(&fs_info->extent_ins, - fs_info->btree_inode->i_mapping, GFP_NOFS); fs_info->do_barriers = 1; INIT_LIST_HEAD(&fs_info->dead_reloc_roots); @@ -1629,7 +1637,6 @@ struct btrfs_root *open_ctree(struct super_block *sb, mutex_init(&fs_info->trans_mutex); mutex_init(&fs_info->tree_log_mutex); mutex_init(&fs_info->drop_mutex); - mutex_init(&fs_info->extent_ins_mutex); mutex_init(&fs_info->pinned_mutex); mutex_init(&fs_info->chunk_mutex); mutex_init(&fs_info->transaction_kthread_mutex); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index fefe83ad2059..9b5da2b013e4 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -49,10 +49,13 @@ struct pending_extent_op { int del; }; -static int finish_current_insert(struct btrfs_trans_handle *trans, - struct btrfs_root *extent_root, int all); -static int del_pending_extents(struct btrfs_trans_handle *trans, - struct btrfs_root *extent_root, int all); +static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 parent, + u64 root_objectid, u64 ref_generation, + u64 owner, struct btrfs_key *ins, + int ref_mod); +static int update_reserved_extents(struct btrfs_root *root, + u64 bytenr, u64 num, int reserve); static int pin_down_bytes(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, int is_data); @@ -60,6 +63,12 @@ static int update_block_group(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, int alloc, int mark_free); +static noinline int __btrfs_free_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u64 num_bytes, u64 parent, + u64 root_objectid, u64 ref_generation, + u64 owner_objectid, int pin, + int ref_to_drop); static int do_chunk_alloc(struct btrfs_trans_handle *trans, struct btrfs_root *extent_root, u64 alloc_bytes, @@ -554,262 +563,13 @@ out: return ret; } -/* - * updates all the backrefs that are pending on update_list for the - * extent_root - */ -static noinline int update_backrefs(struct btrfs_trans_handle *trans, - struct btrfs_root *extent_root, - struct btrfs_path *path, - struct list_head *update_list) -{ - struct btrfs_key key; - struct btrfs_extent_ref *ref; - struct btrfs_fs_info *info = extent_root->fs_info; - struct pending_extent_op *op; - struct extent_buffer *leaf; - int ret = 0; - struct list_head *cur = update_list->next; - u64 ref_objectid; - u64 ref_root = extent_root->root_key.objectid; - - op = list_entry(cur, struct pending_extent_op, list); - -search: - key.objectid = op->bytenr; - key.type = BTRFS_EXTENT_REF_KEY; - key.offset = op->orig_parent; - - ret = btrfs_search_slot(trans, extent_root, &key, path, 0, 1); - BUG_ON(ret); - - leaf = path->nodes[0]; - -loop: - ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref); - - ref_objectid = btrfs_ref_objectid(leaf, ref); - - if (btrfs_ref_root(leaf, ref) != ref_root || - btrfs_ref_generation(leaf, ref) != op->orig_generation || - (ref_objectid != op->level && - ref_objectid != BTRFS_MULTIPLE_OBJECTIDS)) { - printk(KERN_ERR "btrfs couldn't find %llu, parent %llu, " - "root %llu, owner %u\n", - (unsigned long long)op->bytenr, - (unsigned long long)op->orig_parent, - (unsigned long long)ref_root, op->level); - btrfs_print_leaf(extent_root, leaf); - BUG(); - } - - key.objectid = op->bytenr; - key.offset = op->parent; - key.type = BTRFS_EXTENT_REF_KEY; - ret = btrfs_set_item_key_safe(trans, extent_root, path, &key); - BUG_ON(ret); - ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref); - btrfs_set_ref_generation(leaf, ref, op->generation); - - cur = cur->next; - - list_del_init(&op->list); - unlock_extent(&info->extent_ins, op->bytenr, - op->bytenr + op->num_bytes - 1, GFP_NOFS); - kfree(op); - - if (cur == update_list) { - btrfs_mark_buffer_dirty(path->nodes[0]); - btrfs_release_path(extent_root, path); - goto out; - } - - op = list_entry(cur, struct pending_extent_op, list); - - path->slots[0]++; - while (path->slots[0] < btrfs_header_nritems(leaf)) { - btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); - if (key.objectid == op->bytenr && - key.type == BTRFS_EXTENT_REF_KEY) - goto loop; - path->slots[0]++; - } - - btrfs_mark_buffer_dirty(path->nodes[0]); - btrfs_release_path(extent_root, path); - goto search; - -out: - return 0; -} - -static noinline int insert_extents(struct btrfs_trans_handle *trans, - struct btrfs_root *extent_root, - struct btrfs_path *path, - struct list_head *insert_list, int nr) -{ - struct btrfs_key *keys; - u32 *data_size; - struct pending_extent_op *op; - struct extent_buffer *leaf; - struct list_head *cur = insert_list->next; - struct btrfs_fs_info *info = extent_root->fs_info; - u64 ref_root = extent_root->root_key.objectid; - int i = 0, last = 0, ret; - int total = nr * 2; - - if (!nr) - return 0; - - keys = kzalloc(total * sizeof(struct btrfs_key), GFP_NOFS); - if (!keys) - return -ENOMEM; - - data_size = kzalloc(total * sizeof(u32), GFP_NOFS); - if (!data_size) { - kfree(keys); - return -ENOMEM; - } - - list_for_each_entry(op, insert_list, list) { - keys[i].objectid = op->bytenr; - keys[i].offset = op->num_bytes; - keys[i].type = BTRFS_EXTENT_ITEM_KEY; - data_size[i] = sizeof(struct btrfs_extent_item); - i++; - - keys[i].objectid = op->bytenr; - keys[i].offset = op->parent; - keys[i].type = BTRFS_EXTENT_REF_KEY; - data_size[i] = sizeof(struct btrfs_extent_ref); - i++; - } - - op = list_entry(cur, struct pending_extent_op, list); - i = 0; - while (i < total) { - int c; - ret = btrfs_insert_some_items(trans, extent_root, path, - keys+i, data_size+i, total-i); - BUG_ON(ret < 0); - - if (last && ret > 1) - BUG(); - - leaf = path->nodes[0]; - for (c = 0; c < ret; c++) { - int ref_first = keys[i].type == BTRFS_EXTENT_REF_KEY; - - /* - * if the first item we inserted was a backref, then - * the EXTENT_ITEM will be the odd c's, else it will - * be the even c's - */ - if ((ref_first && (c % 2)) || - (!ref_first && !(c % 2))) { - struct btrfs_extent_item *itm; - - itm = btrfs_item_ptr(leaf, path->slots[0] + c, - struct btrfs_extent_item); - btrfs_set_extent_refs(path->nodes[0], itm, 1); - op->del++; - } else { - struct btrfs_extent_ref *ref; - - ref = btrfs_item_ptr(leaf, path->slots[0] + c, - struct btrfs_extent_ref); - btrfs_set_ref_root(leaf, ref, ref_root); - btrfs_set_ref_generation(leaf, ref, - op->generation); - btrfs_set_ref_objectid(leaf, ref, op->level); - btrfs_set_ref_num_refs(leaf, ref, 1); - op->del++; - } - - /* - * using del to see when its ok to free up the - * pending_extent_op. In the case where we insert the - * last item on the list in order to help do batching - * we need to not free the extent op until we actually - * insert the extent_item - */ - if (op->del == 2) { - unlock_extent(&info->extent_ins, op->bytenr, - op->bytenr + op->num_bytes - 1, - GFP_NOFS); - cur = cur->next; - list_del_init(&op->list); - kfree(op); - if (cur != insert_list) - op = list_entry(cur, - struct pending_extent_op, - list); - } - } - btrfs_mark_buffer_dirty(leaf); - btrfs_release_path(extent_root, path); - - /* - * Ok backref's and items usually go right next to eachother, - * but if we could only insert 1 item that means that we - * inserted on the end of a leaf, and we have no idea what may - * be on the next leaf so we just play it safe. In order to - * try and help this case we insert the last thing on our - * insert list so hopefully it will end up being the last - * thing on the leaf and everything else will be before it, - * which will let us insert a whole bunch of items at the same - * time. - */ - if (ret == 1 && !last && (i + ret < total)) { - /* - * last: where we will pick up the next time around - * i: our current key to insert, will be total - 1 - * cur: the current op we are screwing with - * op: duh - */ - last = i + ret; - i = total - 1; - cur = insert_list->prev; - op = list_entry(cur, struct pending_extent_op, list); - } else if (last) { - /* - * ok we successfully inserted the last item on the - * list, lets reset everything - * - * i: our current key to insert, so where we left off - * last time - * last: done with this - * cur: the op we are messing with - * op: duh - * total: since we inserted the last key, we need to - * decrement total so we dont overflow - */ - i = last; - last = 0; - total--; - if (i < total) { - cur = insert_list->next; - op = list_entry(cur, struct pending_extent_op, - list); - } - } else { - i += ret; - } - - cond_resched(); - } - ret = 0; - kfree(keys); - kfree(data_size); - return ret; -} - static noinline int insert_extent_backref(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, u64 bytenr, u64 parent, u64 ref_root, u64 ref_generation, - u64 owner_objectid) + u64 owner_objectid, + int refs_to_add) { struct btrfs_key key; struct extent_buffer *leaf; @@ -829,9 +589,10 @@ static noinline int insert_extent_backref(struct btrfs_trans_handle *trans, btrfs_set_ref_root(leaf, ref, ref_root); btrfs_set_ref_generation(leaf, ref, ref_generation); btrfs_set_ref_objectid(leaf, ref, owner_objectid); - btrfs_set_ref_num_refs(leaf, ref, 1); + btrfs_set_ref_num_refs(leaf, ref, refs_to_add); } else if (ret == -EEXIST) { u64 existing_owner; + BUG_ON(owner_objectid < BTRFS_FIRST_FREE_OBJECTID); leaf = path->nodes[0]; ref = btrfs_item_ptr(leaf, path->slots[0], @@ -845,7 +606,7 @@ static noinline int insert_extent_backref(struct btrfs_trans_handle *trans, num_refs = btrfs_ref_num_refs(leaf, ref); BUG_ON(num_refs == 0); - btrfs_set_ref_num_refs(leaf, ref, num_refs + 1); + btrfs_set_ref_num_refs(leaf, ref, num_refs + refs_to_add); existing_owner = btrfs_ref_objectid(leaf, ref); if (existing_owner != owner_objectid && @@ -865,7 +626,8 @@ out: static noinline int remove_extent_backref(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct btrfs_path *path) + struct btrfs_path *path, + int refs_to_drop) { struct extent_buffer *leaf; struct btrfs_extent_ref *ref; @@ -875,8 +637,8 @@ static noinline int remove_extent_backref(struct btrfs_trans_handle *trans, leaf = path->nodes[0]; ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref); num_refs = btrfs_ref_num_refs(leaf, ref); - BUG_ON(num_refs == 0); - num_refs -= 1; + BUG_ON(num_refs < refs_to_drop); + num_refs -= refs_to_drop; if (num_refs == 0) { ret = btrfs_del_item(trans, root, path); } else { @@ -927,332 +689,28 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, #endif } -static noinline int free_extents(struct btrfs_trans_handle *trans, - struct btrfs_root *extent_root, - struct list_head *del_list) -{ - struct btrfs_fs_info *info = extent_root->fs_info; - struct btrfs_path *path; - struct btrfs_key key, found_key; - struct extent_buffer *leaf; - struct list_head *cur; - struct pending_extent_op *op; - struct btrfs_extent_item *ei; - int ret, num_to_del, extent_slot = 0, found_extent = 0; - u32 refs; - u64 bytes_freed = 0; - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - path->reada = 1; - -search: - /* search for the backref for the current ref we want to delete */ - cur = del_list->next; - op = list_entry(cur, struct pending_extent_op, list); - ret = lookup_extent_backref(trans, extent_root, path, op->bytenr, - op->orig_parent, - extent_root->root_key.objectid, - op->orig_generation, op->level, 1); - if (ret) { - printk(KERN_ERR "btrfs unable to find backref byte nr %llu " - "root %llu gen %llu owner %u\n", - (unsigned long long)op->bytenr, - (unsigned long long)extent_root->root_key.objectid, - (unsigned long long)op->orig_generation, op->level); - btrfs_print_leaf(extent_root, path->nodes[0]); - WARN_ON(1); - goto out; - } - - extent_slot = path->slots[0]; - num_to_del = 1; - found_extent = 0; - - /* - * if we aren't the first item on the leaf we can move back one and see - * if our ref is right next to our extent item - */ - if (likely(extent_slot)) { - extent_slot--; - btrfs_item_key_to_cpu(path->nodes[0], &found_key, - extent_slot); - if (found_key.objectid == op->bytenr && - found_key.type == BTRFS_EXTENT_ITEM_KEY && - found_key.offset == op->num_bytes) { - num_to_del++; - found_extent = 1; - } - } - - /* - * if we didn't find the extent we need to delete the backref and then - * search for the extent item key so we can update its ref count - */ - if (!found_extent) { - key.objectid = op->bytenr; - key.type = BTRFS_EXTENT_ITEM_KEY; - key.offset = op->num_bytes; - - ret = remove_extent_backref(trans, extent_root, path); - BUG_ON(ret); - btrfs_release_path(extent_root, path); - ret = btrfs_search_slot(trans, extent_root, &key, path, -1, 1); - BUG_ON(ret); - extent_slot = path->slots[0]; - } - - /* this is where we update the ref count for the extent */ - leaf = path->nodes[0]; - ei = btrfs_item_ptr(leaf, extent_slot, struct btrfs_extent_item); - refs = btrfs_extent_refs(leaf, ei); - BUG_ON(refs == 0); - refs--; - btrfs_set_extent_refs(leaf, ei, refs); - - btrfs_mark_buffer_dirty(leaf); - - /* - * This extent needs deleting. The reason cur_slot is extent_slot + - * num_to_del is because extent_slot points to the slot where the extent - * is, and if the backref was not right next to the extent we will be - * deleting at least 1 item, and will want to start searching at the - * slot directly next to extent_slot. However if we did find the - * backref next to the extent item them we will be deleting at least 2 - * items and will want to start searching directly after the ref slot - */ - if (!refs) { - struct list_head *pos, *n, *end; - int cur_slot = extent_slot+num_to_del; - u64 super_used; - u64 root_used; - - path->slots[0] = extent_slot; - bytes_freed = op->num_bytes; - - mutex_lock(&info->pinned_mutex); - ret = pin_down_bytes(trans, extent_root, op->bytenr, - op->num_bytes, op->level >= - BTRFS_FIRST_FREE_OBJECTID); - mutex_unlock(&info->pinned_mutex); - BUG_ON(ret < 0); - op->del = ret; - - /* - * we need to see if we can delete multiple things at once, so - * start looping through the list of extents we are wanting to - * delete and see if their extent/backref's are right next to - * eachother and the extents only have 1 ref - */ - for (pos = cur->next; pos != del_list; pos = pos->next) { - struct pending_extent_op *tmp; - - tmp = list_entry(pos, struct pending_extent_op, list); - - /* we only want to delete extent+ref at this stage */ - if (cur_slot >= btrfs_header_nritems(leaf) - 1) - break; - - btrfs_item_key_to_cpu(leaf, &found_key, cur_slot); - if (found_key.objectid != tmp->bytenr || - found_key.type != BTRFS_EXTENT_ITEM_KEY || - found_key.offset != tmp->num_bytes) - break; - - /* check to make sure this extent only has one ref */ - ei = btrfs_item_ptr(leaf, cur_slot, - struct btrfs_extent_item); - if (btrfs_extent_refs(leaf, ei) != 1) - break; - - btrfs_item_key_to_cpu(leaf, &found_key, cur_slot+1); - if (found_key.objectid != tmp->bytenr || - found_key.type != BTRFS_EXTENT_REF_KEY || - found_key.offset != tmp->orig_parent) - break; - - /* - * the ref is right next to the extent, we can set the - * ref count to 0 since we will delete them both now - */ - btrfs_set_extent_refs(leaf, ei, 0); - - /* pin down the bytes for this extent */ - mutex_lock(&info->pinned_mutex); - ret = pin_down_bytes(trans, extent_root, tmp->bytenr, - tmp->num_bytes, tmp->level >= - BTRFS_FIRST_FREE_OBJECTID); - mutex_unlock(&info->pinned_mutex); - BUG_ON(ret < 0); - - /* - * use the del field to tell if we need to go ahead and - * free up the extent when we delete the item or not. - */ - tmp->del = ret; - bytes_freed += tmp->num_bytes; - - num_to_del += 2; - cur_slot += 2; - } - end = pos; - - /* update the free space counters */ - spin_lock(&info->delalloc_lock); - super_used = btrfs_super_bytes_used(&info->super_copy); - btrfs_set_super_bytes_used(&info->super_copy, - super_used - bytes_freed); - - root_used = btrfs_root_used(&extent_root->root_item); - btrfs_set_root_used(&extent_root->root_item, - root_used - bytes_freed); - spin_unlock(&info->delalloc_lock); - - /* delete the items */ - ret = btrfs_del_items(trans, extent_root, path, - path->slots[0], num_to_del); - BUG_ON(ret); - - /* - * loop through the extents we deleted and do the cleanup work - * on them - */ - for (pos = cur, n = pos->next; pos != end; - pos = n, n = pos->next) { - struct pending_extent_op *tmp; - tmp = list_entry(pos, struct pending_extent_op, list); - - /* - * remember tmp->del tells us wether or not we pinned - * down the extent - */ - ret = update_block_group(trans, extent_root, - tmp->bytenr, tmp->num_bytes, 0, - tmp->del); - BUG_ON(ret); - - list_del_init(&tmp->list); - unlock_extent(&info->extent_ins, tmp->bytenr, - tmp->bytenr + tmp->num_bytes - 1, - GFP_NOFS); - kfree(tmp); - } - } else if (refs && found_extent) { - /* - * the ref and extent were right next to eachother, but the - * extent still has a ref, so just free the backref and keep - * going - */ - ret = remove_extent_backref(trans, extent_root, path); - BUG_ON(ret); - - list_del_init(&op->list); - unlock_extent(&info->extent_ins, op->bytenr, - op->bytenr + op->num_bytes - 1, GFP_NOFS); - kfree(op); - } else { - /* - * the extent has multiple refs and the backref we were looking - * for was not right next to it, so just unlock and go next, - * we're good to go - */ - list_del_init(&op->list); - unlock_extent(&info->extent_ins, op->bytenr, - op->bytenr + op->num_bytes - 1, GFP_NOFS); - kfree(op); - } - - btrfs_release_path(extent_root, path); - if (!list_empty(del_list)) - goto search; - -out: - btrfs_free_path(path); - return ret; -} - static int __btrfs_update_extent_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, + u64 num_bytes, u64 orig_parent, u64 parent, u64 orig_root, u64 ref_root, u64 orig_generation, u64 ref_generation, u64 owner_objectid) { int ret; - struct btrfs_root *extent_root = root->fs_info->extent_root; - struct btrfs_path *path; + int pin = owner_objectid < BTRFS_FIRST_FREE_OBJECTID; - if (root == root->fs_info->extent_root) { - struct pending_extent_op *extent_op; - u64 num_bytes; - - BUG_ON(owner_objectid >= BTRFS_MAX_LEVEL); - num_bytes = btrfs_level_size(root, (int)owner_objectid); - mutex_lock(&root->fs_info->extent_ins_mutex); - if (test_range_bit(&root->fs_info->extent_ins, bytenr, - bytenr + num_bytes - 1, EXTENT_WRITEBACK, 0)) { - u64 priv; - ret = get_state_private(&root->fs_info->extent_ins, - bytenr, &priv); - BUG_ON(ret); - extent_op = (struct pending_extent_op *) - (unsigned long)priv; - BUG_ON(extent_op->parent != orig_parent); - BUG_ON(extent_op->generation != orig_generation); - - extent_op->parent = parent; - extent_op->generation = ref_generation; - } else { - extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS); - BUG_ON(!extent_op); - - extent_op->type = PENDING_BACKREF_UPDATE; - extent_op->bytenr = bytenr; - extent_op->num_bytes = num_bytes; - extent_op->parent = parent; - extent_op->orig_parent = orig_parent; - extent_op->generation = ref_generation; - extent_op->orig_generation = orig_generation; - extent_op->level = (int)owner_objectid; - INIT_LIST_HEAD(&extent_op->list); - extent_op->del = 0; - - set_extent_bits(&root->fs_info->extent_ins, - bytenr, bytenr + num_bytes - 1, - EXTENT_WRITEBACK, GFP_NOFS); - set_state_private(&root->fs_info->extent_ins, - bytenr, (unsigned long)extent_op); - } - mutex_unlock(&root->fs_info->extent_ins_mutex); - return 0; - } - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - ret = lookup_extent_backref(trans, extent_root, path, - bytenr, orig_parent, orig_root, - orig_generation, owner_objectid, 1); - if (ret) - goto out; - ret = remove_extent_backref(trans, extent_root, path); - if (ret) - goto out; - ret = insert_extent_backref(trans, extent_root, path, bytenr, - parent, ref_root, ref_generation, - owner_objectid); + ret = btrfs_update_delayed_ref(trans, bytenr, num_bytes, + orig_parent, parent, orig_root, + ref_root, orig_generation, + ref_generation, owner_objectid, pin); BUG_ON(ret); - finish_current_insert(trans, extent_root, 0); - del_pending_extents(trans, extent_root, 0); -out: - btrfs_free_path(path); return ret; } int btrfs_update_extent_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, - u64 orig_parent, u64 parent, + u64 num_bytes, u64 orig_parent, u64 parent, u64 ref_root, u64 ref_generation, u64 owner_objectid) { @@ -1260,19 +718,35 @@ int btrfs_update_extent_ref(struct btrfs_trans_handle *trans, if (ref_root == BTRFS_TREE_LOG_OBJECTID && owner_objectid < BTRFS_FIRST_FREE_OBJECTID) return 0; - ret = __btrfs_update_extent_ref(trans, root, bytenr, orig_parent, - parent, ref_root, ref_root, - ref_generation, ref_generation, - owner_objectid); + + ret = __btrfs_update_extent_ref(trans, root, bytenr, num_bytes, + orig_parent, parent, ref_root, + ref_root, ref_generation, + ref_generation, owner_objectid); return ret; } - static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, + u64 num_bytes, u64 orig_parent, u64 parent, u64 orig_root, u64 ref_root, u64 orig_generation, u64 ref_generation, u64 owner_objectid) +{ + int ret; + + ret = btrfs_add_delayed_ref(trans, bytenr, num_bytes, parent, ref_root, + ref_generation, owner_objectid, + BTRFS_ADD_DELAYED_REF, 0); + BUG_ON(ret); + return ret; +} + +static noinline_for_stack int add_extent_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 bytenr, + u64 num_bytes, u64 parent, u64 ref_root, + u64 ref_generation, u64 owner_objectid, + int refs_to_add) { struct btrfs_path *path; int ret; @@ -1288,15 +762,19 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, path->reada = 1; key.objectid = bytenr; key.type = BTRFS_EXTENT_ITEM_KEY; - key.offset = (u64)-1; + key.offset = num_bytes; - ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, path, - 0, 1); + /* first find the extent item and update its reference count */ + ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, + path, 0, 1); if (ret < 0) return ret; - BUG_ON(ret == 0 || path->slots[0] == 0); - path->slots[0]--; + if (ret > 0) { + WARN_ON(1); + btrfs_free_path(path); + return -EIO; + } l = path->nodes[0]; btrfs_item_key_to_cpu(l, &key, path->slots[0]); @@ -1310,21 +788,20 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, BUG_ON(key.type != BTRFS_EXTENT_ITEM_KEY); item = btrfs_item_ptr(l, path->slots[0], struct btrfs_extent_item); + refs = btrfs_extent_refs(l, item); - btrfs_set_extent_refs(l, item, refs + 1); + btrfs_set_extent_refs(l, item, refs + refs_to_add); btrfs_mark_buffer_dirty(path->nodes[0]); btrfs_release_path(root->fs_info->extent_root, path); path->reada = 1; + /* now insert the actual backref */ ret = insert_extent_backref(trans, root->fs_info->extent_root, path, bytenr, parent, ref_root, ref_generation, - owner_objectid); + owner_objectid, refs_to_add); BUG_ON(ret); - finish_current_insert(trans, root->fs_info->extent_root, 0); - del_pending_extents(trans, root->fs_info->extent_root, 0); - btrfs_free_path(path); return 0; } @@ -1339,68 +816,245 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, if (ref_root == BTRFS_TREE_LOG_OBJECTID && owner_objectid < BTRFS_FIRST_FREE_OBJECTID) return 0; - ret = __btrfs_inc_extent_ref(trans, root, bytenr, 0, parent, + + ret = __btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0, parent, 0, ref_root, 0, ref_generation, owner_objectid); return ret; } -int btrfs_extent_post_op(struct btrfs_trans_handle *trans, - struct btrfs_root *root) +static int drop_delayed_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_delayed_ref_node *node) +{ + int ret = 0; + struct btrfs_delayed_ref *ref = btrfs_delayed_node_to_ref(node); + + BUG_ON(node->ref_mod == 0); + ret = __btrfs_free_extent(trans, root, node->bytenr, node->num_bytes, + node->parent, ref->root, ref->generation, + ref->owner_objectid, ref->pin, node->ref_mod); + + return ret; +} + +/* helper function to actually process a single delayed ref entry */ +static noinline int run_one_delayed_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_delayed_ref_node *node, + int insert_reserved) { - u64 start; - u64 end; int ret; + struct btrfs_delayed_ref *ref; - while(1) { - finish_current_insert(trans, root->fs_info->extent_root, 1); - del_pending_extents(trans, root->fs_info->extent_root, 1); + if (node->parent == (u64)-1) { + struct btrfs_delayed_ref_head *head; + /* + * we've hit the end of the chain and we were supposed + * to insert this extent into the tree. But, it got + * deleted before we ever needed to insert it, so all + * we have to do is clean up the accounting + */ + if (insert_reserved) { + update_reserved_extents(root, node->bytenr, + node->num_bytes, 0); + } + head = btrfs_delayed_node_to_head(node); + mutex_unlock(&head->mutex); + return 0; + } - /* is there more work to do? */ - ret = find_first_extent_bit(&root->fs_info->pending_del, - 0, &start, &end, EXTENT_WRITEBACK); - if (!ret) - continue; - ret = find_first_extent_bit(&root->fs_info->extent_ins, - 0, &start, &end, EXTENT_WRITEBACK); - if (!ret) - continue; - break; + ref = btrfs_delayed_node_to_ref(node); + if (ref->action == BTRFS_ADD_DELAYED_REF) { + if (insert_reserved) { + struct btrfs_key ins; + + ins.objectid = node->bytenr; + ins.offset = node->num_bytes; + ins.type = BTRFS_EXTENT_ITEM_KEY; + + /* record the full extent allocation */ + ret = __btrfs_alloc_reserved_extent(trans, root, + node->parent, ref->root, + ref->generation, ref->owner_objectid, + &ins, node->ref_mod); + update_reserved_extents(root, node->bytenr, + node->num_bytes, 0); + } else { + /* just add one backref */ + ret = add_extent_ref(trans, root, node->bytenr, + node->num_bytes, + node->parent, ref->root, ref->generation, + ref->owner_objectid, node->ref_mod); + } + BUG_ON(ret); + } else if (ref->action == BTRFS_DROP_DELAYED_REF) { + WARN_ON(insert_reserved); + ret = drop_delayed_ref(trans, root, node); } return 0; } -int btrfs_lookup_extent_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 bytenr, - u64 num_bytes, u32 *refs) +static noinline struct btrfs_delayed_ref_node * +select_delayed_ref(struct btrfs_delayed_ref_head *head) { - struct btrfs_path *path; - int ret; - struct btrfs_key key; - struct extent_buffer *l; - struct btrfs_extent_item *item; - - WARN_ON(num_bytes < root->sectorsize); - path = btrfs_alloc_path(); - path->reada = 1; - key.objectid = bytenr; - key.offset = num_bytes; - btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY); - ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, path, - 0, 0); - if (ret < 0) - goto out; - if (ret != 0) { - btrfs_print_leaf(root, path->nodes[0]); - printk(KERN_INFO "btrfs failed to find block number %llu\n", - (unsigned long long)bytenr); - BUG(); + struct rb_node *node; + struct btrfs_delayed_ref_node *ref; + int action = BTRFS_ADD_DELAYED_REF; +again: + /* + * select delayed ref of type BTRFS_ADD_DELAYED_REF first. + * this prevents ref count from going down to zero when + * there still are pending delayed ref. + */ + node = rb_prev(&head->node.rb_node); + while (1) { + if (!node) + break; + ref = rb_entry(node, struct btrfs_delayed_ref_node, + rb_node); + if (ref->bytenr != head->node.bytenr) + break; + if (btrfs_delayed_node_to_ref(ref)->action == action) + return ref; + node = rb_prev(node); + } + if (action == BTRFS_ADD_DELAYED_REF) { + action = BTRFS_DROP_DELAYED_REF; + goto again; + } + return NULL; +} + +/* + * this starts processing the delayed reference count updates and + * extent insertions we have queued up so far. count can be + * 0, which means to process everything in the tree at the start + * of the run (but not newly added entries), or it can be some target + * number you'd like to process. + */ +int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, + struct btrfs_root *root, unsigned long count) +{ + struct rb_node *node; + struct btrfs_delayed_ref_root *delayed_refs; + struct btrfs_delayed_ref_node *ref; + struct btrfs_delayed_ref_head *locked_ref = NULL; + int ret; + int must_insert_reserved = 0; + int run_all = count == (unsigned long)-1; + + if (root == root->fs_info->extent_root) + root = root->fs_info->tree_root; + + delayed_refs = &trans->transaction->delayed_refs; +again: + spin_lock(&delayed_refs->lock); + if (count == 0) + count = delayed_refs->num_entries; + while (1) { + if (!locked_ref) { + /* + * no locked ref, go find something we can + * process in the rbtree. We start at + * the beginning of the tree, there may be less + * lock contention if we do something smarter here. + */ + node = rb_first(&delayed_refs->root); + if (!node) { + spin_unlock(&delayed_refs->lock); + break; + } + + ref = rb_entry(node, struct btrfs_delayed_ref_node, + rb_node); + ret = btrfs_lock_delayed_ref(trans, ref, &locked_ref); + if (ret) { + spin_unlock(&delayed_refs->lock); + break; + } + } + + /* + * record the must insert reserved flag before we + * drop the spin lock. + */ + must_insert_reserved = locked_ref->must_insert_reserved; + locked_ref->must_insert_reserved = 0; + + /* + * locked_ref is the head node, so we have to go one + * node back for any delayed ref updates + */ + + ref = select_delayed_ref(locked_ref); + if (!ref) { + /* All delayed refs have been processed, Go ahead + * and send the head node to run_one_delayed_ref, + * so that any accounting fixes can happen + */ + ref = &locked_ref->node; + locked_ref = NULL; + } + + ref->in_tree = 0; + rb_erase(&ref->rb_node, &delayed_refs->root); + delayed_refs->num_entries--; + spin_unlock(&delayed_refs->lock); + + ret = run_one_delayed_ref(trans, root, ref, + must_insert_reserved); + BUG_ON(ret); + btrfs_put_delayed_ref(ref); + + /* once we lock the head ref, we have to process all the + * entries for it. So, we might end up doing more entries + * that count was asking us to do. + */ + if (count > 0) + count--; + + /* + * we set locked_ref to null above if we're all done + * with this bytenr + */ + if (!locked_ref && count == 0) + break; + + spin_lock(&delayed_refs->lock); + } + if (run_all) { + spin_lock(&delayed_refs->lock); + node = rb_first(&delayed_refs->root); + if (!node) { + spin_unlock(&delayed_refs->lock); + goto out; + } + + while (node) { + ref = rb_entry(node, struct btrfs_delayed_ref_node, + rb_node); + if (btrfs_delayed_ref_is_head(ref)) { + struct btrfs_delayed_ref_head *head; + + head = btrfs_delayed_node_to_head(ref); + atomic_inc(&ref->refs); + + spin_unlock(&delayed_refs->lock); + mutex_lock(&head->mutex); + mutex_unlock(&head->mutex); + + btrfs_put_delayed_ref(ref); + goto again; + } + node = rb_next(node); + } + spin_unlock(&delayed_refs->lock); + count = (unsigned long)-1; + schedule_timeout(1); + goto again; } - l = path->nodes[0]; - item = btrfs_item_ptr(l, path->slots[0], struct btrfs_extent_item); - *refs = btrfs_extent_refs(l, item); out: - btrfs_free_path(path); return 0; } @@ -1624,7 +1278,7 @@ noinline int btrfs_inc_ref(struct btrfs_trans_handle *trans, int refi = 0; int slot; int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *, - u64, u64, u64, u64, u64, u64, u64, u64); + u64, u64, u64, u64, u64, u64, u64, u64, u64); ref_root = btrfs_header_owner(buf); ref_generation = btrfs_header_generation(buf); @@ -1696,12 +1350,19 @@ noinline int btrfs_inc_ref(struct btrfs_trans_handle *trans, if (level == 0) { btrfs_item_key_to_cpu(buf, &key, slot); + fi = btrfs_item_ptr(buf, slot, + struct btrfs_file_extent_item); + + bytenr = btrfs_file_extent_disk_bytenr(buf, fi); + if (bytenr == 0) + continue; ret = process_func(trans, root, bytenr, - orig_buf->start, buf->start, - orig_root, ref_root, - orig_generation, ref_generation, - key.objectid); + btrfs_file_extent_disk_num_bytes(buf, fi), + orig_buf->start, buf->start, + orig_root, ref_root, + orig_generation, ref_generation, + key.objectid); if (ret) { faili = slot; @@ -1709,7 +1370,7 @@ noinline int btrfs_inc_ref(struct btrfs_trans_handle *trans, goto fail; } } else { - ret = process_func(trans, root, bytenr, + ret = process_func(trans, root, bytenr, buf->len, orig_buf->start, buf->start, orig_root, ref_root, orig_generation, ref_generation, @@ -1786,17 +1447,17 @@ int btrfs_update_ref(struct btrfs_trans_handle *trans, if (bytenr == 0) continue; ret = __btrfs_update_extent_ref(trans, root, bytenr, - orig_buf->start, buf->start, - orig_root, ref_root, - orig_generation, ref_generation, - key.objectid); + btrfs_file_extent_disk_num_bytes(buf, fi), + orig_buf->start, buf->start, + orig_root, ref_root, orig_generation, + ref_generation, key.objectid); if (ret) goto fail; } else { bytenr = btrfs_node_blockptr(buf, slot); ret = __btrfs_update_extent_ref(trans, root, bytenr, - orig_buf->start, buf->start, - orig_root, ref_root, + buf->len, orig_buf->start, + buf->start, orig_root, ref_root, orig_generation, ref_generation, level - 1); if (ret) @@ -1815,7 +1476,6 @@ static int write_one_cache_group(struct btrfs_trans_handle *trans, struct btrfs_block_group_cache *cache) { int ret; - int pending_ret; struct btrfs_root *extent_root = root->fs_info->extent_root; unsigned long bi; struct extent_buffer *leaf; @@ -1831,12 +1491,8 @@ static int write_one_cache_group(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(leaf); btrfs_release_path(extent_root, path); fail: - finish_current_insert(trans, extent_root, 0); - pending_ret = del_pending_extents(trans, extent_root, 0); if (ret) return ret; - if (pending_ret) - return pending_ret; return 0; } @@ -2474,193 +2130,6 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, return ret; } -static int finish_current_insert(struct btrfs_trans_handle *trans, - struct btrfs_root *extent_root, int all) -{ - u64 start; - u64 end; - u64 priv; - u64 search = 0; - struct btrfs_fs_info *info = extent_root->fs_info; - struct btrfs_path *path; - struct pending_extent_op *extent_op, *tmp; - struct list_head insert_list, update_list; - int ret; - int num_inserts = 0, max_inserts, restart = 0; - - path = btrfs_alloc_path(); - INIT_LIST_HEAD(&insert_list); - INIT_LIST_HEAD(&update_list); - - max_inserts = extent_root->leafsize / - (2 * sizeof(struct btrfs_key) + 2 * sizeof(struct btrfs_item) + - sizeof(struct btrfs_extent_ref) + - sizeof(struct btrfs_extent_item)); -again: - mutex_lock(&info->extent_ins_mutex); - while (1) { - ret = find_first_extent_bit(&info->extent_ins, search, &start, - &end, EXTENT_WRITEBACK); - if (ret) { - if (restart && !num_inserts && - list_empty(&update_list)) { - restart = 0; - search = 0; - continue; - } - break; - } - - ret = try_lock_extent(&info->extent_ins, start, end, GFP_NOFS); - if (!ret) { - if (all) - restart = 1; - search = end + 1; - if (need_resched()) { - mutex_unlock(&info->extent_ins_mutex); - cond_resched(); - mutex_lock(&info->extent_ins_mutex); - } - continue; - } - - ret = get_state_private(&info->extent_ins, start, &priv); - BUG_ON(ret); - extent_op = (struct pending_extent_op *)(unsigned long) priv; - - if (extent_op->type == PENDING_EXTENT_INSERT) { - num_inserts++; - list_add_tail(&extent_op->list, &insert_list); - search = end + 1; - if (num_inserts == max_inserts) { - restart = 1; - break; - } - } else if (extent_op->type == PENDING_BACKREF_UPDATE) { - list_add_tail(&extent_op->list, &update_list); - search = end + 1; - } else { - BUG(); - } - } - - /* - * process the update list, clear the writeback bit for it, and if - * somebody marked this thing for deletion then just unlock it and be - * done, the free_extents will handle it - */ - list_for_each_entry_safe(extent_op, tmp, &update_list, list) { - clear_extent_bits(&info->extent_ins, extent_op->bytenr, - extent_op->bytenr + extent_op->num_bytes - 1, - EXTENT_WRITEBACK, GFP_NOFS); - if (extent_op->del) { - list_del_init(&extent_op->list); - unlock_extent(&info->extent_ins, extent_op->bytenr, - extent_op->bytenr + extent_op->num_bytes - - 1, GFP_NOFS); - kfree(extent_op); - } - } - mutex_unlock(&info->extent_ins_mutex); - - /* - * still have things left on the update list, go ahead an update - * everything - */ - if (!list_empty(&update_list)) { - ret = update_backrefs(trans, extent_root, path, &update_list); - BUG_ON(ret); - - /* we may have COW'ed new blocks, so lets start over */ - if (all) - restart = 1; - } - - /* - * if no inserts need to be done, but we skipped some extents and we - * need to make sure everything is cleaned then reset everything and - * go back to the beginning - */ - if (!num_inserts && restart) { - search = 0; - restart = 0; - INIT_LIST_HEAD(&update_list); - INIT_LIST_HEAD(&insert_list); - goto again; - } else if (!num_inserts) { - goto out; - } - - /* - * process the insert extents list. Again if we are deleting this - * extent, then just unlock it, pin down the bytes if need be, and be - * done with it. Saves us from having to actually insert the extent - * into the tree and then subsequently come along and delete it - */ - mutex_lock(&info->extent_ins_mutex); - list_for_each_entry_safe(extent_op, tmp, &insert_list, list) { - clear_extent_bits(&info->extent_ins, extent_op->bytenr, - extent_op->bytenr + extent_op->num_bytes - 1, - EXTENT_WRITEBACK, GFP_NOFS); - if (extent_op->del) { - u64 used; - list_del_init(&extent_op->list); - unlock_extent(&info->extent_ins, extent_op->bytenr, - extent_op->bytenr + extent_op->num_bytes - - 1, GFP_NOFS); - - mutex_lock(&extent_root->fs_info->pinned_mutex); - ret = pin_down_bytes(trans, extent_root, - extent_op->bytenr, - extent_op->num_bytes, 0); - mutex_unlock(&extent_root->fs_info->pinned_mutex); - - spin_lock(&info->delalloc_lock); - used = btrfs_super_bytes_used(&info->super_copy); - btrfs_set_super_bytes_used(&info->super_copy, - used - extent_op->num_bytes); - used = btrfs_root_used(&extent_root->root_item); - btrfs_set_root_used(&extent_root->root_item, - used - extent_op->num_bytes); - spin_unlock(&info->delalloc_lock); - - ret = update_block_group(trans, extent_root, - extent_op->bytenr, - extent_op->num_bytes, - 0, ret > 0); - BUG_ON(ret); - kfree(extent_op); - num_inserts--; - } - } - mutex_unlock(&info->extent_ins_mutex); - - ret = insert_extents(trans, extent_root, path, &insert_list, - num_inserts); - BUG_ON(ret); - - /* - * if restart is set for whatever reason we need to go back and start - * searching through the pending list again. - * - * We just inserted some extents, which could have resulted in new - * blocks being allocated, which would result in new blocks needing - * updates, so if all is set we _must_ restart to get the updated - * blocks. - */ - if (restart || all) { - INIT_LIST_HEAD(&insert_list); - INIT_LIST_HEAD(&update_list); - search = 0; - restart = 0; - num_inserts = 0; - goto again; - } -out: - btrfs_free_path(path); - return 0; -} - static int pin_down_bytes(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, int is_data) @@ -2686,6 +2155,7 @@ static int pin_down_bytes(struct btrfs_trans_handle *trans, u64 header_transid = btrfs_header_generation(buf); if (header_owner != BTRFS_TREE_LOG_OBJECTID && header_owner != BTRFS_TREE_RELOC_OBJECTID && + header_owner != BTRFS_DATA_RELOC_TREE_OBJECTID && header_transid == trans->transid && !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { clean_tree_block(NULL, root, buf); @@ -2710,7 +2180,8 @@ static int __free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid, u64 ref_generation, - u64 owner_objectid, int pin, int mark_free) + u64 owner_objectid, int pin, int mark_free, + int refs_to_drop) { struct btrfs_path *path; struct btrfs_key key; @@ -2753,7 +2224,8 @@ static int __free_extent(struct btrfs_trans_handle *trans, break; } if (!found_extent) { - ret = remove_extent_backref(trans, extent_root, path); + ret = remove_extent_backref(trans, extent_root, path, + refs_to_drop); BUG_ON(ret); btrfs_release_path(extent_root, path); ret = btrfs_search_slot(trans, extent_root, @@ -2771,8 +2243,9 @@ static int __free_extent(struct btrfs_trans_handle *trans, btrfs_print_leaf(extent_root, path->nodes[0]); WARN_ON(1); printk(KERN_ERR "btrfs unable to find ref byte nr %llu " - "root %llu gen %llu owner %llu\n", + "parent %llu root %llu gen %llu owner %llu\n", (unsigned long long)bytenr, + (unsigned long long)parent, (unsigned long long)root_objectid, (unsigned long long)ref_generation, (unsigned long long)owner_objectid); @@ -2782,17 +2255,23 @@ static int __free_extent(struct btrfs_trans_handle *trans, ei = btrfs_item_ptr(leaf, extent_slot, struct btrfs_extent_item); refs = btrfs_extent_refs(leaf, ei); - BUG_ON(refs == 0); - refs -= 1; - btrfs_set_extent_refs(leaf, ei, refs); + /* + * we're not allowed to delete the extent item if there + * are other delayed ref updates pending + */ + + BUG_ON(refs < refs_to_drop); + refs -= refs_to_drop; + btrfs_set_extent_refs(leaf, ei, refs); btrfs_mark_buffer_dirty(leaf); - if (refs == 0 && found_extent && path->slots[0] == extent_slot + 1) { + if (refs == 0 && found_extent && + path->slots[0] == extent_slot + 1) { struct btrfs_extent_ref *ref; ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref); - BUG_ON(btrfs_ref_num_refs(leaf, ref) != 1); + BUG_ON(btrfs_ref_num_refs(leaf, ref) != refs_to_drop); /* if the back ref and the extent are next to each other * they get deleted below in one shot */ @@ -2800,7 +2279,8 @@ static int __free_extent(struct btrfs_trans_handle *trans, num_to_del = 2; } else if (found_extent) { /* otherwise delete the extent back ref */ - ret = remove_extent_backref(trans, extent_root, path); + ret = remove_extent_backref(trans, extent_root, path, + refs_to_drop); BUG_ON(ret); /* if refs are 0, we need to setup the path for deletion */ if (refs == 0) { @@ -2850,218 +2330,35 @@ static int __free_extent(struct btrfs_trans_handle *trans, BUG_ON(ret); } btrfs_free_path(path); - finish_current_insert(trans, extent_root, 0); return ret; } -/* - * find all the blocks marked as pending in the radix tree and remove - * them from the extent map - */ -static int del_pending_extents(struct btrfs_trans_handle *trans, - struct btrfs_root *extent_root, int all) -{ - int ret; - int err = 0; - u64 start; - u64 end; - u64 priv; - u64 search = 0; - int nr = 0, skipped = 0; - struct extent_io_tree *pending_del; - struct extent_io_tree *extent_ins; - struct pending_extent_op *extent_op; - struct btrfs_fs_info *info = extent_root->fs_info; - struct list_head delete_list; - - INIT_LIST_HEAD(&delete_list); - extent_ins = &extent_root->fs_info->extent_ins; - pending_del = &extent_root->fs_info->pending_del; - -again: - mutex_lock(&info->extent_ins_mutex); - while (1) { - ret = find_first_extent_bit(pending_del, search, &start, &end, - EXTENT_WRITEBACK); - if (ret) { - if (all && skipped && !nr) { - search = 0; - skipped = 0; - continue; - } - mutex_unlock(&info->extent_ins_mutex); - break; - } - - ret = try_lock_extent(extent_ins, start, end, GFP_NOFS); - if (!ret) { - search = end+1; - skipped = 1; - - if (need_resched()) { - mutex_unlock(&info->extent_ins_mutex); - cond_resched(); - mutex_lock(&info->extent_ins_mutex); - } - - continue; - } - BUG_ON(ret < 0); - - ret = get_state_private(pending_del, start, &priv); - BUG_ON(ret); - extent_op = (struct pending_extent_op *)(unsigned long)priv; - - clear_extent_bits(pending_del, start, end, EXTENT_WRITEBACK, - GFP_NOFS); - if (!test_range_bit(extent_ins, start, end, - EXTENT_WRITEBACK, 0)) { - list_add_tail(&extent_op->list, &delete_list); - nr++; - } else { - kfree(extent_op); - - ret = get_state_private(&info->extent_ins, start, - &priv); - BUG_ON(ret); - extent_op = (struct pending_extent_op *) - (unsigned long)priv; - - clear_extent_bits(&info->extent_ins, start, end, - EXTENT_WRITEBACK, GFP_NOFS); - - if (extent_op->type == PENDING_BACKREF_UPDATE) { - list_add_tail(&extent_op->list, &delete_list); - search = end + 1; - nr++; - continue; - } - - mutex_lock(&extent_root->fs_info->pinned_mutex); - ret = pin_down_bytes(trans, extent_root, start, - end + 1 - start, 0); - mutex_unlock(&extent_root->fs_info->pinned_mutex); - - ret = update_block_group(trans, extent_root, start, - end + 1 - start, 0, ret > 0); - - unlock_extent(extent_ins, start, end, GFP_NOFS); - BUG_ON(ret); - kfree(extent_op); - } - if (ret) - err = ret; - - search = end + 1; - - if (need_resched()) { - mutex_unlock(&info->extent_ins_mutex); - cond_resched(); - mutex_lock(&info->extent_ins_mutex); - } - } - - if (nr) { - ret = free_extents(trans, extent_root, &delete_list); - BUG_ON(ret); - } - - if (all && skipped) { - INIT_LIST_HEAD(&delete_list); - search = 0; - nr = 0; - goto again; - } - - if (!err) - finish_current_insert(trans, extent_root, 0); - return err; -} - /* * remove an extent from the root, returns 0 on success */ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u64 bytenr, u64 num_bytes, u64 parent, - u64 root_objectid, u64 ref_generation, - u64 owner_objectid, int pin) + struct btrfs_root *root, + u64 bytenr, u64 num_bytes, u64 parent, + u64 root_objectid, u64 ref_generation, + u64 owner_objectid, int pin, + int refs_to_drop) { - struct btrfs_root *extent_root = root->fs_info->extent_root; - int pending_ret; - int ret; - WARN_ON(num_bytes < root->sectorsize); - if (root == extent_root) { - struct pending_extent_op *extent_op = NULL; - mutex_lock(&root->fs_info->extent_ins_mutex); - if (test_range_bit(&root->fs_info->extent_ins, bytenr, - bytenr + num_bytes - 1, EXTENT_WRITEBACK, 0)) { - u64 priv; - ret = get_state_private(&root->fs_info->extent_ins, - bytenr, &priv); - BUG_ON(ret); - extent_op = (struct pending_extent_op *) - (unsigned long)priv; - - extent_op->del = 1; - if (extent_op->type == PENDING_EXTENT_INSERT) { - mutex_unlock(&root->fs_info->extent_ins_mutex); - return 0; - } - } - - if (extent_op) { - ref_generation = extent_op->orig_generation; - parent = extent_op->orig_parent; - } - - extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS); - BUG_ON(!extent_op); - - extent_op->type = PENDING_EXTENT_DELETE; - extent_op->bytenr = bytenr; - extent_op->num_bytes = num_bytes; - extent_op->parent = parent; - extent_op->orig_parent = parent; - extent_op->generation = ref_generation; - extent_op->orig_generation = ref_generation; - extent_op->level = (int)owner_objectid; - INIT_LIST_HEAD(&extent_op->list); - extent_op->del = 0; - - set_extent_bits(&root->fs_info->pending_del, - bytenr, bytenr + num_bytes - 1, - EXTENT_WRITEBACK, GFP_NOFS); - set_state_private(&root->fs_info->pending_del, - bytenr, (unsigned long)extent_op); - mutex_unlock(&root->fs_info->extent_ins_mutex); - return 0; - } - /* if metadata always pin */ - if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID) { - if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) { - mutex_lock(&root->fs_info->pinned_mutex); - btrfs_update_pinned_extents(root, bytenr, num_bytes, 1); - mutex_unlock(&root->fs_info->pinned_mutex); - update_reserved_extents(root, bytenr, num_bytes, 0); - return 0; - } + /* + * if metadata always pin + * if data pin when any transaction has committed this + */ + if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID || + ref_generation != trans->transid) pin = 1; - } - /* if data pin when any transaction has committed this */ if (ref_generation != trans->transid) pin = 1; - ret = __free_extent(trans, root, bytenr, num_bytes, parent, + return __free_extent(trans, root, bytenr, num_bytes, parent, root_objectid, ref_generation, - owner_objectid, pin, pin == 0); - - finish_current_insert(trans, root->fs_info->extent_root, 0); - pending_ret = del_pending_extents(trans, root->fs_info->extent_root, 0); - return ret ? ret : pending_ret; + owner_objectid, pin, pin == 0, refs_to_drop); } int btrfs_free_extent(struct btrfs_trans_handle *trans, @@ -3072,9 +2369,26 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, { int ret; - ret = __btrfs_free_extent(trans, root, bytenr, num_bytes, parent, - root_objectid, ref_generation, - owner_objectid, pin); + /* + * tree log blocks never actually go into the extent allocation + * tree, just update pinning info and exit early. + * + * data extents referenced by the tree log do need to have + * their reference counts bumped. + */ + if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID && + owner_objectid < BTRFS_FIRST_FREE_OBJECTID) { + mutex_lock(&root->fs_info->pinned_mutex); + btrfs_update_pinned_extents(root, bytenr, num_bytes, 1); + mutex_unlock(&root->fs_info->pinned_mutex); + update_reserved_extents(root, bytenr, num_bytes, 0); + ret = 0; + } else { + ret = btrfs_add_delayed_ref(trans, bytenr, num_bytes, parent, + root_objectid, ref_generation, + owner_objectid, + BTRFS_DROP_DELAYED_REF, 1); + } return ret; } @@ -3475,10 +2789,10 @@ int btrfs_reserve_extent(struct btrfs_trans_handle *trans, static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 parent, u64 root_objectid, u64 ref_generation, - u64 owner, struct btrfs_key *ins) + u64 owner, struct btrfs_key *ins, + int ref_mod) { int ret; - int pending_ret; u64 super_used; u64 root_used; u64 num_bytes = ins->offset; @@ -3503,33 +2817,6 @@ static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans, btrfs_set_root_used(&root->root_item, root_used + num_bytes); spin_unlock(&info->delalloc_lock); - if (root == extent_root) { - struct pending_extent_op *extent_op; - - extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS); - BUG_ON(!extent_op); - - extent_op->type = PENDING_EXTENT_INSERT; - extent_op->bytenr = ins->objectid; - extent_op->num_bytes = ins->offset; - extent_op->parent = parent; - extent_op->orig_parent = 0; - extent_op->generation = ref_generation; - extent_op->orig_generation = 0; - extent_op->level = (int)owner; - INIT_LIST_HEAD(&extent_op->list); - extent_op->del = 0; - - mutex_lock(&root->fs_info->extent_ins_mutex); - set_extent_bits(&root->fs_info->extent_ins, ins->objectid, - ins->objectid + ins->offset - 1, - EXTENT_WRITEBACK, GFP_NOFS); - set_state_private(&root->fs_info->extent_ins, - ins->objectid, (unsigned long)extent_op); - mutex_unlock(&root->fs_info->extent_ins_mutex); - goto update_block; - } - memcpy(&keys[0], ins, sizeof(*ins)); keys[1].objectid = ins->objectid; keys[1].type = BTRFS_EXTENT_REF_KEY; @@ -3546,31 +2833,24 @@ static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans, extent_item = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_extent_item); - btrfs_set_extent_refs(path->nodes[0], extent_item, 1); + btrfs_set_extent_refs(path->nodes[0], extent_item, ref_mod); ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1, struct btrfs_extent_ref); btrfs_set_ref_root(path->nodes[0], ref, root_objectid); btrfs_set_ref_generation(path->nodes[0], ref, ref_generation); btrfs_set_ref_objectid(path->nodes[0], ref, owner); - btrfs_set_ref_num_refs(path->nodes[0], ref, 1); + btrfs_set_ref_num_refs(path->nodes[0], ref, ref_mod); btrfs_mark_buffer_dirty(path->nodes[0]); trans->alloc_exclude_start = 0; trans->alloc_exclude_nr = 0; btrfs_free_path(path); - finish_current_insert(trans, extent_root, 0); - pending_ret = del_pending_extents(trans, extent_root, 0); if (ret) goto out; - if (pending_ret) { - ret = pending_ret; - goto out; - } -update_block: ret = update_block_group(trans, root, ins->objectid, ins->offset, 1, 0); if (ret) { @@ -3592,9 +2872,12 @@ int btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans, if (root_objectid == BTRFS_TREE_LOG_OBJECTID) return 0; - ret = __btrfs_alloc_reserved_extent(trans, root, parent, root_objectid, - ref_generation, owner, ins); - update_reserved_extents(root, ins->objectid, ins->offset, 0); + + ret = btrfs_add_delayed_ref(trans, ins->objectid, + ins->offset, parent, root_objectid, + ref_generation, owner, + BTRFS_ADD_DELAYED_EXTENT, 0); + BUG_ON(ret); return ret; } @@ -3621,7 +2904,7 @@ int btrfs_alloc_logged_extent(struct btrfs_trans_handle *trans, BUG_ON(ret); put_block_group(block_group); ret = __btrfs_alloc_reserved_extent(trans, root, parent, root_objectid, - ref_generation, owner, ins); + ref_generation, owner, ins, 1); return ret; } @@ -3640,20 +2923,18 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans, u64 search_end, struct btrfs_key *ins, u64 data) { int ret; - ret = __btrfs_reserve_extent(trans, root, num_bytes, min_alloc_size, empty_size, hint_byte, search_end, ins, data); BUG_ON(ret); if (root_objectid != BTRFS_TREE_LOG_OBJECTID) { - ret = __btrfs_alloc_reserved_extent(trans, root, parent, - root_objectid, ref_generation, - owner_objectid, ins); + ret = btrfs_add_delayed_ref(trans, ins->objectid, + ins->offset, parent, root_objectid, + ref_generation, owner_objectid, + BTRFS_ADD_DELAYED_EXTENT, 0); BUG_ON(ret); - - } else { - update_reserved_extents(root, ins->objectid, ins->offset, 1); } + update_reserved_extents(root, ins->objectid, ins->offset, 1); return ret; } @@ -3789,7 +3070,7 @@ int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans, fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); - ret = __btrfs_free_extent(trans, root, disk_bytenr, + ret = btrfs_free_extent(trans, root, disk_bytenr, btrfs_file_extent_disk_num_bytes(leaf, fi), leaf->start, leaf_owner, leaf_generation, key.objectid, 0); @@ -3829,7 +3110,7 @@ static noinline int cache_drop_leaf_ref(struct btrfs_trans_handle *trans, */ for (i = 0; i < ref->nritems; i++) { info = ref->extents + sorted[i].slot; - ret = __btrfs_free_extent(trans, root, info->bytenr, + ret = btrfs_free_extent(trans, root, info->bytenr, info->num_bytes, ref->bytenr, ref->owner, ref->generation, info->objectid, 0); @@ -3846,12 +3127,13 @@ static noinline int cache_drop_leaf_ref(struct btrfs_trans_handle *trans, return 0; } -static int drop_snap_lookup_refcount(struct btrfs_root *root, u64 start, +static int drop_snap_lookup_refcount(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 start, u64 len, u32 *refs) { int ret; - ret = btrfs_lookup_extent_ref(NULL, root, start, len, refs); + ret = btrfs_lookup_extent_ref(trans, root, start, len, refs); BUG_ON(ret); #if 0 /* some debugging code in case we see problems here */ @@ -3959,7 +3241,8 @@ static noinline int drop_level_one_refs(struct btrfs_trans_handle *trans, * we just decrement it below and don't update any * of the refs the leaf points to. */ - ret = drop_snap_lookup_refcount(root, bytenr, blocksize, &refs); + ret = drop_snap_lookup_refcount(trans, root, bytenr, + blocksize, &refs); BUG_ON(ret); if (refs != 1) continue; @@ -4010,7 +3293,7 @@ static noinline int drop_level_one_refs(struct btrfs_trans_handle *trans, */ for (i = 0; i < refi; i++) { bytenr = sorted[i].bytenr; - ret = __btrfs_free_extent(trans, root, bytenr, + ret = btrfs_free_extent(trans, root, bytenr, blocksize, eb->start, root_owner, root_gen, 0, 1); BUG_ON(ret); @@ -4053,7 +3336,7 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans, WARN_ON(*level < 0); WARN_ON(*level >= BTRFS_MAX_LEVEL); - ret = drop_snap_lookup_refcount(root, path->nodes[*level]->start, + ret = drop_snap_lookup_refcount(trans, root, path->nodes[*level]->start, path->nodes[*level]->len, &refs); BUG_ON(ret); if (refs > 1) @@ -4104,7 +3387,8 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans, ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]); blocksize = btrfs_level_size(root, *level - 1); - ret = drop_snap_lookup_refcount(root, bytenr, blocksize, &refs); + ret = drop_snap_lookup_refcount(trans, root, bytenr, + blocksize, &refs); BUG_ON(ret); /* @@ -4119,7 +3403,7 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans, root_gen = btrfs_header_generation(parent); path->slots[*level]++; - ret = __btrfs_free_extent(trans, root, bytenr, + ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent->start, root_owner, root_gen, *level - 1, 1); @@ -4165,7 +3449,7 @@ out: * cleanup and free the reference on the last node * we processed */ - ret = __btrfs_free_extent(trans, root, bytenr, blocksize, + ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent->start, root_owner, root_gen, *level, 1); free_extent_buffer(path->nodes[*level]); @@ -5457,6 +4741,7 @@ static noinline int replace_extents_in_leaf(struct btrfs_trans_handle *trans, root->root_key.objectid, trans->transid, key.objectid); BUG_ON(ret); + ret = btrfs_free_extent(trans, root, bytenr, num_bytes, leaf->start, btrfs_header_owner(leaf), @@ -5768,9 +5053,6 @@ static noinline int relocate_tree_block(struct btrfs_trans_handle *trans, ref_path, NULL, NULL); BUG_ON(ret); - if (root == root->fs_info->extent_root) - btrfs_extent_post_op(trans, root); - return 0; } @@ -6208,6 +5490,9 @@ again: btrfs_remove_leaf_refs(info->tree_root, (u64)-1, 1); mutex_unlock(&root->fs_info->cleaner_mutex); + trans = btrfs_start_transaction(info->tree_root, 1); + btrfs_commit_transaction(trans, info->tree_root); + while (1) { ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) @@ -6500,9 +5785,6 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, sizeof(cache->item)); BUG_ON(ret); - finish_current_insert(trans, extent_root, 0); - ret = del_pending_extents(trans, extent_root, 0); - BUG_ON(ret); set_avail_alloc_bits(extent_root->fs_info, type); return 0; diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index dc78954861b3..c80075497645 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -643,7 +643,9 @@ next_slot: if (disk_bytenr != 0) { ret = btrfs_update_extent_ref(trans, root, - disk_bytenr, orig_parent, + disk_bytenr, + le64_to_cpu(old.disk_num_bytes), + orig_parent, leaf->start, root->root_key.objectid, trans->transid, ins.objectid); @@ -912,7 +914,7 @@ again: btrfs_set_file_extent_other_encoding(leaf, fi, 0); if (orig_parent != leaf->start) { - ret = btrfs_update_extent_ref(trans, root, bytenr, + ret = btrfs_update_extent_ref(trans, root, bytenr, num_bytes, orig_parent, leaf->start, root->root_key.objectid, trans->transid, inode->i_ino); diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index d638c54d39e9..f94c2ad8996c 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -65,6 +65,12 @@ static noinline int join_transaction(struct btrfs_root *root) cur_trans->use_count = 1; cur_trans->commit_done = 0; cur_trans->start_time = get_seconds(); + + cur_trans->delayed_refs.root.rb_node = NULL; + cur_trans->delayed_refs.num_entries = 0; + cur_trans->delayed_refs.flushing = 0; + spin_lock_init(&cur_trans->delayed_refs.lock); + INIT_LIST_HEAD(&cur_trans->pending_snapshots); list_add_tail(&cur_trans->list, &root->fs_info->trans_list); extent_io_tree_init(&cur_trans->dirty_pages, @@ -182,6 +188,7 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root, h->block_group = 0; h->alloc_exclude_nr = 0; h->alloc_exclude_start = 0; + h->delayed_ref_updates = 0; root->fs_info->running_transaction->use_count++; mutex_unlock(&root->fs_info->trans_mutex); return h; @@ -281,6 +288,14 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, struct btrfs_transaction *cur_trans; struct btrfs_fs_info *info = root->fs_info; + if (trans->delayed_ref_updates && + (trans->transaction->delayed_refs.flushing || + trans->transaction->delayed_refs.num_entries > 16384)) { + btrfs_run_delayed_refs(trans, root, trans->delayed_ref_updates); + } else if (trans->transaction->delayed_refs.num_entries > 64) { + wake_up_process(root->fs_info->transaction_kthread); + } + mutex_lock(&info->trans_mutex); cur_trans = info->running_transaction; WARN_ON(cur_trans != trans->transaction); @@ -424,9 +439,10 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans, u64 old_root_bytenr; struct btrfs_root *tree_root = root->fs_info->tree_root; - btrfs_extent_post_op(trans, root); btrfs_write_dirty_block_groups(trans, root); - btrfs_extent_post_op(trans, root); + + ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); + BUG_ON(ret); while (1) { old_root_bytenr = btrfs_root_bytenr(&root->root_item); @@ -438,14 +454,14 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans, btrfs_header_level(root->node)); btrfs_set_root_generation(&root->root_item, trans->transid); - btrfs_extent_post_op(trans, root); - ret = btrfs_update_root(trans, tree_root, &root->root_key, &root->root_item); BUG_ON(ret); btrfs_write_dirty_block_groups(trans, root); - btrfs_extent_post_op(trans, root); + + ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); + BUG_ON(ret); } return 0; } @@ -459,15 +475,18 @@ int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info = root->fs_info; struct list_head *next; struct extent_buffer *eb; + int ret; - btrfs_extent_post_op(trans, fs_info->tree_root); + ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); + BUG_ON(ret); eb = btrfs_lock_root_node(fs_info->tree_root); btrfs_cow_block(trans, fs_info->tree_root, eb, NULL, 0, &eb); btrfs_tree_unlock(eb); free_extent_buffer(eb); - btrfs_extent_post_op(trans, fs_info->tree_root); + ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); + BUG_ON(ret); while (!list_empty(&fs_info->dirty_cowonly_roots)) { next = fs_info->dirty_cowonly_roots.next; @@ -475,6 +494,9 @@ int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans, root = list_entry(next, struct btrfs_root, dirty_list); update_cowonly_root(trans, root); + + ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); + BUG_ON(ret); } return 0; } @@ -895,6 +917,21 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, DEFINE_WAIT(wait); int ret; + /* make a pass through all the delayed refs we have so far + * any runnings procs may add more while we are here + */ + ret = btrfs_run_delayed_refs(trans, root, 0); + BUG_ON(ret); + + /* + * set the flushing flag so procs in this transaction have to + * start sending their work down. + */ + trans->transaction->delayed_refs.flushing = 1; + + ret = btrfs_run_delayed_refs(trans, root, (u64)-1); + BUG_ON(ret); + INIT_LIST_HEAD(&dirty_fs_roots); mutex_lock(&root->fs_info->trans_mutex); if (trans->transaction->in_commit) { @@ -969,6 +1006,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, ret = create_pending_snapshots(trans, root->fs_info); BUG_ON(ret); + ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); + BUG_ON(ret); + WARN_ON(cur_trans != trans->transaction); /* btrfs_commit_tree_roots is responsible for getting the diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index ea292117f882..94876709217f 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -19,6 +19,7 @@ #ifndef __BTRFS_TRANSACTION__ #define __BTRFS_TRANSACTION__ #include "btrfs_inode.h" +#include "delayed-ref.h" struct btrfs_transaction { u64 transid; @@ -34,6 +35,7 @@ struct btrfs_transaction { wait_queue_head_t writer_wait; wait_queue_head_t commit_wait; struct list_head pending_snapshots; + struct btrfs_delayed_ref_root delayed_refs; }; struct btrfs_trans_handle { @@ -44,6 +46,7 @@ struct btrfs_trans_handle { u64 block_group; u64 alloc_exclude_start; u64 alloc_exclude_nr; + unsigned long delayed_ref_updates; }; struct btrfs_pending_snapshot { diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c index 98d25fa4570e..b10eacdb1620 100644 --- a/fs/btrfs/tree-defrag.c +++ b/fs/btrfs/tree-defrag.c @@ -124,8 +124,6 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, } btrfs_release_path(root, path); - if (is_extent) - btrfs_extent_post_op(trans, root); out: if (path) btrfs_free_path(path); From 44871b1b24b593996db43495cf4484cc580bdc10 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Fri, 13 Mar 2009 10:04:31 -0400 Subject: [PATCH 147/478] Btrfs: reduce stack usage in some crucial tree balancing functions Many of the tree balancing functions follow the same pattern. 1) cow a block 2) do something to the result This commit breaks them up into two functions so the variables and code required for part two don't suck down stack during part one. Signed-off-by: Chris Mason --- fs/btrfs/ctree.c | 576 +++++++++++++++++++++++++++-------------------- 1 file changed, 337 insertions(+), 239 deletions(-) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index bebc9fd16665..3764248bdc05 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -2266,66 +2266,27 @@ noinline int btrfs_leaf_free_space(struct btrfs_root *root, return ret; } -/* - * push some data in the path leaf to the right, trying to free up at - * least data_size bytes. returns zero if the push worked, nonzero otherwise - * - * returns 1 if the push failed because the other node didn't have enough - * room, 0 if everything worked out and < 0 if there were major errors. - */ -static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root - *root, struct btrfs_path *path, int data_size, - int empty) +static noinline int __push_leaf_right(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + int data_size, int empty, + struct extent_buffer *right, + int free_space, u32 left_nritems) { struct extent_buffer *left = path->nodes[0]; - struct extent_buffer *right; - struct extent_buffer *upper; + struct extent_buffer *upper = path->nodes[1]; struct btrfs_disk_key disk_key; int slot; u32 i; - int free_space; int push_space = 0; int push_items = 0; struct btrfs_item *item; - u32 left_nritems; u32 nr; u32 right_nritems; u32 data_end; u32 this_item_size; int ret; - slot = path->slots[1]; - if (!path->nodes[1]) - return 1; - - upper = path->nodes[1]; - if (slot >= btrfs_header_nritems(upper) - 1) - return 1; - - btrfs_assert_tree_locked(path->nodes[1]); - - right = read_node_slot(root, upper, slot + 1); - btrfs_tree_lock(right); - btrfs_set_lock_blocking(right); - - free_space = btrfs_leaf_free_space(root, right); - if (free_space < data_size) - goto out_unlock; - - /* cow and double check */ - ret = btrfs_cow_block(trans, root, right, upper, - slot + 1, &right); - if (ret) - goto out_unlock; - - free_space = btrfs_leaf_free_space(root, right); - if (free_space < data_size) - goto out_unlock; - - left_nritems = btrfs_header_nritems(left); - if (left_nritems == 0) - goto out_unlock; - if (empty) nr = 0; else @@ -2334,6 +2295,7 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root if (path->slots[0] >= left_nritems) push_space += data_size; + slot = path->slots[1]; i = left_nritems - 1; while (i >= nr) { item = btrfs_item_nr(left, i); @@ -2464,25 +2426,83 @@ out_unlock: return 1; } +/* + * push some data in the path leaf to the right, trying to free up at + * least data_size bytes. returns zero if the push worked, nonzero otherwise + * + * returns 1 if the push failed because the other node didn't have enough + * room, 0 if everything worked out and < 0 if there were major errors. + */ +static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_path *path, int data_size, + int empty) +{ + struct extent_buffer *left = path->nodes[0]; + struct extent_buffer *right; + struct extent_buffer *upper; + int slot; + int free_space; + u32 left_nritems; + int ret; + + if (!path->nodes[1]) + return 1; + + slot = path->slots[1]; + upper = path->nodes[1]; + if (slot >= btrfs_header_nritems(upper) - 1) + return 1; + + btrfs_assert_tree_locked(path->nodes[1]); + + right = read_node_slot(root, upper, slot + 1); + btrfs_tree_lock(right); + btrfs_set_lock_blocking(right); + + free_space = btrfs_leaf_free_space(root, right); + if (free_space < data_size) + goto out_unlock; + + /* cow and double check */ + ret = btrfs_cow_block(trans, root, right, upper, + slot + 1, &right); + if (ret) + goto out_unlock; + + free_space = btrfs_leaf_free_space(root, right); + if (free_space < data_size) + goto out_unlock; + + left_nritems = btrfs_header_nritems(left); + if (left_nritems == 0) + goto out_unlock; + + return __push_leaf_right(trans, root, path, data_size, empty, + right, free_space, left_nritems); +out_unlock: + btrfs_tree_unlock(right); + free_extent_buffer(right); + return 1; +} + /* * push some data in the path leaf to the left, trying to free up at * least data_size bytes. returns zero if the push worked, nonzero otherwise */ -static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root - *root, struct btrfs_path *path, int data_size, - int empty) +static noinline int __push_leaf_left(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, int data_size, + int empty, struct extent_buffer *left, + int free_space, int right_nritems) { struct btrfs_disk_key disk_key; struct extent_buffer *right = path->nodes[0]; - struct extent_buffer *left; int slot; int i; - int free_space; int push_space = 0; int push_items = 0; struct btrfs_item *item; u32 old_left_nritems; - u32 right_nritems; u32 nr; int ret = 0; int wret; @@ -2490,41 +2510,6 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root u32 old_left_item_size; slot = path->slots[1]; - if (slot == 0) - return 1; - if (!path->nodes[1]) - return 1; - - right_nritems = btrfs_header_nritems(right); - if (right_nritems == 0) - return 1; - - btrfs_assert_tree_locked(path->nodes[1]); - - left = read_node_slot(root, path->nodes[1], slot - 1); - btrfs_tree_lock(left); - btrfs_set_lock_blocking(left); - - free_space = btrfs_leaf_free_space(root, left); - if (free_space < data_size) { - ret = 1; - goto out; - } - - /* cow and double check */ - ret = btrfs_cow_block(trans, root, left, - path->nodes[1], slot - 1, &left); - if (ret) { - /* we hit -ENOSPC, but it isn't fatal here */ - ret = 1; - goto out; - } - - free_space = btrfs_leaf_free_space(root, left); - if (free_space < data_size) { - ret = 1; - goto out; - } if (empty) nr = right_nritems; @@ -2691,146 +2676,86 @@ out: return ret; } +/* + * push some data in the path leaf to the left, trying to free up at + * least data_size bytes. returns zero if the push worked, nonzero otherwise + */ +static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_path *path, int data_size, + int empty) +{ + struct extent_buffer *right = path->nodes[0]; + struct extent_buffer *left; + int slot; + int free_space; + u32 right_nritems; + int ret = 0; + + slot = path->slots[1]; + if (slot == 0) + return 1; + if (!path->nodes[1]) + return 1; + + right_nritems = btrfs_header_nritems(right); + if (right_nritems == 0) + return 1; + + btrfs_assert_tree_locked(path->nodes[1]); + + left = read_node_slot(root, path->nodes[1], slot - 1); + btrfs_tree_lock(left); + btrfs_set_lock_blocking(left); + + free_space = btrfs_leaf_free_space(root, left); + if (free_space < data_size) { + ret = 1; + goto out; + } + + /* cow and double check */ + ret = btrfs_cow_block(trans, root, left, + path->nodes[1], slot - 1, &left); + if (ret) { + /* we hit -ENOSPC, but it isn't fatal here */ + ret = 1; + goto out; + } + + free_space = btrfs_leaf_free_space(root, left); + if (free_space < data_size) { + ret = 1; + goto out; + } + + return __push_leaf_left(trans, root, path, data_size, + empty, left, free_space, right_nritems); +out: + btrfs_tree_unlock(left); + free_extent_buffer(left); + return ret; +} + /* * split the path's leaf in two, making sure there is at least data_size * available for the resulting leaf level of the path. * * returns 0 if all went well and < 0 on failure. */ -static noinline int split_leaf(struct btrfs_trans_handle *trans, +static noinline int copy_for_split(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct btrfs_key *ins_key, - struct btrfs_path *path, int data_size, - int extend) + struct btrfs_path *path, + struct extent_buffer *l, + struct extent_buffer *right, + int slot, int mid, int nritems) { - struct extent_buffer *l; - u32 nritems; - int mid; - int slot; - struct extent_buffer *right; int data_copy_size; int rt_data_off; int i; int ret = 0; int wret; - int double_split; - int num_doubles = 0; struct btrfs_disk_key disk_key; - /* first try to make some room by pushing left and right */ - if (data_size && ins_key->type != BTRFS_DIR_ITEM_KEY) { - wret = push_leaf_right(trans, root, path, data_size, 0); - if (wret < 0) - return wret; - if (wret) { - wret = push_leaf_left(trans, root, path, data_size, 0); - if (wret < 0) - return wret; - } - l = path->nodes[0]; - - /* did the pushes work? */ - if (btrfs_leaf_free_space(root, l) >= data_size) - return 0; - } - - if (!path->nodes[1]) { - ret = insert_new_root(trans, root, path, 1); - if (ret) - return ret; - } -again: - double_split = 0; - l = path->nodes[0]; - slot = path->slots[0]; - nritems = btrfs_header_nritems(l); - mid = (nritems + 1) / 2; - - right = btrfs_alloc_free_block(trans, root, root->leafsize, - path->nodes[1]->start, - root->root_key.objectid, - trans->transid, 0, l->start, 0); - if (IS_ERR(right)) { - BUG_ON(1); - return PTR_ERR(right); - } - - memset_extent_buffer(right, 0, 0, sizeof(struct btrfs_header)); - btrfs_set_header_bytenr(right, right->start); - btrfs_set_header_generation(right, trans->transid); - btrfs_set_header_owner(right, root->root_key.objectid); - btrfs_set_header_level(right, 0); - write_extent_buffer(right, root->fs_info->fsid, - (unsigned long)btrfs_header_fsid(right), - BTRFS_FSID_SIZE); - - write_extent_buffer(right, root->fs_info->chunk_tree_uuid, - (unsigned long)btrfs_header_chunk_tree_uuid(right), - BTRFS_UUID_SIZE); - if (mid <= slot) { - if (nritems == 1 || - leaf_space_used(l, mid, nritems - mid) + data_size > - BTRFS_LEAF_DATA_SIZE(root)) { - if (slot >= nritems) { - btrfs_cpu_key_to_disk(&disk_key, ins_key); - btrfs_set_header_nritems(right, 0); - wret = insert_ptr(trans, root, path, - &disk_key, right->start, - path->slots[1] + 1, 1); - if (wret) - ret = wret; - - btrfs_tree_unlock(path->nodes[0]); - free_extent_buffer(path->nodes[0]); - path->nodes[0] = right; - path->slots[0] = 0; - path->slots[1] += 1; - btrfs_mark_buffer_dirty(right); - return ret; - } - mid = slot; - if (mid != nritems && - leaf_space_used(l, mid, nritems - mid) + - data_size > BTRFS_LEAF_DATA_SIZE(root)) { - double_split = 1; - } - } - } else { - if (leaf_space_used(l, 0, mid) + data_size > - BTRFS_LEAF_DATA_SIZE(root)) { - if (!extend && data_size && slot == 0) { - btrfs_cpu_key_to_disk(&disk_key, ins_key); - btrfs_set_header_nritems(right, 0); - wret = insert_ptr(trans, root, path, - &disk_key, - right->start, - path->slots[1], 1); - if (wret) - ret = wret; - btrfs_tree_unlock(path->nodes[0]); - free_extent_buffer(path->nodes[0]); - path->nodes[0] = right; - path->slots[0] = 0; - if (path->slots[1] == 0) { - wret = fixup_low_keys(trans, root, - path, &disk_key, 1); - if (wret) - ret = wret; - } - btrfs_mark_buffer_dirty(right); - return ret; - } else if ((extend || !data_size) && slot == 0) { - mid = 1; - } else { - mid = slot; - if (mid != nritems && - leaf_space_used(l, mid, nritems - mid) + - data_size > BTRFS_LEAF_DATA_SIZE(root)) { - double_split = 1; - } - } - } - } nritems = nritems - mid; btrfs_set_header_nritems(right, nritems); data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(root, l); @@ -2896,11 +2821,160 @@ again: BUG_ON(path->slots[0] < 0); + return ret; +} + +/* + * split the path's leaf in two, making sure there is at least data_size + * available for the resulting leaf level of the path. + * + * returns 0 if all went well and < 0 on failure. + */ +static noinline int split_leaf(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_key *ins_key, + struct btrfs_path *path, int data_size, + int extend) +{ + struct extent_buffer *l; + u32 nritems; + int mid; + int slot; + struct extent_buffer *right; + int ret = 0; + int wret; + int double_split; + int num_doubles = 0; + + /* first try to make some room by pushing left and right */ + if (data_size && ins_key->type != BTRFS_DIR_ITEM_KEY) { + wret = push_leaf_right(trans, root, path, data_size, 0); + if (wret < 0) + return wret; + if (wret) { + wret = push_leaf_left(trans, root, path, data_size, 0); + if (wret < 0) + return wret; + } + l = path->nodes[0]; + + /* did the pushes work? */ + if (btrfs_leaf_free_space(root, l) >= data_size) + return 0; + } + + if (!path->nodes[1]) { + ret = insert_new_root(trans, root, path, 1); + if (ret) + return ret; + } +again: + double_split = 0; + l = path->nodes[0]; + slot = path->slots[0]; + nritems = btrfs_header_nritems(l); + mid = (nritems + 1) / 2; + + right = btrfs_alloc_free_block(trans, root, root->leafsize, + path->nodes[1]->start, + root->root_key.objectid, + trans->transid, 0, l->start, 0); + if (IS_ERR(right)) { + BUG_ON(1); + return PTR_ERR(right); + } + + memset_extent_buffer(right, 0, 0, sizeof(struct btrfs_header)); + btrfs_set_header_bytenr(right, right->start); + btrfs_set_header_generation(right, trans->transid); + btrfs_set_header_owner(right, root->root_key.objectid); + btrfs_set_header_level(right, 0); + write_extent_buffer(right, root->fs_info->fsid, + (unsigned long)btrfs_header_fsid(right), + BTRFS_FSID_SIZE); + + write_extent_buffer(right, root->fs_info->chunk_tree_uuid, + (unsigned long)btrfs_header_chunk_tree_uuid(right), + BTRFS_UUID_SIZE); + + if (mid <= slot) { + if (nritems == 1 || + leaf_space_used(l, mid, nritems - mid) + data_size > + BTRFS_LEAF_DATA_SIZE(root)) { + if (slot >= nritems) { + struct btrfs_disk_key disk_key; + + btrfs_cpu_key_to_disk(&disk_key, ins_key); + btrfs_set_header_nritems(right, 0); + wret = insert_ptr(trans, root, path, + &disk_key, right->start, + path->slots[1] + 1, 1); + if (wret) + ret = wret; + + btrfs_tree_unlock(path->nodes[0]); + free_extent_buffer(path->nodes[0]); + path->nodes[0] = right; + path->slots[0] = 0; + path->slots[1] += 1; + btrfs_mark_buffer_dirty(right); + return ret; + } + mid = slot; + if (mid != nritems && + leaf_space_used(l, mid, nritems - mid) + + data_size > BTRFS_LEAF_DATA_SIZE(root)) { + double_split = 1; + } + } + } else { + if (leaf_space_used(l, 0, mid) + data_size > + BTRFS_LEAF_DATA_SIZE(root)) { + if (!extend && data_size && slot == 0) { + struct btrfs_disk_key disk_key; + + btrfs_cpu_key_to_disk(&disk_key, ins_key); + btrfs_set_header_nritems(right, 0); + wret = insert_ptr(trans, root, path, + &disk_key, + right->start, + path->slots[1], 1); + if (wret) + ret = wret; + btrfs_tree_unlock(path->nodes[0]); + free_extent_buffer(path->nodes[0]); + path->nodes[0] = right; + path->slots[0] = 0; + if (path->slots[1] == 0) { + wret = fixup_low_keys(trans, root, + path, &disk_key, 1); + if (wret) + ret = wret; + } + btrfs_mark_buffer_dirty(right); + return ret; + } else if ((extend || !data_size) && slot == 0) { + mid = 1; + } else { + mid = slot; + if (mid != nritems && + leaf_space_used(l, mid, nritems - mid) + + data_size > BTRFS_LEAF_DATA_SIZE(root)) { + double_split = 1; + } + } + } + } + + ret = copy_for_split(trans, root, path, l, right, slot, mid, nritems); + BUG_ON(ret); + if (double_split) { BUG_ON(num_doubles != 0); num_doubles++; goto again; } + return ret; } @@ -3382,39 +3456,27 @@ out: } /* - * Given a key and some data, insert items into the tree. - * This does all the path init required, making room in the tree if needed. + * this is a helper for btrfs_insert_empty_items, the main goal here is + * to save stack depth by doing the bulk of the work in a function + * that doesn't call btrfs_search_slot */ -int btrfs_insert_empty_items(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - struct btrfs_key *cpu_key, u32 *data_size, - int nr) +static noinline_for_stack int +setup_items_for_insert(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_path *path, + struct btrfs_key *cpu_key, u32 *data_size, + u32 total_data, u32 total_size, int nr) { - struct extent_buffer *leaf; struct btrfs_item *item; - int ret = 0; - int slot; - int slot_orig; int i; u32 nritems; - u32 total_size = 0; - u32 total_data = 0; unsigned int data_end; struct btrfs_disk_key disk_key; + int ret; + struct extent_buffer *leaf; + int slot; - for (i = 0; i < nr; i++) - total_data += data_size[i]; - - total_size = total_data + (nr * sizeof(struct btrfs_item)); - ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1); - if (ret == 0) - return -EEXIST; - if (ret < 0) - goto out; - - slot_orig = path->slots[0]; leaf = path->nodes[0]; + slot = path->slots[0]; nritems = btrfs_header_nritems(leaf); data_end = leaf_data_end(root, leaf); @@ -3426,9 +3488,6 @@ int btrfs_insert_empty_items(struct btrfs_trans_handle *trans, BUG(); } - slot = path->slots[0]; - BUG_ON(slot < 0); - if (slot != nritems) { unsigned int old_data = btrfs_item_end_nr(leaf, slot); @@ -3484,11 +3543,13 @@ int btrfs_insert_empty_items(struct btrfs_trans_handle *trans, data_end -= data_size[i]; btrfs_set_item_size(leaf, item, data_size[i]); } + btrfs_set_header_nritems(leaf, nritems + nr); btrfs_mark_buffer_dirty(leaf); ret = 0; if (slot == 0) { + struct btrfs_disk_key disk_key; btrfs_cpu_key_to_disk(&disk_key, cpu_key); ret = fixup_low_keys(trans, root, path, &disk_key, 1); } @@ -3497,6 +3558,43 @@ int btrfs_insert_empty_items(struct btrfs_trans_handle *trans, btrfs_print_leaf(root, leaf); BUG(); } + return ret; +} + +/* + * Given a key and some data, insert items into the tree. + * This does all the path init required, making room in the tree if needed. + */ +int btrfs_insert_empty_items(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_key *cpu_key, u32 *data_size, + int nr) +{ + struct extent_buffer *leaf; + int ret = 0; + int slot; + int i; + u32 total_size = 0; + u32 total_data = 0; + + for (i = 0; i < nr; i++) + total_data += data_size[i]; + + total_size = total_data + (nr * sizeof(struct btrfs_item)); + ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1); + if (ret == 0) + return -EEXIST; + if (ret < 0) + goto out; + + leaf = path->nodes[0]; + slot = path->slots[0]; + BUG_ON(slot < 0); + + ret = setup_items_for_insert(trans, root, path, cpu_key, data_size, + total_data, total_size, nr); + out: btrfs_unlock_up_safe(path, 1); return ret; From 1887be66dcc3140a81d1299958a41fc0eedfa64f Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Fri, 13 Mar 2009 10:11:24 -0400 Subject: [PATCH 148/478] Btrfs: try to cleanup delayed refs while freeing extents When extents are freed, it is likely that we've removed the last delayed reference update for the extent. This checks the delayed ref tree when things are freed, and if no ref updates area left it immediately processes the delayed ref. Signed-off-by: Chris Mason --- fs/btrfs/delayed-ref.c | 18 ++++++++++++ fs/btrfs/delayed-ref.h | 5 ++-- fs/btrfs/extent-tree.c | 67 ++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 87 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 874565a1f634..3e7eeaf86408 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -510,6 +510,24 @@ int btrfs_add_delayed_ref(struct btrfs_trans_handle *trans, return 0; } +/* + * this does a simple search for the head node for a given extent. + * It must be called with the delayed ref spinlock held, and it returns + * the head node if any where found, or NULL if not. + */ +struct btrfs_delayed_ref_head * +btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr) +{ + struct btrfs_delayed_ref_node *ref; + struct btrfs_delayed_ref_root *delayed_refs; + + delayed_refs = &trans->transaction->delayed_refs; + ref = tree_search(&delayed_refs->root, bytenr, (u64)-1); + if (ref) + return btrfs_delayed_node_to_head(ref); + return NULL; +} + /* * add a delayed ref to the tree. This does all of the accounting required * to make sure the delayed ref is eventually processed before this diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index 37919e5c007f..c345fee9f96b 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -137,9 +137,8 @@ int btrfs_add_delayed_ref(struct btrfs_trans_handle *trans, u64 ref_generation, u64 owner_objectid, int action, int pin); -struct btrfs_delayed_ref * -btrfs_find_delayed_ref(struct btrfs_trans_handle *trans, u64 bytenr, - u64 parent); +struct btrfs_delayed_ref_head * +btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr); int btrfs_delayed_ref_pending(struct btrfs_trans_handle *trans, u64 bytenr); int btrfs_lock_delayed_ref(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_node *ref, diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 9b5da2b013e4..8471c79b0877 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -1021,6 +1021,7 @@ again: if (!locked_ref && count == 0) break; + cond_resched(); spin_lock(&delayed_refs->lock); } if (run_all) { @@ -1045,6 +1046,7 @@ again: mutex_unlock(&head->mutex); btrfs_put_delayed_ref(ref); + cond_resched(); goto again; } node = rb_next(node); @@ -2361,6 +2363,68 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, owner_objectid, pin, pin == 0, refs_to_drop); } +/* + * when we free an extent, it is possible (and likely) that we free the last + * delayed ref for that extent as well. This searches the delayed ref tree for + * a given extent, and if there are no other delayed refs to be processed, it + * removes it from the tree. + */ +static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 bytenr) +{ + struct btrfs_delayed_ref_head *head; + struct btrfs_delayed_ref_root *delayed_refs; + struct btrfs_delayed_ref_node *ref; + struct rb_node *node; + int ret; + + delayed_refs = &trans->transaction->delayed_refs; + spin_lock(&delayed_refs->lock); + head = btrfs_find_delayed_ref_head(trans, bytenr); + if (!head) + goto out; + + node = rb_prev(&head->node.rb_node); + if (!node) + goto out; + + ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); + + /* there are still entries for this ref, we can't drop it */ + if (ref->bytenr == bytenr) + goto out; + + /* + * waiting for the lock here would deadlock. If someone else has it + * locked they are already in the process of dropping it anyway + */ + if (!mutex_trylock(&head->mutex)) + goto out; + + /* + * at this point we have a head with no other entries. Go + * ahead and process it. + */ + head->node.in_tree = 0; + rb_erase(&head->node.rb_node, &delayed_refs->root); + delayed_refs->num_entries--; + + /* + * we don't take a ref on the node because we're removing it from the + * tree, so we just steal the ref the tree was holding. + */ + spin_unlock(&delayed_refs->lock); + + ret = run_one_delayed_ref(trans, root->fs_info->tree_root, + &head->node, head->must_insert_reserved); + BUG_ON(ret); + btrfs_put_delayed_ref(&head->node); + return 0; +out: + spin_unlock(&delayed_refs->lock); + return 0; +} + int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 parent, @@ -2388,6 +2452,9 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, root_objectid, ref_generation, owner_objectid, BTRFS_DROP_DELAYED_REF, 1); + BUG_ON(ret); + ret = check_ref_cleanup(trans, root, bytenr); + BUG_ON(ret); } return ret; } From c3e69d58e86c3917ae4e9e31b4acf490a7cafe60 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Fri, 13 Mar 2009 10:17:05 -0400 Subject: [PATCH 149/478] Btrfs: process the delayed reference queue in clusters The delayed reference queue maintains pending operations that need to be done to the extent allocation tree. These are processed by finding records in the tree that are not currently being processed one at a time. This is slow because it uses lots of time searching through the rbtree and because it creates lock contention on the extent allocation tree when lots of different procs are running delayed refs at the same time. This commit changes things to grab a cluster of refs for processing, using a cursor into the rbtree as the starting point of the next search. This way we walk smoothly through the rbtree. Signed-off-by: Chris Mason --- fs/btrfs/ctree.h | 1 + fs/btrfs/delayed-ref.c | 134 +++++++++++++++++++++++++---------- fs/btrfs/delayed-ref.h | 17 ++++- fs/btrfs/disk-io.c | 17 ----- fs/btrfs/extent-tree.c | 155 +++++++++++++++++++++++++++-------------- fs/btrfs/transaction.c | 23 ++++-- 6 files changed, 230 insertions(+), 117 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index ced5fd85dc36..08d9f8d15538 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -720,6 +720,7 @@ struct btrfs_fs_info { struct mutex drop_mutex; struct mutex volume_mutex; struct mutex tree_reloc_mutex; + struct list_head trans_list; struct list_head hashers; struct list_head dead_roots; diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 3e7eeaf86408..759fa247ced8 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -93,7 +93,8 @@ static struct btrfs_delayed_ref_node *tree_insert(struct rb_root *root, * ref if it was able to find one, or NULL if nothing was in that spot */ static struct btrfs_delayed_ref_node *tree_search(struct rb_root *root, - u64 bytenr, u64 parent) + u64 bytenr, u64 parent, + struct btrfs_delayed_ref_node **last) { struct rb_node *n = root->rb_node; struct btrfs_delayed_ref_node *entry; @@ -102,6 +103,8 @@ static struct btrfs_delayed_ref_node *tree_search(struct rb_root *root, while (n) { entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node); WARN_ON(!entry->in_tree); + if (last) + *last = entry; cmp = comp_entry(entry, bytenr, parent); if (cmp < 0) @@ -114,45 +117,99 @@ static struct btrfs_delayed_ref_node *tree_search(struct rb_root *root, return NULL; } -/* - * Locking on delayed refs is done by taking a lock on the head node, - * which has the (impossible) parent id of (u64)-1. Once a lock is held - * on the head node, you're allowed (and required) to process all the - * delayed refs for a given byte number in the tree. - * - * This will walk forward in the rbtree until it finds a head node it - * is able to lock. It might not lock the delayed ref you asked for, - * and so it will return the one it did lock in next_ret and return 0. - * - * If no locks are taken, next_ret is set to null and 1 is returned. This - * means there are no more unlocked head nodes in the rbtree. - */ -int btrfs_lock_delayed_ref(struct btrfs_trans_handle *trans, - struct btrfs_delayed_ref_node *ref, - struct btrfs_delayed_ref_head **next_ret) +int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans, + struct btrfs_delayed_ref_head *head) { - struct rb_node *node; - struct btrfs_delayed_ref_head *head; - int ret = 0; + struct btrfs_delayed_ref_root *delayed_refs; - while (1) { + delayed_refs = &trans->transaction->delayed_refs; + assert_spin_locked(&delayed_refs->lock); + if (mutex_trylock(&head->mutex)) + return 0; + + atomic_inc(&head->node.refs); + spin_unlock(&delayed_refs->lock); + + mutex_lock(&head->mutex); + spin_lock(&delayed_refs->lock); + if (!head->node.in_tree) { + mutex_unlock(&head->mutex); + btrfs_put_delayed_ref(&head->node); + return -EAGAIN; + } + btrfs_put_delayed_ref(&head->node); + return 0; +} + +int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans, + struct list_head *cluster, u64 start) +{ + int count = 0; + struct btrfs_delayed_ref_root *delayed_refs; + struct rb_node *node; + struct btrfs_delayed_ref_node *ref; + struct btrfs_delayed_ref_head *head; + + delayed_refs = &trans->transaction->delayed_refs; + if (start == 0) { + node = rb_first(&delayed_refs->root); + } else { + ref = NULL; + tree_search(&delayed_refs->root, start, (u64)-1, &ref); + if (ref) { + struct btrfs_delayed_ref_node *tmp; + + node = rb_prev(&ref->rb_node); + while (node) { + tmp = rb_entry(node, + struct btrfs_delayed_ref_node, + rb_node); + if (tmp->bytenr < start) + break; + ref = tmp; + node = rb_prev(&ref->rb_node); + } + node = &ref->rb_node; + } else + node = rb_first(&delayed_refs->root); + } +again: + while (node && count < 32) { + ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); if (btrfs_delayed_ref_is_head(ref)) { head = btrfs_delayed_node_to_head(ref); - if (mutex_trylock(&head->mutex)) { - *next_ret = head; - ret = 0; + if (list_empty(&head->cluster)) { + list_add_tail(&head->cluster, cluster); + delayed_refs->run_delayed_start = + head->node.bytenr; + count++; + + WARN_ON(delayed_refs->num_heads_ready == 0); + delayed_refs->num_heads_ready--; + } else if (count) { + /* the goal of the clustering is to find extents + * that are likely to end up in the same extent + * leaf on disk. So, we don't want them spread + * all over the tree. Stop now if we've hit + * a head that was already in use + */ break; } } - node = rb_next(&ref->rb_node); - if (!node) { - ret = 1; - *next_ret = NULL; - break; - } - ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); + node = rb_next(node); } - return ret; + if (count) { + return 0; + } else if (start) { + /* + * we've gone to the end of the rbtree without finding any + * clusters. start from the beginning and try again + */ + start = 0; + node = rb_first(&delayed_refs->root); + goto again; + } + return 1; } /* @@ -178,7 +235,7 @@ int btrfs_delayed_ref_pending(struct btrfs_trans_handle *trans, u64 bytenr) delayed_refs = &trans->transaction->delayed_refs; spin_lock(&delayed_refs->lock); - ref = tree_search(&delayed_refs->root, bytenr, (u64)-1); + ref = tree_search(&delayed_refs->root, bytenr, (u64)-1, NULL); if (ref) { prev_node = rb_prev(&ref->rb_node); if (!prev_node) @@ -240,7 +297,7 @@ again: } spin_lock(&delayed_refs->lock); - ref = tree_search(&delayed_refs->root, bytenr, (u64)-1); + ref = tree_search(&delayed_refs->root, bytenr, (u64)-1, NULL); if (ref) { head = btrfs_delayed_node_to_head(ref); if (mutex_trylock(&head->mutex)) { @@ -384,7 +441,7 @@ static noinline int __btrfs_add_delayed_ref(struct btrfs_trans_handle *trans, { struct btrfs_delayed_ref_node *existing; struct btrfs_delayed_ref *full_ref; - struct btrfs_delayed_ref_head *head_ref; + struct btrfs_delayed_ref_head *head_ref = NULL; struct btrfs_delayed_ref_root *delayed_refs; int count_mod = 1; int must_insert_reserved = 0; @@ -428,6 +485,7 @@ static noinline int __btrfs_add_delayed_ref(struct btrfs_trans_handle *trans, if (btrfs_delayed_ref_is_head(ref)) { head_ref = btrfs_delayed_node_to_head(ref); head_ref->must_insert_reserved = must_insert_reserved; + INIT_LIST_HEAD(&head_ref->cluster); mutex_init(&head_ref->mutex); } else { full_ref = btrfs_delayed_node_to_ref(ref); @@ -453,6 +511,10 @@ static noinline int __btrfs_add_delayed_ref(struct btrfs_trans_handle *trans, */ kfree(ref); } else { + if (btrfs_delayed_ref_is_head(ref)) { + delayed_refs->num_heads++; + delayed_refs->num_heads_ready++; + } delayed_refs->num_entries++; trans->delayed_ref_updates++; } @@ -522,7 +584,7 @@ btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr) struct btrfs_delayed_ref_root *delayed_refs; delayed_refs = &trans->transaction->delayed_refs; - ref = tree_search(&delayed_refs->root, bytenr, (u64)-1); + ref = tree_search(&delayed_refs->root, bytenr, (u64)-1, NULL); if (ref) return btrfs_delayed_node_to_head(ref); return NULL; diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index c345fee9f96b..57153fcc347b 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -68,6 +68,8 @@ struct btrfs_delayed_ref_head { */ struct mutex mutex; + struct list_head cluster; + /* * when a new extent is allocated, it is just reserved in memory * The actual extent isn't inserted into the extent allocation tree @@ -115,12 +117,20 @@ struct btrfs_delayed_ref_root { */ unsigned long num_entries; + /* total number of head nodes in tree */ + unsigned long num_heads; + + /* total number of head nodes ready for processing */ + unsigned long num_heads_ready; + /* * set when the tree is flushing before a transaction commit, * used by the throttling code to decide if new updates need * to be run right away */ int flushing; + + u64 run_delayed_start; }; static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref) @@ -140,9 +150,6 @@ int btrfs_add_delayed_ref(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_head * btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr); int btrfs_delayed_ref_pending(struct btrfs_trans_handle *trans, u64 bytenr); -int btrfs_lock_delayed_ref(struct btrfs_trans_handle *trans, - struct btrfs_delayed_ref_node *ref, - struct btrfs_delayed_ref_head **next_ret); int btrfs_lookup_extent_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, u32 *refs); @@ -151,6 +158,10 @@ int btrfs_update_delayed_ref(struct btrfs_trans_handle *trans, u64 parent, u64 orig_ref_root, u64 ref_root, u64 orig_ref_generation, u64 ref_generation, u64 owner_objectid, int pin); +int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans, + struct btrfs_delayed_ref_head *head); +int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans, + struct list_head *cluster, u64 search_start); /* * a node might live in a head or a regular ref, this lets you * test for the proper type to use. diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 4f43e227a297..1f1d89b18818 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1458,7 +1458,6 @@ static int transaction_kthread(void *arg) struct btrfs_root *root = arg; struct btrfs_trans_handle *trans; struct btrfs_transaction *cur; - struct btrfs_fs_info *info = root->fs_info; unsigned long now; unsigned long delay; int ret; @@ -1481,24 +1480,8 @@ static int transaction_kthread(void *arg) now = get_seconds(); if (now < cur->start_time || now - cur->start_time < 30) { - unsigned long num_delayed; - num_delayed = cur->delayed_refs.num_entries; mutex_unlock(&root->fs_info->trans_mutex); delay = HZ * 5; - - /* - * we may have been woken up early to start - * processing the delayed extent ref updates - * If so, run some of them and then loop around again - * to see if we need to force a commit - */ - if (num_delayed > 64) { - mutex_unlock(&info->transaction_kthread_mutex); - trans = btrfs_start_transaction(root, 1); - btrfs_run_delayed_refs(trans, root, 256); - btrfs_end_transaction(trans, root); - continue; - } goto sleep; } mutex_unlock(&root->fs_info->trans_mutex); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 8471c79b0877..3b8b6c212701 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -926,52 +926,41 @@ again: return NULL; } -/* - * this starts processing the delayed reference count updates and - * extent insertions we have queued up so far. count can be - * 0, which means to process everything in the tree at the start - * of the run (but not newly added entries), or it can be some target - * number you'd like to process. - */ -int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, - struct btrfs_root *root, unsigned long count) +static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct list_head *cluster) { - struct rb_node *node; struct btrfs_delayed_ref_root *delayed_refs; struct btrfs_delayed_ref_node *ref; struct btrfs_delayed_ref_head *locked_ref = NULL; int ret; + int count = 0; int must_insert_reserved = 0; - int run_all = count == (unsigned long)-1; - - if (root == root->fs_info->extent_root) - root = root->fs_info->tree_root; delayed_refs = &trans->transaction->delayed_refs; -again: - spin_lock(&delayed_refs->lock); - if (count == 0) - count = delayed_refs->num_entries; while (1) { if (!locked_ref) { - /* - * no locked ref, go find something we can - * process in the rbtree. We start at - * the beginning of the tree, there may be less - * lock contention if we do something smarter here. - */ - node = rb_first(&delayed_refs->root); - if (!node) { - spin_unlock(&delayed_refs->lock); + /* pick a new head ref from the cluster list */ + if (list_empty(cluster)) break; - } - ref = rb_entry(node, struct btrfs_delayed_ref_node, - rb_node); - ret = btrfs_lock_delayed_ref(trans, ref, &locked_ref); - if (ret) { - spin_unlock(&delayed_refs->lock); - break; + locked_ref = list_entry(cluster->next, + struct btrfs_delayed_ref_head, cluster); + + /* grab the lock that says we are going to process + * all the refs for this head */ + ret = btrfs_delayed_ref_lock(trans, locked_ref); + + /* + * we may have dropped the spin lock to get the head + * mutex lock, and that might have given someone else + * time to free the head. If that's true, it has been + * removed from our list and we can move on. + */ + if (ret == -EAGAIN) { + locked_ref = NULL; + count++; + continue; } } @@ -986,7 +975,6 @@ again: * locked_ref is the head node, so we have to go one * node back for any delayed ref updates */ - ref = select_delayed_ref(locked_ref); if (!ref) { /* All delayed refs have been processed, Go ahead @@ -994,6 +982,7 @@ again: * so that any accounting fixes can happen */ ref = &locked_ref->node; + list_del_init(&locked_ref->cluster); locked_ref = NULL; } @@ -1007,30 +996,72 @@ again: BUG_ON(ret); btrfs_put_delayed_ref(ref); - /* once we lock the head ref, we have to process all the - * entries for it. So, we might end up doing more entries - * that count was asking us to do. - */ - if (count > 0) - count--; - - /* - * we set locked_ref to null above if we're all done - * with this bytenr - */ - if (!locked_ref && count == 0) - break; - + count++; cond_resched(); spin_lock(&delayed_refs->lock); } + return count; +} + +/* + * this starts processing the delayed reference count updates and + * extent insertions we have queued up so far. count can be + * 0, which means to process everything in the tree at the start + * of the run (but not newly added entries), or it can be some target + * number you'd like to process. + */ +int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, + struct btrfs_root *root, unsigned long count) +{ + struct rb_node *node; + struct btrfs_delayed_ref_root *delayed_refs; + struct btrfs_delayed_ref_node *ref; + struct list_head cluster; + int ret; + int run_all = count == (unsigned long)-1; + int run_most = 0; + + if (root == root->fs_info->extent_root) + root = root->fs_info->tree_root; + + delayed_refs = &trans->transaction->delayed_refs; + INIT_LIST_HEAD(&cluster); +again: + spin_lock(&delayed_refs->lock); + if (count == 0) { + count = delayed_refs->num_entries * 2; + run_most = 1; + } + while (1) { + if (!(run_all || run_most) && + delayed_refs->num_heads_ready < 64) + break; + + /* + * go find something we can process in the rbtree. We start at + * the beginning of the tree, and then build a cluster + * of refs to process starting at the first one we are able to + * lock + */ + ret = btrfs_find_ref_cluster(trans, &cluster, + delayed_refs->run_delayed_start); + if (ret) + break; + + ret = run_clustered_refs(trans, root, &cluster); + BUG_ON(ret < 0); + + count -= min_t(unsigned long, ret, count); + + if (count == 0) + break; + } + if (run_all) { - spin_lock(&delayed_refs->lock); node = rb_first(&delayed_refs->root); - if (!node) { - spin_unlock(&delayed_refs->lock); + if (!node) goto out; - } + count = (unsigned long)-1; while (node) { ref = rb_entry(node, struct btrfs_delayed_ref_node, @@ -1052,11 +1083,11 @@ again: node = rb_next(node); } spin_unlock(&delayed_refs->lock); - count = (unsigned long)-1; schedule_timeout(1); goto again; } out: + spin_unlock(&delayed_refs->lock); return 0; } @@ -2407,12 +2438,18 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, */ head->node.in_tree = 0; rb_erase(&head->node.rb_node, &delayed_refs->root); + delayed_refs->num_entries--; /* * we don't take a ref on the node because we're removing it from the * tree, so we just steal the ref the tree was holding. */ + delayed_refs->num_heads--; + if (list_empty(&head->cluster)) + delayed_refs->num_heads_ready--; + + list_del_init(&head->cluster); spin_unlock(&delayed_refs->lock); ret = run_one_delayed_ref(trans, root->fs_info->tree_root, @@ -3705,6 +3742,7 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root struct btrfs_path *path; int i; int orig_level; + int update_count; struct btrfs_root_item *root_item = &root->root_item; WARN_ON(!mutex_is_locked(&root->fs_info->drop_mutex)); @@ -3746,6 +3784,7 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root } } while (1) { + unsigned long update; wret = walk_down_tree(trans, root, path, &level); if (wret > 0) break; @@ -3764,6 +3803,14 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root } atomic_inc(&root->fs_info->throttle_gen); wake_up(&root->fs_info->transaction_throttle); + for (update_count = 0; update_count < 16; update_count++) { + update = trans->delayed_ref_updates; + trans->delayed_ref_updates = 0; + if (update) + btrfs_run_delayed_refs(trans, root, update); + else + break; + } } for (i = 0; i <= orig_level; i++) { if (path->nodes[i]) { diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index f94c2ad8996c..903edab3659a 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -68,7 +68,10 @@ static noinline int join_transaction(struct btrfs_root *root) cur_trans->delayed_refs.root.rb_node = NULL; cur_trans->delayed_refs.num_entries = 0; + cur_trans->delayed_refs.num_heads_ready = 0; + cur_trans->delayed_refs.num_heads = 0; cur_trans->delayed_refs.flushing = 0; + cur_trans->delayed_refs.run_delayed_start = 0; spin_lock_init(&cur_trans->delayed_refs.lock); INIT_LIST_HEAD(&cur_trans->pending_snapshots); @@ -287,13 +290,19 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, { struct btrfs_transaction *cur_trans; struct btrfs_fs_info *info = root->fs_info; + int count = 0; - if (trans->delayed_ref_updates && - (trans->transaction->delayed_refs.flushing || - trans->transaction->delayed_refs.num_entries > 16384)) { - btrfs_run_delayed_refs(trans, root, trans->delayed_ref_updates); - } else if (trans->transaction->delayed_refs.num_entries > 64) { - wake_up_process(root->fs_info->transaction_kthread); + while (count < 4) { + unsigned long cur = trans->delayed_ref_updates; + trans->delayed_ref_updates = 0; + if (cur && + trans->transaction->delayed_refs.num_heads_ready > 64) { + trans->delayed_ref_updates = 0; + btrfs_run_delayed_refs(trans, root, cur); + } else { + break; + } + count++; } mutex_lock(&info->trans_mutex); @@ -929,7 +938,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, */ trans->transaction->delayed_refs.flushing = 1; - ret = btrfs_run_delayed_refs(trans, root, (u64)-1); + ret = btrfs_run_delayed_refs(trans, root, 0); BUG_ON(ret); INIT_LIST_HEAD(&dirty_fs_roots); From b7ec40d7845bffca8bb3af2ea3f192d6257bbe21 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Thu, 12 Mar 2009 20:12:45 -0400 Subject: [PATCH 150/478] Btrfs: reduce stalls during transaction commit To avoid deadlocks and reduce latencies during some critical operations, some transaction writers are allowed to jump into the running transaction and make it run a little longer, while others sit around and wait for the commit to finish. This is a bit unfair, especially when the callers that jump in do a bunch of IO that makes all the others procs on the box wait. This commit reduces the stalls this produces by pre-reading file extent pointers during btrfs_finish_ordered_io before the transaction is joined. It also tunes the drop_snapshot code to politely wait for transactions that have started writing out their delayed refs to finish. This avoids new delayed refs being flooded into the queue while we're trying to close off the transaction. Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 3 +- fs/btrfs/inode.c | 18 ++++++++++++ fs/btrfs/transaction.c | 62 +++++++++++++++++++++++++++++++++++++----- fs/btrfs/transaction.h | 5 ++++ 4 files changed, 80 insertions(+), 8 deletions(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 3b8b6c212701..a421c32c6cfe 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3797,7 +3797,8 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root break; if (wret < 0) ret = wret; - if (trans->transaction->in_commit) { + if (trans->transaction->in_commit || + trans->transaction->delayed_refs.flushing) { ret = -EAGAIN; break; } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 7d4f948bc22a..13a17477c4f4 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1502,6 +1502,7 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) struct btrfs_trans_handle *trans; struct btrfs_ordered_extent *ordered_extent; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + struct btrfs_path *path; int compressed = 0; int ret; @@ -1509,6 +1510,23 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) if (!ret) return 0; + /* + * before we join the transaction, try to do some of our IO. + * This will limit the amount of IO that we have to do with + * the transaction running. We're unlikely to need to do any + * IO if the file extents are new, the disk_i_size checks + * covers the most common case. + */ + if (start < BTRFS_I(inode)->disk_i_size) { + path = btrfs_alloc_path(); + if (path) { + ret = btrfs_lookup_file_extent(NULL, root, path, + inode->i_ino, + start, 0); + btrfs_free_path(path); + } + } + trans = btrfs_join_transaction(root, 1); ordered_extent = btrfs_lookup_ordered_extent(inode, start); diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 903edab3659a..01c9620bb001 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -192,6 +192,7 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root, h->alloc_exclude_nr = 0; h->alloc_exclude_start = 0; h->delayed_ref_updates = 0; + root->fs_info->running_transaction->use_count++; mutex_unlock(&root->fs_info->trans_mutex); return h; @@ -281,7 +282,6 @@ void btrfs_throttle(struct btrfs_root *root) if (!root->fs_info->open_ioctl_trans) wait_current_trans(root); mutex_unlock(&root->fs_info->trans_mutex); - throttle_on_drops(root); } @@ -298,6 +298,13 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, if (cur && trans->transaction->delayed_refs.num_heads_ready > 64) { trans->delayed_ref_updates = 0; + + /* + * do a full flush if the transaction is trying + * to close + */ + if (trans->transaction->delayed_refs.flushing) + cur = 0; btrfs_run_delayed_refs(trans, root, cur); } else { break; @@ -665,6 +672,31 @@ int btrfs_defrag_root(struct btrfs_root *root, int cacheonly) return 0; } +/* + * when dropping snapshots, we generate a ton of delayed refs, and it makes + * sense not to join the transaction while it is trying to flush the current + * queue of delayed refs out. + * + * This is used by the drop snapshot code only + */ +static noinline int wait_transaction_pre_flush(struct btrfs_fs_info *info) +{ + DEFINE_WAIT(wait); + + mutex_lock(&info->trans_mutex); + while (info->running_transaction && + info->running_transaction->delayed_refs.flushing) { + prepare_to_wait(&info->transaction_wait, &wait, + TASK_UNINTERRUPTIBLE); + mutex_unlock(&info->trans_mutex); + schedule(); + mutex_lock(&info->trans_mutex); + finish_wait(&info->transaction_wait, &wait); + } + mutex_unlock(&info->trans_mutex); + return 0; +} + /* * Given a list of roots that need to be deleted, call btrfs_drop_snapshot on * all of them @@ -692,7 +724,22 @@ static noinline int drop_dirty_roots(struct btrfs_root *tree_root, atomic_inc(&root->fs_info->throttles); while (1) { + /* + * we don't want to jump in and create a bunch of + * delayed refs if the transaction is starting to close + */ + wait_transaction_pre_flush(tree_root->fs_info); trans = btrfs_start_transaction(tree_root, 1); + + /* + * we've joined a transaction, make sure it isn't + * closing right now + */ + if (trans->transaction->delayed_refs.flushing) { + btrfs_end_transaction(trans, tree_root); + continue; + } + mutex_lock(&root->fs_info->drop_mutex); ret = btrfs_drop_snapshot(trans, dirty->root); if (ret != -EAGAIN) @@ -932,20 +979,20 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, ret = btrfs_run_delayed_refs(trans, root, 0); BUG_ON(ret); + cur_trans = trans->transaction; /* * set the flushing flag so procs in this transaction have to * start sending their work down. */ - trans->transaction->delayed_refs.flushing = 1; + cur_trans->delayed_refs.flushing = 1; ret = btrfs_run_delayed_refs(trans, root, 0); BUG_ON(ret); - INIT_LIST_HEAD(&dirty_fs_roots); mutex_lock(&root->fs_info->trans_mutex); - if (trans->transaction->in_commit) { - cur_trans = trans->transaction; - trans->transaction->use_count++; + INIT_LIST_HEAD(&dirty_fs_roots); + if (cur_trans->in_commit) { + cur_trans->use_count++; mutex_unlock(&root->fs_info->trans_mutex); btrfs_end_transaction(trans, root); @@ -968,7 +1015,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, trans->transaction->in_commit = 1; trans->transaction->blocked = 1; - cur_trans = trans->transaction; if (cur_trans->list.prev != &root->fs_info->trans_list) { prev_trans = list_entry(cur_trans->list.prev, struct btrfs_transaction, list); @@ -1081,6 +1127,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, btrfs_copy_pinned(root, pinned_copy); trans->transaction->blocked = 0; + wake_up(&root->fs_info->transaction_throttle); wake_up(&root->fs_info->transaction_wait); @@ -1107,6 +1154,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, mutex_lock(&root->fs_info->trans_mutex); cur_trans->commit_done = 1; + root->fs_info->last_trans_committed = cur_trans->transid; wake_up(&cur_trans->commit_wait); diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 94876709217f..94f5bde2b58d 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -23,7 +23,12 @@ struct btrfs_transaction { u64 transid; + /* + * total writers in this transaction, it must be zero before the + * transaction can end + */ unsigned long num_writers; + unsigned long num_joined; int in_commit; int use_count; From 7f366cfecfc126731f8ac505d72026d691dac79a Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Thu, 12 Mar 2009 20:12:45 -0400 Subject: [PATCH 151/478] Btrfs: reduce stack in cow_file_range The fs/btrfs/inode.c code to run delayed allocation during writout needed some stack usage optimization. This is the first pass, it does the check for compression earlier on, which allows us to do the common (no compression) case higher up in the call chain. Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 13a17477c4f4..c427011dc453 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -204,7 +204,7 @@ fail: * does the checks required to make sure the data is small enough * to fit as an inline extent. */ -static int cow_file_range_inline(struct btrfs_trans_handle *trans, +static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode, u64 start, u64 end, size_t compressed_size, @@ -854,11 +854,6 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page, u64 cur_end; int limit = 10 * 1024 * 1042; - if (!btrfs_test_opt(root, COMPRESS)) { - return cow_file_range(inode, locked_page, start, end, - page_started, nr_written, 1); - } - clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED | EXTENT_DELALLOC, 1, 0, GFP_NOFS); while (start < end) { @@ -935,7 +930,8 @@ static noinline int csum_exist_in_range(struct btrfs_root *root, * If no cow copies or snapshots exist, we write directly to the existing * blocks on disk */ -static int run_delalloc_nocow(struct inode *inode, struct page *locked_page, +static noinline int run_delalloc_nocow(struct inode *inode, + struct page *locked_page, u64 start, u64 end, int *page_started, int force, unsigned long *nr_written) { @@ -1133,6 +1129,7 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page, unsigned long *nr_written) { int ret; + struct btrfs_root *root = BTRFS_I(inode)->root; if (btrfs_test_flag(inode, NODATACOW)) ret = run_delalloc_nocow(inode, locked_page, start, end, @@ -1140,10 +1137,12 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page, else if (btrfs_test_flag(inode, PREALLOC)) ret = run_delalloc_nocow(inode, locked_page, start, end, page_started, 0, nr_written); + else if (!btrfs_test_opt(root, COMPRESS)) + ret = cow_file_range(inode, locked_page, start, end, + page_started, nr_written, 1); else ret = cow_file_range_async(inode, locked_page, start, end, page_started, nr_written); - return ret; } From 66d7e85ea7c3628189d19b265495358f756cb463 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Thu, 12 Mar 2009 20:12:45 -0400 Subject: [PATCH 152/478] Btrfs: Check for a blocking lock before taking the spin This reduces contention on the extent buffer spin locks by testing for a blocking lock before trying to take the spinlock. Signed-off-by: Chris Mason --- fs/btrfs/locking.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index 47b0a88c12a2..6d8db2f5c38d 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c @@ -71,12 +71,13 @@ void btrfs_clear_lock_blocking(struct extent_buffer *eb) static int btrfs_spin_on_block(struct extent_buffer *eb) { int i; + for (i = 0; i < 512; i++) { - cpu_relax(); if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) return 1; if (need_resched()) break; + cpu_relax(); } return 0; } @@ -102,6 +103,7 @@ int btrfs_try_spin_lock(struct extent_buffer *eb) /* spin for a bit on the BLOCKING flag */ for (i = 0; i < 2; i++) { + cpu_relax(); if (!btrfs_spin_on_block(eb)) break; @@ -148,6 +150,9 @@ int btrfs_tree_lock(struct extent_buffer *eb) DEFINE_WAIT(wait); wait.func = btrfs_wake_function; + if (!btrfs_spin_on_block(eb)) + goto sleep; + while(1) { spin_nested(eb); @@ -165,9 +170,10 @@ int btrfs_tree_lock(struct extent_buffer *eb) * spin for a bit, and if the blocking flag goes away, * loop around */ + cpu_relax(); if (btrfs_spin_on_block(eb)) continue; - +sleep: prepare_to_wait_exclusive(&eb->lock_wq, &wait, TASK_UNINTERRUPTIBLE); From 89573b9c516b24af8a3b9958dd5afca8fa874e3d Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Thu, 12 Mar 2009 20:12:45 -0400 Subject: [PATCH 153/478] Btrfs: Only let very young transactions grow during commit Commits are fairly expensive, and so btrfs has code to sit around for a while during the commit and let new writers come in. But, while we're sitting there, new delayed refs might be added, and those can be expensive to process as well. Unless the transaction is very very young, it makes sense to go ahead and let the commit finish without hanging around. The commit grow loop isn't as important as it used to be, the fsync logging code handles most performance critical syncs now. Signed-off-by: Chris Mason --- fs/btrfs/transaction.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 01c9620bb001..9c8f158dd2db 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -972,6 +972,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, struct extent_io_tree *pinned_copy; DEFINE_WAIT(wait); int ret; + int should_grow = 0; + unsigned long now = get_seconds(); /* make a pass through all the delayed refs we have so far * any runnings procs may add more while we are here @@ -1029,6 +1031,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, } } + if (now < cur_trans->start_time || now - cur_trans->start_time < 1) + should_grow = 1; + do { int snap_pending = 0; joined = cur_trans->num_joined; @@ -1041,7 +1046,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, if (cur_trans->num_writers > 1) timeout = MAX_SCHEDULE_TIMEOUT; - else + else if (should_grow) timeout = 1; mutex_unlock(&root->fs_info->trans_mutex); @@ -1051,12 +1056,14 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, BUG_ON(ret); } - schedule_timeout(timeout); + smp_mb(); + if (cur_trans->num_writers > 1 || should_grow) + schedule_timeout(timeout); mutex_lock(&root->fs_info->trans_mutex); finish_wait(&cur_trans->writer_wait, &wait); } while (cur_trans->num_writers > 1 || - (cur_trans->num_joined != joined)); + (should_grow && cur_trans->num_joined != joined)); ret = create_pending_snapshots(trans, root->fs_info); BUG_ON(ret); From b9473439d3e84d9fc1a0a83faca69cc1b7566341 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Fri, 13 Mar 2009 11:00:37 -0400 Subject: [PATCH 154/478] Btrfs: leave btree locks spinning more often btrfs_mark_buffer dirty would set dirty bits in the extent_io tree for the buffers it was dirtying. This may require a kmalloc and it was not atomic. So, anyone who called btrfs_mark_buffer_dirty had to set any btree locks they were holding to blocking first. This commit changes dirty tracking for extent buffers to just use a flag in the extent buffer. Now that we have one and only one extent buffer per page, this can be safely done without losing dirty bits along the way. This also introduces a path->leave_spinning flag that callers of btrfs_search_slot can use to indicate they will properly deal with a path returned where all the locks are spinning instead of blocking. Many of the btree search callers now expect spinning paths, resulting in better btree concurrency overall. Signed-off-by: Chris Mason --- fs/btrfs/ctree.c | 19 +++++++----- fs/btrfs/ctree.h | 12 ++++++-- fs/btrfs/dir-item.c | 3 ++ fs/btrfs/disk-io.c | 69 +++++++++++++++++++++++++++++++++--------- fs/btrfs/disk-io.h | 1 + fs/btrfs/extent-tree.c | 69 ++++++++++++++++++++++++++++++------------ fs/btrfs/extent_io.c | 51 ++++++------------------------- fs/btrfs/extent_io.h | 3 ++ fs/btrfs/file-item.c | 7 +++-- fs/btrfs/file.c | 4 +++ fs/btrfs/inode-item.c | 3 ++ fs/btrfs/inode.c | 17 +++++++++-- fs/btrfs/locking.c | 11 ++++--- fs/btrfs/tree-log.c | 1 - 14 files changed, 173 insertions(+), 97 deletions(-) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 3764248bdc05..8686a3d2ab3a 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1684,7 +1684,8 @@ done: * we don't really know what they plan on doing with the path * from here on, so for now just mark it as blocking */ - btrfs_set_path_blocking(p); + if (!p->leave_spinning) + btrfs_set_path_blocking(p); return ret; } @@ -3032,26 +3033,27 @@ int btrfs_split_item(struct btrfs_trans_handle *trans, return -EAGAIN; } + btrfs_set_path_blocking(path); ret = split_leaf(trans, root, &orig_key, path, sizeof(struct btrfs_item), 1); path->keep_locks = 0; BUG_ON(ret); + btrfs_unlock_up_safe(path, 1); + leaf = path->nodes[0]; + BUG_ON(btrfs_leaf_free_space(root, leaf) < sizeof(struct btrfs_item)); + +split: /* * make sure any changes to the path from split_leaf leave it * in a blocking state */ btrfs_set_path_blocking(path); - leaf = path->nodes[0]; - BUG_ON(btrfs_leaf_free_space(root, leaf) < sizeof(struct btrfs_item)); - -split: item = btrfs_item_nr(leaf, path->slots[0]); orig_offset = btrfs_item_offset(leaf, item); item_size = btrfs_item_size(leaf, item); - buf = kmalloc(item_size, GFP_NOFS); read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf, path->slots[0]), item_size); @@ -3545,7 +3547,6 @@ setup_items_for_insert(struct btrfs_trans_handle *trans, } btrfs_set_header_nritems(leaf, nritems + nr); - btrfs_mark_buffer_dirty(leaf); ret = 0; if (slot == 0) { @@ -3553,6 +3554,8 @@ setup_items_for_insert(struct btrfs_trans_handle *trans, btrfs_cpu_key_to_disk(&disk_key, cpu_key); ret = fixup_low_keys(trans, root, path, &disk_key, 1); } + btrfs_unlock_up_safe(path, 1); + btrfs_mark_buffer_dirty(leaf); if (btrfs_leaf_free_space(root, leaf) < 0) { btrfs_print_leaf(root, leaf); @@ -3596,7 +3599,6 @@ int btrfs_insert_empty_items(struct btrfs_trans_handle *trans, total_data, total_size, nr); out: - btrfs_unlock_up_safe(path, 1); return ret; } @@ -3792,6 +3794,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, slot = path->slots[1]; extent_buffer_get(leaf); + btrfs_set_path_blocking(path); wret = push_leaf_left(trans, root, path, 1, 1); if (wret < 0 && wret != -ENOSPC) ret = wret; diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 08d9f8d15538..4ddce91cf3f9 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -401,15 +401,16 @@ struct btrfs_path { int locks[BTRFS_MAX_LEVEL]; int reada; /* keep some upper locks as we walk down */ - int keep_locks; - int skip_locking; int lowest_level; /* * set by btrfs_split_item, tells search_slot to keep all locks * and to force calls to keep space in the nodes */ - int search_for_split; + unsigned int search_for_split:1; + unsigned int keep_locks:1; + unsigned int skip_locking:1; + unsigned int leave_spinning:1; }; /* @@ -779,6 +780,11 @@ struct btrfs_fs_info { atomic_t throttle_gen; u64 total_pinned; + + /* protected by the delalloc lock, used to keep from writing + * metadata until there is a nice batch + */ + u64 dirty_metadata_bytes; struct list_head dirty_cowonly_roots; struct btrfs_fs_devices *fs_devices; diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c index 926a0b287a7d..1d70236ba00c 100644 --- a/fs/btrfs/dir-item.c +++ b/fs/btrfs/dir-item.c @@ -145,7 +145,10 @@ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root key.objectid = dir; btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY); key.offset = btrfs_name_hash(name, name_len); + path = btrfs_alloc_path(); + path->leave_spinning = 1; + data_size = sizeof(*dir_item) + name_len; dir_item = insert_with_overflow(trans, root, path, &key, data_size, name, name_len); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 1f1d89b18818..9244cd7313d4 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -668,14 +668,31 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, static int btree_writepage(struct page *page, struct writeback_control *wbc) { struct extent_io_tree *tree; - tree = &BTRFS_I(page->mapping->host)->io_tree; + struct btrfs_root *root = BTRFS_I(page->mapping->host)->root; + struct extent_buffer *eb; + int was_dirty; - if (current->flags & PF_MEMALLOC) { - redirty_page_for_writepage(wbc, page); - unlock_page(page); - return 0; + tree = &BTRFS_I(page->mapping->host)->io_tree; + if (!(current->flags & PF_MEMALLOC)) { + return extent_write_full_page(tree, page, + btree_get_extent, wbc); } - return extent_write_full_page(tree, page, btree_get_extent, wbc); + + redirty_page_for_writepage(wbc, page); + eb = btrfs_find_tree_block(root, page_offset(page), + PAGE_CACHE_SIZE); + WARN_ON(!eb); + + was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags); + if (!was_dirty) { + spin_lock(&root->fs_info->delalloc_lock); + root->fs_info->dirty_metadata_bytes += PAGE_CACHE_SIZE; + spin_unlock(&root->fs_info->delalloc_lock); + } + free_extent_buffer(eb); + + unlock_page(page); + return 0; } static int btree_writepages(struct address_space *mapping, @@ -684,15 +701,15 @@ static int btree_writepages(struct address_space *mapping, struct extent_io_tree *tree; tree = &BTRFS_I(mapping->host)->io_tree; if (wbc->sync_mode == WB_SYNC_NONE) { + struct btrfs_root *root = BTRFS_I(mapping->host)->root; u64 num_dirty; - u64 start = 0; unsigned long thresh = 32 * 1024 * 1024; if (wbc->for_kupdate) return 0; - num_dirty = count_range_bits(tree, &start, (u64)-1, - thresh, EXTENT_DIRTY); + /* this is a bit racy, but that's ok */ + num_dirty = root->fs_info->dirty_metadata_bytes; if (num_dirty < thresh) return 0; } @@ -859,9 +876,17 @@ int clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, root->fs_info->running_transaction->transid) { btrfs_assert_tree_locked(buf); - /* ugh, clear_extent_buffer_dirty can be expensive */ - btrfs_set_lock_blocking(buf); + if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)) { + spin_lock(&root->fs_info->delalloc_lock); + if (root->fs_info->dirty_metadata_bytes >= buf->len) + root->fs_info->dirty_metadata_bytes -= buf->len; + else + WARN_ON(1); + spin_unlock(&root->fs_info->delalloc_lock); + } + /* ugh, clear_extent_buffer_dirty needs to lock the page */ + btrfs_set_lock_blocking(buf); clear_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree, buf); } @@ -2348,8 +2373,7 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf) struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root; u64 transid = btrfs_header_generation(buf); struct inode *btree_inode = root->fs_info->btree_inode; - - btrfs_set_lock_blocking(buf); + int was_dirty; btrfs_assert_tree_locked(buf); if (transid != root->fs_info->generation) { @@ -2360,7 +2384,13 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf) (unsigned long long)root->fs_info->generation); WARN_ON(1); } - set_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree, buf); + was_dirty = set_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree, + buf); + if (!was_dirty) { + spin_lock(&root->fs_info->delalloc_lock); + root->fs_info->dirty_metadata_bytes += buf->len; + spin_unlock(&root->fs_info->delalloc_lock); + } } void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr) @@ -2400,6 +2430,7 @@ int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid) int btree_lock_page_hook(struct page *page) { struct inode *inode = page->mapping->host; + struct btrfs_root *root = BTRFS_I(inode)->root; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct extent_buffer *eb; unsigned long len; @@ -2415,6 +2446,16 @@ int btree_lock_page_hook(struct page *page) btrfs_tree_lock(eb); btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN); + + if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) { + spin_lock(&root->fs_info->delalloc_lock); + if (root->fs_info->dirty_metadata_bytes >= eb->len) + root->fs_info->dirty_metadata_bytes -= eb->len; + else + WARN_ON(1); + spin_unlock(&root->fs_info->delalloc_lock); + } + btrfs_tree_unlock(eb); free_extent_buffer(eb); out: diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 95029db227be..c958ecbc1916 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -72,6 +72,7 @@ int btrfs_insert_dev_radix(struct btrfs_root *root, void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr); int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root); void btrfs_mark_buffer_dirty(struct extent_buffer *buf); +void btrfs_mark_buffer_dirty_nonblocking(struct extent_buffer *buf); int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid); int btrfs_set_buffer_uptodate(struct extent_buffer *buf); int wait_on_tree_block_writeback(struct btrfs_root *root, diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index a421c32c6cfe..8933d15a240f 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -56,9 +56,6 @@ static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans, int ref_mod); static int update_reserved_extents(struct btrfs_root *root, u64 bytenr, u64 num, int reserve); -static int pin_down_bytes(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u64 bytenr, u64 num_bytes, int is_data); static int update_block_group(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, int alloc, @@ -618,6 +615,7 @@ static noinline int insert_extent_backref(struct btrfs_trans_handle *trans, } else { goto out; } + btrfs_unlock_up_safe(path, 1); btrfs_mark_buffer_dirty(path->nodes[0]); out: btrfs_release_path(root, path); @@ -760,6 +758,7 @@ static noinline_for_stack int add_extent_ref(struct btrfs_trans_handle *trans, return -ENOMEM; path->reada = 1; + path->leave_spinning = 1; key.objectid = bytenr; key.type = BTRFS_EXTENT_ITEM_KEY; key.offset = num_bytes; @@ -767,8 +766,10 @@ static noinline_for_stack int add_extent_ref(struct btrfs_trans_handle *trans, /* first find the extent item and update its reference count */ ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, path, 0, 1); - if (ret < 0) + if (ret < 0) { + btrfs_set_path_blocking(path); return ret; + } if (ret > 0) { WARN_ON(1); @@ -791,11 +792,15 @@ static noinline_for_stack int add_extent_ref(struct btrfs_trans_handle *trans, refs = btrfs_extent_refs(l, item); btrfs_set_extent_refs(l, item, refs + refs_to_add); + btrfs_unlock_up_safe(path, 1); + btrfs_mark_buffer_dirty(path->nodes[0]); btrfs_release_path(root->fs_info->extent_root, path); path->reada = 1; + path->leave_spinning = 1; + /* now insert the actual backref */ ret = insert_extent_backref(trans, root->fs_info->extent_root, path, bytenr, parent, @@ -2050,6 +2055,8 @@ int btrfs_update_pinned_extents(struct btrfs_root *root, clear_extent_dirty(&fs_info->pinned_extents, bytenr, bytenr + num - 1, GFP_NOFS); } + mutex_unlock(&root->fs_info->pinned_mutex); + while (num > 0) { cache = btrfs_lookup_block_group(fs_info, bytenr); BUG_ON(!cache); @@ -2141,8 +2148,8 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, u64 end; int ret; - mutex_lock(&root->fs_info->pinned_mutex); while (1) { + mutex_lock(&root->fs_info->pinned_mutex); ret = find_first_extent_bit(unpin, 0, &start, &end, EXTENT_DIRTY); if (ret) @@ -2150,14 +2157,11 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, ret = btrfs_discard_extent(root, start, end + 1 - start); + /* unlocks the pinned mutex */ btrfs_update_pinned_extents(root, start, end + 1 - start, 0); clear_extent_dirty(unpin, start, end, GFP_NOFS); - if (need_resched()) { - mutex_unlock(&root->fs_info->pinned_mutex); - cond_resched(); - mutex_lock(&root->fs_info->pinned_mutex); - } + cond_resched(); } mutex_unlock(&root->fs_info->pinned_mutex); return ret; @@ -2165,7 +2169,9 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, static int pin_down_bytes(struct btrfs_trans_handle *trans, struct btrfs_root *root, - u64 bytenr, u64 num_bytes, int is_data) + struct btrfs_path *path, + u64 bytenr, u64 num_bytes, int is_data, + struct extent_buffer **must_clean) { int err = 0; struct extent_buffer *buf; @@ -2191,15 +2197,16 @@ static int pin_down_bytes(struct btrfs_trans_handle *trans, header_owner != BTRFS_DATA_RELOC_TREE_OBJECTID && header_transid == trans->transid && !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { - clean_tree_block(NULL, root, buf); - btrfs_tree_unlock(buf); - free_extent_buffer(buf); + *must_clean = buf; return 1; } btrfs_tree_unlock(buf); } free_extent_buffer(buf); pinit: + btrfs_set_path_blocking(path); + mutex_lock(&root->fs_info->pinned_mutex); + /* unlocks the pinned mutex */ btrfs_update_pinned_extents(root, bytenr, num_bytes, 1); BUG_ON(err < 0); @@ -2236,6 +2243,7 @@ static int __free_extent(struct btrfs_trans_handle *trans, return -ENOMEM; path->reada = 1; + path->leave_spinning = 1; ret = lookup_extent_backref(trans, extent_root, path, bytenr, parent, root_objectid, ref_generation, owner_objectid, 1); @@ -2261,6 +2269,7 @@ static int __free_extent(struct btrfs_trans_handle *trans, refs_to_drop); BUG_ON(ret); btrfs_release_path(extent_root, path); + path->leave_spinning = 1; ret = btrfs_search_slot(trans, extent_root, &key, path, -1, 1); if (ret) { @@ -2318,6 +2327,7 @@ static int __free_extent(struct btrfs_trans_handle *trans, /* if refs are 0, we need to setup the path for deletion */ if (refs == 0) { btrfs_release_path(extent_root, path); + path->leave_spinning = 1; ret = btrfs_search_slot(trans, extent_root, &key, path, -1, 1); BUG_ON(ret); @@ -2327,16 +2337,18 @@ static int __free_extent(struct btrfs_trans_handle *trans, if (refs == 0) { u64 super_used; u64 root_used; + struct extent_buffer *must_clean = NULL; if (pin) { - mutex_lock(&root->fs_info->pinned_mutex); - ret = pin_down_bytes(trans, root, bytenr, num_bytes, - owner_objectid >= BTRFS_FIRST_FREE_OBJECTID); - mutex_unlock(&root->fs_info->pinned_mutex); + ret = pin_down_bytes(trans, root, path, + bytenr, num_bytes, + owner_objectid >= BTRFS_FIRST_FREE_OBJECTID, + &must_clean); if (ret > 0) mark_free = 1; BUG_ON(ret < 0); } + /* block accounting for super block */ spin_lock(&info->delalloc_lock); super_used = btrfs_super_bytes_used(&info->super_copy); @@ -2348,11 +2360,27 @@ static int __free_extent(struct btrfs_trans_handle *trans, btrfs_set_root_used(&root->root_item, root_used - num_bytes); spin_unlock(&info->delalloc_lock); + + /* + * it is going to be very rare for someone to be waiting + * on the block we're freeing. del_items might need to + * schedule, so rather than get fancy, just force it + * to blocking here + */ + if (must_clean) + btrfs_set_lock_blocking(must_clean); + ret = btrfs_del_items(trans, extent_root, path, path->slots[0], num_to_del); BUG_ON(ret); btrfs_release_path(extent_root, path); + if (must_clean) { + clean_tree_block(NULL, root, must_clean); + btrfs_tree_unlock(must_clean); + free_extent_buffer(must_clean); + } + if (owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) { ret = btrfs_del_csums(trans, root, bytenr, num_bytes); BUG_ON(ret); @@ -2480,8 +2508,9 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID && owner_objectid < BTRFS_FIRST_FREE_OBJECTID) { mutex_lock(&root->fs_info->pinned_mutex); + + /* unlocks the pinned mutex */ btrfs_update_pinned_extents(root, bytenr, num_bytes, 1); - mutex_unlock(&root->fs_info->pinned_mutex); update_reserved_extents(root, bytenr, num_bytes, 0); ret = 0; } else { @@ -2931,6 +2960,7 @@ static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans, path = btrfs_alloc_path(); BUG_ON(!path); + path->leave_spinning = 1; ret = btrfs_insert_empty_items(trans, extent_root, path, keys, sizes, 2); BUG_ON(ret); @@ -5435,6 +5465,7 @@ static int __insert_orphan_inode(struct btrfs_trans_handle *trans, if (!path) return -ENOMEM; + path->leave_spinning = 1; ret = btrfs_insert_empty_inode(trans, root, path, objectid); if (ret) goto out; diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index ebe6b29e6069..08085af089e2 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3124,20 +3124,15 @@ void free_extent_buffer(struct extent_buffer *eb) int clear_extent_buffer_dirty(struct extent_io_tree *tree, struct extent_buffer *eb) { - int set; unsigned long i; unsigned long num_pages; struct page *page; - u64 start = eb->start; - u64 end = start + eb->len - 1; - - set = clear_extent_dirty(tree, start, end, GFP_NOFS); num_pages = num_extent_pages(eb->start, eb->len); for (i = 0; i < num_pages; i++) { page = extent_buffer_page(eb, i); - if (!set && !PageDirty(page)) + if (!PageDirty(page)) continue; lock_page(page); @@ -3146,22 +3141,6 @@ int clear_extent_buffer_dirty(struct extent_io_tree *tree, else set_page_private(page, EXTENT_PAGE_PRIVATE); - /* - * if we're on the last page or the first page and the - * block isn't aligned on a page boundary, do extra checks - * to make sure we don't clean page that is partially dirty - */ - if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) || - ((i == num_pages - 1) && - ((eb->start + eb->len) & (PAGE_CACHE_SIZE - 1)))) { - start = (u64)page->index << PAGE_CACHE_SHIFT; - end = start + PAGE_CACHE_SIZE - 1; - if (test_range_bit(tree, start, end, - EXTENT_DIRTY, 0)) { - unlock_page(page); - continue; - } - } clear_page_dirty_for_io(page); spin_lock_irq(&page->mapping->tree_lock); if (!PageDirty(page)) { @@ -3187,29 +3166,13 @@ int set_extent_buffer_dirty(struct extent_io_tree *tree, { unsigned long i; unsigned long num_pages; + int was_dirty = 0; + was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags); num_pages = num_extent_pages(eb->start, eb->len); - for (i = 0; i < num_pages; i++) { - struct page *page = extent_buffer_page(eb, i); - /* writepage may need to do something special for the - * first page, we have to make sure page->private is - * properly set. releasepage may drop page->private - * on us if the page isn't already dirty. - */ - lock_page(page); - if (i == 0) { - set_page_extent_head(page, eb->len); - } else if (PagePrivate(page) && - page->private != EXTENT_PAGE_PRIVATE) { - set_page_extent_mapped(page); - } + for (i = 0; i < num_pages; i++) __set_page_dirty_nobuffers(extent_buffer_page(eb, i)); - set_extent_dirty(tree, page_offset(page), - page_offset(page) + PAGE_CACHE_SIZE - 1, - GFP_NOFS); - unlock_page(page); - } - return 0; + return was_dirty; } int clear_extent_buffer_uptodate(struct extent_io_tree *tree, @@ -3789,6 +3752,10 @@ int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page) ret = 0; goto out; } + if (test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) { + ret = 0; + goto out; + } /* at this point we can safely release the extent buffer */ num_pages = num_extent_pages(eb->start, eb->len); for (i = 0; i < num_pages; i++) diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 1f9df88afbf6..5bc20abf3f3d 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -25,6 +25,7 @@ /* these are bit numbers for test/set bit */ #define EXTENT_BUFFER_UPTODATE 0 #define EXTENT_BUFFER_BLOCKING 1 +#define EXTENT_BUFFER_DIRTY 2 /* * page->private values. Every page that is controlled by the extent @@ -254,6 +255,8 @@ int clear_extent_buffer_dirty(struct extent_io_tree *tree, struct extent_buffer *eb); int set_extent_buffer_dirty(struct extent_io_tree *tree, struct extent_buffer *eb); +int test_extent_buffer_dirty(struct extent_io_tree *tree, + struct extent_buffer *eb); int set_extent_buffer_uptodate(struct extent_io_tree *tree, struct extent_buffer *eb); int clear_extent_buffer_uptodate(struct extent_io_tree *tree, diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 964652435fd1..9b99886562d0 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -52,6 +52,7 @@ int btrfs_insert_file_extent(struct btrfs_trans_handle *trans, file_key.offset = pos; btrfs_set_key_type(&file_key, BTRFS_EXTENT_DATA_KEY); + path->leave_spinning = 1; ret = btrfs_insert_empty_item(trans, root, path, &file_key, sizeof(*item)); if (ret < 0) @@ -523,6 +524,7 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans, key.offset = end_byte - 1; key.type = BTRFS_EXTENT_CSUM_KEY; + path->leave_spinning = 1; ret = btrfs_search_slot(trans, root, &key, path, -1, 1); if (ret > 0) { if (path->slots[0] == 0) @@ -757,8 +759,10 @@ insert: } else { ins_size = csum_size; } + path->leave_spinning = 1; ret = btrfs_insert_empty_item(trans, root, path, &file_key, ins_size); + path->leave_spinning = 0; if (ret < 0) goto fail_unlock; if (ret != 0) { @@ -776,7 +780,6 @@ found: item_end = (struct btrfs_csum_item *)((unsigned char *)item_end + btrfs_item_size_nr(leaf, path->slots[0])); eb_token = NULL; - cond_resched(); next_sector: if (!eb_token || @@ -817,9 +820,9 @@ next_sector: eb_token = NULL; } btrfs_mark_buffer_dirty(path->nodes[0]); - cond_resched(); if (total_bytes < sums->len) { btrfs_release_path(root, path); + cond_resched(); goto again; } out: diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index c80075497645..f06c275644b7 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -606,6 +606,7 @@ next_slot: btrfs_set_key_type(&ins, BTRFS_EXTENT_DATA_KEY); btrfs_release_path(root, path); + path->leave_spinning = 1; ret = btrfs_insert_empty_item(trans, root, path, &ins, sizeof(*extent)); BUG_ON(ret); @@ -639,7 +640,9 @@ next_slot: ram_bytes); btrfs_set_file_extent_type(leaf, extent, found_type); + btrfs_unlock_up_safe(path, 1); btrfs_mark_buffer_dirty(path->nodes[0]); + btrfs_set_lock_blocking(path->nodes[0]); if (disk_bytenr != 0) { ret = btrfs_update_extent_ref(trans, root, @@ -652,6 +655,7 @@ next_slot: BUG_ON(ret); } + path->leave_spinning = 0; btrfs_release_path(root, path); if (disk_bytenr != 0) inode_add_bytes(inode, extent_end - end); diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c index 3d46fa1f29a4..6b627c611808 100644 --- a/fs/btrfs/inode-item.c +++ b/fs/btrfs/inode-item.c @@ -73,6 +73,8 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans, if (!path) return -ENOMEM; + path->leave_spinning = 1; + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); if (ret > 0) { ret = -ENOENT; @@ -127,6 +129,7 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans, if (!path) return -ENOMEM; + path->leave_spinning = 1; ret = btrfs_insert_empty_item(trans, root, path, &key, ins_len); if (ret == -EEXIST) { diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index c427011dc453..b83a45dc717e 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -134,6 +134,7 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans, if (!path) return -ENOMEM; + path->leave_spinning = 1; btrfs_set_trans_block_group(trans, inode); key.objectid = inode->i_ino; @@ -167,9 +168,9 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans, cur_size = min_t(unsigned long, compressed_size, PAGE_CACHE_SIZE); - kaddr = kmap(cpage); + kaddr = kmap_atomic(cpage, KM_USER0); write_extent_buffer(leaf, kaddr, ptr, cur_size); - kunmap(cpage); + kunmap_atomic(kaddr, KM_USER0); i++; ptr += cur_size; @@ -1452,6 +1453,7 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, path = btrfs_alloc_path(); BUG_ON(!path); + path->leave_spinning = 1; ret = btrfs_drop_extents(trans, root, inode, file_pos, file_pos + num_bytes, file_pos, &hint); BUG_ON(ret); @@ -1474,6 +1476,10 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, btrfs_set_file_extent_compression(leaf, fi, compression); btrfs_set_file_extent_encryption(leaf, fi, encryption); btrfs_set_file_extent_other_encoding(leaf, fi, other_encoding); + + btrfs_unlock_up_safe(path, 1); + btrfs_set_lock_blocking(leaf); + btrfs_mark_buffer_dirty(leaf); inode_add_bytes(inode, num_bytes); @@ -1486,8 +1492,8 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, root->root_key.objectid, trans->transid, inode->i_ino, &ins); BUG_ON(ret); - btrfs_free_path(path); + return 0; } @@ -2118,6 +2124,7 @@ noinline int btrfs_update_inode(struct btrfs_trans_handle *trans, path = btrfs_alloc_path(); BUG_ON(!path); + path->leave_spinning = 1; ret = btrfs_lookup_inode(trans, root, path, &BTRFS_I(inode)->location, 1); if (ret) { @@ -2164,6 +2171,7 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans, goto err; } + path->leave_spinning = 1; di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino, name, name_len, -1); if (IS_ERR(di)) { @@ -2515,6 +2523,7 @@ noinline int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, key.type = (u8)-1; search_again: + path->leave_spinning = 1; ret = btrfs_search_slot(trans, root, &key, path, -1, 1); if (ret < 0) goto error; @@ -2661,6 +2670,7 @@ delete: break; } if (found_extent) { + btrfs_set_path_blocking(path); ret = btrfs_free_extent(trans, root, extent_start, extent_num_bytes, leaf->start, root_owner, @@ -3466,6 +3476,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, sizes[0] = sizeof(struct btrfs_inode_item); sizes[1] = name_len + sizeof(*ref); + path->leave_spinning = 1; ret = btrfs_insert_empty_items(trans, root, path, key, sizes, 2); if (ret != 0) goto fail; diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index 6d8db2f5c38d..a5310c0f41e2 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c @@ -96,11 +96,12 @@ int btrfs_try_spin_lock(struct extent_buffer *eb) { int i; - spin_nested(eb); - if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) - return 1; - spin_unlock(&eb->lock); - + if (btrfs_spin_on_block(eb)) { + spin_nested(eb); + if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) + return 1; + spin_unlock(&eb->lock); + } /* spin for a bit on the BLOCKING flag */ for (i = 0; i < 2; i++) { cpu_relax(); diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 9c462fbd60fa..a93934fc93bd 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -203,7 +203,6 @@ static int process_one_buffer(struct btrfs_root *log, mutex_lock(&log->fs_info->pinned_mutex); btrfs_update_pinned_extents(log->fs_info->extent_root, eb->start, eb->len, 1); - mutex_unlock(&log->fs_info->pinned_mutex); } if (btrfs_buffer_uptodate(eb, gen)) { From 5d13a98f3bf5afc1113f7db184c627a44659bc29 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Fri, 13 Mar 2009 11:41:46 -0400 Subject: [PATCH 155/478] Btrfs: readahead checksums during btrfs_finish_ordered_io This reads in blocks in the checksum btree before starting the transaction in btrfs_finish_ordered_io. It makes it much more likely we'll be able to do operations inside the transaction without needing any btree reads, which limits transaction latencies overall. Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 35 +++++++++++++++++++++++++++++++++-- 1 file changed, 33 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index b83a45dc717e..9b4faac50c18 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1497,6 +1497,30 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, return 0; } +/* + * helper function for btrfs_finish_ordered_io, this + * just reads in some of the csum leaves to prime them into ram + * before we start the transaction. It limits the amount of btree + * reads required while inside the transaction. + */ +static noinline void reada_csum(struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_ordered_extent *ordered_extent) +{ + struct btrfs_ordered_sum *sum; + u64 bytenr; + + sum = list_entry(ordered_extent->list.next, struct btrfs_ordered_sum, + list); + bytenr = sum->sums[0].bytenr; + + /* + * we don't care about the results, the point of this search is + * just to get the btree leaves into ram + */ + btrfs_lookup_csum(NULL, root->fs_info->csum_root, path, bytenr, 0); +} + /* as ordered data IO finishes, this gets called so we can finish * an ordered extent if the range of bytes in the file it covers are * fully written. @@ -1505,7 +1529,7 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) { struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_trans_handle *trans; - struct btrfs_ordered_extent *ordered_extent; + struct btrfs_ordered_extent *ordered_extent = NULL; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct btrfs_path *path; int compressed = 0; @@ -1528,13 +1552,20 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) ret = btrfs_lookup_file_extent(NULL, root, path, inode->i_ino, start, 0); + ordered_extent = btrfs_lookup_ordered_extent(inode, + start); + if (!list_empty(&ordered_extent->list)) { + btrfs_release_path(root, path); + reada_csum(root, path, ordered_extent); + } btrfs_free_path(path); } } trans = btrfs_join_transaction(root, 1); - ordered_extent = btrfs_lookup_ordered_extent(inode, start); + if (!ordered_extent) + ordered_extent = btrfs_lookup_ordered_extent(inode, start); BUG_ON(!ordered_extent); if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) goto nocow; From a4b6e07d1a8a9b907e82b9acbf51a026fbb9301c Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Mon, 16 Mar 2009 10:59:57 -0400 Subject: [PATCH 156/478] Btrfs: limit balancing work while flushing delayed refs The delayed reference mechanism is responsible for all updates to the extent allocation trees, including those updates created while processing the delayed references. This commit tries to limit the amount of work that gets created during the final run of delayed refs before a commit. It avoids cowing new blocks unless it is required to finish the commit, and so it avoids new allocations that were not really required. The goal is to avoid infinite loops where we are always making more work on the final run of delayed refs. Over the long term we'll make a special log for the last delayed ref updates as well. Signed-off-by: Chris Mason --- fs/btrfs/ctree.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 8686a3d2ab3a..dbb724124633 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -949,6 +949,10 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, BTRFS_NODEPTRS_PER_BLOCK(root) / 4) return 0; + if (trans->transaction->delayed_refs.flushing && + btrfs_header_nritems(mid) > 2) + return 0; + if (btrfs_header_nritems(mid) < 2) err_on_enospc = 1; @@ -2159,7 +2163,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans, ret = insert_new_root(trans, root, path, level + 1); if (ret) return ret; - } else { + } else if (!trans->transaction->delayed_refs.flushing) { ret = push_nodes_for_insert(trans, root, path, level); c = path->nodes[level]; if (!ret && btrfs_header_nritems(c) < @@ -2848,7 +2852,8 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans, int num_doubles = 0; /* first try to make some room by pushing left and right */ - if (data_size && ins_key->type != BTRFS_DIR_ITEM_KEY) { + if (data_size && ins_key->type != BTRFS_DIR_ITEM_KEY && + !trans->transaction->delayed_refs.flushing) { wret = push_leaf_right(trans, root, path, data_size, 0); if (wret < 0) return wret; @@ -3786,7 +3791,8 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, } /* delete the leaf if it is mostly empty */ - if (used < BTRFS_LEAF_DATA_SIZE(root) / 4) { + if (used < BTRFS_LEAF_DATA_SIZE(root) / 4 && + !trans->transaction->delayed_refs.flushing) { /* push_leaf_left fixes the path. * make sure the path still points to our leaf * for possible call to del_ptr below From a74ac3220774d33db967088906dc3351829e2d3a Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Tue, 24 Mar 2009 10:24:13 -0400 Subject: [PATCH 157/478] Btrfs: Make sure i_nlink doesn't hit zero too soon during log replay During log replay, inodes are copied from the log to the main filesystem btrees. Sometimes they have a zero link count in the log but they actually gain links during the replay or have some in the main btree. This patch updates the link count to be at least one after copying the inode out of the log. This makes sure the inode is deleted during an iput while the rest of the replay code is still working on it. The log replay has fixup code to make sure that link counts are correct at the end of the replay, so we could use any non-zero number here and it would work fine. Signed-off-by: Chris Mason --- fs/btrfs/tree-log.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index a93934fc93bd..405439ca4c45 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -1532,6 +1532,17 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, root, inode, inode->i_size, BTRFS_EXTENT_DATA_KEY); BUG_ON(ret); + + /* if the nlink count is zero here, the iput + * will free the inode. We bump it to make + * sure it doesn't get freed until the link + * count fixup is done + */ + if (inode->i_nlink == 0) { + btrfs_inc_nlink(inode); + btrfs_update_inode(wc->trans, + root, inode); + } iput(inode); } ret = link_to_fixup_dir(wc->trans, root, From 12fcfd22fe5bf4fe74710232098bc101af497995 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Tue, 24 Mar 2009 10:24:20 -0400 Subject: [PATCH 158/478] Btrfs: tree logging unlink/rename fixes The tree logging code allows individual files or directories to be logged without including operations on other files and directories in the FS. It tries to commit the minimal set of changes to disk in order to fsync the single file or directory that was sent to fsync or O_SYNC. The tree logging code was allowing files and directories to be unlinked if they were part of a rename operation where only one directory in the rename was in the fsync log. This patch adds a few new rules to the tree logging. 1) on rename or unlink, if the inode being unlinked isn't in the fsync log, we must force a full commit before doing an fsync of the directory where the unlink was done. The commit isn't done during the unlink, but it is forced the next time we try to log the parent directory. Solution: record transid of last unlink/rename per directory when the directory wasn't already logged. For renames this is only done when renaming to a different directory. mkdir foo/some_dir normal commit rename foo/some_dir foo2/some_dir mkdir foo/some_dir fsync foo/some_dir/some_file The fsync above will unlink the original some_dir without recording it in its new location (foo2). After a crash, some_dir will be gone unless the fsync of some_file forces a full commit 2) we must log any new names for any file or dir that is in the fsync log. This way we make sure not to lose files that are unlinked during the same transaction. 2a) we must log any new names for any file or dir during rename when the directory they are being removed from was logged. 2a is actually the more important variant. Without the extra logging a crash might unlink the old name without recreating the new one 3) after a crash, we must go through any directories with a link count of zero and redo the rm -rf mkdir f1/foo normal commit rm -rf f1/foo fsync(f1) The directory f1 was fully removed from the FS, but fsync was never called on f1, only its parent dir. After a crash the rm -rf must be replayed. This must be able to recurse down the entire directory tree. The inode link count fixup code takes care of the ugly details. Signed-off-by: Chris Mason --- fs/btrfs/btrfs_inode.h | 13 +- fs/btrfs/ctree.h | 7 +- fs/btrfs/extent-tree.c | 2 +- fs/btrfs/file.c | 14 +- fs/btrfs/inode.c | 28 ++- fs/btrfs/tree-log.c | 389 ++++++++++++++++++++++++++++++++--------- fs/btrfs/tree-log.h | 17 +- 7 files changed, 372 insertions(+), 98 deletions(-) diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 72677ce2b74f..3af4cfb5654c 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -86,12 +86,6 @@ struct btrfs_inode { */ u64 logged_trans; - /* - * trans that last made a change that should be fully fsync'd. This - * gets reset to zero each time the inode is logged - */ - u64 log_dirty_trans; - /* total number of bytes pending delalloc, used by stat to calc the * real block usage of the file */ @@ -121,6 +115,13 @@ struct btrfs_inode { /* the start of block group preferred for allocations. */ u64 block_group; + /* the fsync log has some corner cases that mean we have to check + * directories to see if any unlinks have been done before + * the directory was logged. See tree-log.c for all the + * details + */ + u64 last_unlink_trans; + struct inode vfs_inode; }; diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 4ddce91cf3f9..2737facbd341 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -695,7 +695,12 @@ struct btrfs_fs_info { u64 generation; u64 last_trans_committed; - u64 last_trans_new_blockgroup; + + /* + * this is updated to the current trans every time a full commit + * is required instead of the faster short fsync log commits + */ + u64 last_trans_log_full_commit; u64 open_ioctl_trans; unsigned long mount_opt; u64 max_extent; diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 8933d15a240f..0c482e0d7c43 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -5897,7 +5897,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, extent_root = root->fs_info->extent_root; - root->fs_info->last_trans_new_blockgroup = trans->transid; + root->fs_info->last_trans_log_full_commit = trans->transid; cache = kzalloc(sizeof(*cache), GFP_NOFS); if (!cache) diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index f06c275644b7..32d10a617613 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1173,8 +1173,11 @@ out_nolock: ret = btrfs_log_dentry_safe(trans, root, file->f_dentry); if (ret == 0) { - btrfs_sync_log(trans, root); - btrfs_end_transaction(trans, root); + ret = btrfs_sync_log(trans, root); + if (ret == 0) + btrfs_end_transaction(trans, root); + else + btrfs_commit_transaction(trans, root); } else { btrfs_commit_transaction(trans, root); } @@ -1266,8 +1269,11 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync) if (ret > 0) { ret = btrfs_commit_transaction(trans, root); } else { - btrfs_sync_log(trans, root); - ret = btrfs_end_transaction(trans, root); + ret = btrfs_sync_log(trans, root); + if (ret == 0) + ret = btrfs_end_transaction(trans, root); + else + ret = btrfs_commit_transaction(trans, root); } mutex_lock(&dentry->d_inode->i_mutex); out: diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 9b4faac50c18..bffd79faffb5 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2246,8 +2246,6 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans, ret = btrfs_del_inode_ref_in_log(trans, root, name, name_len, inode, dir->i_ino); BUG_ON(ret != 0 && ret != -ENOENT); - if (ret != -ENOENT) - BTRFS_I(dir)->log_dirty_trans = trans->transid; ret = btrfs_del_dir_entries_in_log(trans, root, name, name_len, dir, index); @@ -2280,6 +2278,9 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry) trans = btrfs_start_transaction(root, 1); btrfs_set_trans_block_group(trans, dir); + + btrfs_record_unlink_dir(trans, dir, dentry->d_inode, 0); + ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode, dentry->d_name.name, dentry->d_name.len); @@ -3042,7 +3043,7 @@ static noinline void init_btrfs_i(struct inode *inode) bi->disk_i_size = 0; bi->flags = 0; bi->index_cnt = (u64)-1; - bi->log_dirty_trans = 0; + bi->last_unlink_trans = 0; extent_map_tree_init(&BTRFS_I(inode)->extent_tree, GFP_NOFS); extent_io_tree_init(&BTRFS_I(inode)->io_tree, inode->i_mapping, GFP_NOFS); @@ -3786,6 +3787,8 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, drop_inode = 1; nr = trans->blocks_used; + + btrfs_log_new_name(trans, inode, NULL, dentry->d_parent); btrfs_end_transaction_throttle(trans, root); fail: if (drop_inode) { @@ -4666,6 +4669,15 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, trans = btrfs_start_transaction(root, 1); + /* + * this is an ugly little race, but the rename is required to make + * sure that if we crash, the inode is either at the old name + * or the new one. pinning the log transaction lets us make sure + * we don't allow a log commit to come in after we unlink the + * name but before we add the new name back in. + */ + btrfs_pin_log_trans(root); + btrfs_set_trans_block_group(trans, new_dir); btrfs_inc_nlink(old_dentry->d_inode); @@ -4673,6 +4685,9 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, new_dir->i_ctime = new_dir->i_mtime = ctime; old_inode->i_ctime = ctime; + if (old_dentry->d_parent != new_dentry->d_parent) + btrfs_record_unlink_dir(trans, old_dir, old_inode, 1); + ret = btrfs_unlink_inode(trans, root, old_dir, old_dentry->d_inode, old_dentry->d_name.name, old_dentry->d_name.len); @@ -4704,7 +4719,14 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, if (ret) goto out_fail; + btrfs_log_new_name(trans, old_inode, old_dir, + new_dentry->d_parent); out_fail: + + /* this btrfs_end_log_trans just allows the current + * log-sub transaction to complete + */ + btrfs_end_log_trans(root); btrfs_end_transaction_throttle(trans, root); out_unlock: return ret; diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 405439ca4c45..1b7f04a8f168 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -34,6 +34,49 @@ #define LOG_INODE_ALL 0 #define LOG_INODE_EXISTS 1 +/* + * directory trouble cases + * + * 1) on rename or unlink, if the inode being unlinked isn't in the fsync + * log, we must force a full commit before doing an fsync of the directory + * where the unlink was done. + * ---> record transid of last unlink/rename per directory + * + * mkdir foo/some_dir + * normal commit + * rename foo/some_dir foo2/some_dir + * mkdir foo/some_dir + * fsync foo/some_dir/some_file + * + * The fsync above will unlink the original some_dir without recording + * it in its new location (foo2). After a crash, some_dir will be gone + * unless the fsync of some_file forces a full commit + * + * 2) we must log any new names for any file or dir that is in the fsync + * log. ---> check inode while renaming/linking. + * + * 2a) we must log any new names for any file or dir during rename + * when the directory they are being removed from was logged. + * ---> check inode and old parent dir during rename + * + * 2a is actually the more important variant. With the extra logging + * a crash might unlink the old name without recreating the new one + * + * 3) after a crash, we must go through any directories with a link count + * of zero and redo the rm -rf + * + * mkdir f1/foo + * normal commit + * rm -rf f1/foo + * fsync(f1) + * + * The directory f1 was fully removed from the FS, but fsync was never + * called on f1, only its parent dir. After a crash the rm -rf must + * be replayed. This must be able to recurse down the entire + * directory tree. The inode link count fixup code takes care of the + * ugly details. + */ + /* * stages for the tree walking. The first * stage (0) is to only pin down the blocks we find @@ -47,12 +90,17 @@ #define LOG_WALK_REPLAY_INODES 1 #define LOG_WALK_REPLAY_ALL 2 -static int __btrfs_log_inode(struct btrfs_trans_handle *trans, +static int btrfs_log_inode(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode, int inode_only); static int link_to_fixup_dir(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, u64 objectid); +static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_root *log, + struct btrfs_path *path, + u64 dirid, int del_all); /* * tree logging is a special write ahead log used to make sure that @@ -132,11 +180,26 @@ static int join_running_log_trans(struct btrfs_root *root) return ret; } +/* + * This either makes the current running log transaction wait + * until you call btrfs_end_log_trans() or it makes any future + * log transactions wait until you call btrfs_end_log_trans() + */ +int btrfs_pin_log_trans(struct btrfs_root *root) +{ + int ret = -ENOENT; + + mutex_lock(&root->log_mutex); + atomic_inc(&root->log_writers); + mutex_unlock(&root->log_mutex); + return ret; +} + /* * indicate we're done making changes to the log tree * and wake up anyone waiting to do a sync */ -static int end_log_trans(struct btrfs_root *root) +int btrfs_end_log_trans(struct btrfs_root *root) { if (atomic_dec_and_test(&root->log_writers)) { smp_mb(); @@ -602,6 +665,7 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans, ret = link_to_fixup_dir(trans, root, path, location.objectid); BUG_ON(ret); + ret = btrfs_unlink_inode(trans, root, dir, inode, name, name_len); BUG_ON(ret); kfree(name); @@ -803,6 +867,7 @@ conflict_again: victim_name_len)) { btrfs_inc_nlink(inode); btrfs_release_path(root, path); + ret = btrfs_unlink_inode(trans, root, dir, inode, victim_name, victim_name_len); @@ -921,13 +986,20 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans, key.offset--; btrfs_release_path(root, path); } - btrfs_free_path(path); + btrfs_release_path(root, path); if (nlink != inode->i_nlink) { inode->i_nlink = nlink; btrfs_update_inode(trans, root, inode); } BTRFS_I(inode)->index_cnt = (u64)-1; + if (inode->i_nlink == 0 && S_ISDIR(inode->i_mode)) { + ret = replay_dir_deletes(trans, root, NULL, path, + inode->i_ino, 1); + BUG_ON(ret); + } + btrfs_free_path(path); + return 0; } @@ -970,9 +1042,12 @@ static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans, iput(inode); - if (key.offset == 0) - break; - key.offset--; + /* + * fixup on a directory may create new entries, + * make sure we always look for the highset possible + * offset + */ + key.offset = (u64)-1; } btrfs_release_path(root, path); return 0; @@ -1312,11 +1387,11 @@ again: read_extent_buffer(eb, name, (unsigned long)(di + 1), name_len); log_di = NULL; - if (dir_key->type == BTRFS_DIR_ITEM_KEY) { + if (log && dir_key->type == BTRFS_DIR_ITEM_KEY) { log_di = btrfs_lookup_dir_item(trans, log, log_path, dir_key->objectid, name, name_len, 0); - } else if (dir_key->type == BTRFS_DIR_INDEX_KEY) { + } else if (log && dir_key->type == BTRFS_DIR_INDEX_KEY) { log_di = btrfs_lookup_dir_index_item(trans, log, log_path, dir_key->objectid, @@ -1377,7 +1452,7 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_root *log, struct btrfs_path *path, - u64 dirid) + u64 dirid, int del_all) { u64 range_start; u64 range_end; @@ -1407,10 +1482,14 @@ again: range_start = 0; range_end = 0; while (1) { - ret = find_dir_range(log, path, dirid, key_type, - &range_start, &range_end); - if (ret != 0) - break; + if (del_all) + range_end = (u64)-1; + else { + ret = find_dir_range(log, path, dirid, key_type, + &range_start, &range_end); + if (ret != 0) + break; + } dir_key.offset = range_start; while (1) { @@ -1436,7 +1515,8 @@ again: break; ret = check_item_in_log(trans, root, log, path, - log_path, dir, &found_key); + log_path, dir, + &found_key); BUG_ON(ret); if (found_key.offset == (u64)-1) break; @@ -1513,7 +1593,7 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, mode = btrfs_inode_mode(eb, inode_item); if (S_ISDIR(mode)) { ret = replay_dir_deletes(wc->trans, - root, log, path, key.objectid); + root, log, path, key.objectid, 0); BUG_ON(ret); } ret = overwrite_item(wc->trans, root, path, @@ -1850,7 +1930,8 @@ static int update_log_root(struct btrfs_trans_handle *trans, return ret; } -static int wait_log_commit(struct btrfs_root *root, unsigned long transid) +static int wait_log_commit(struct btrfs_trans_handle *trans, + struct btrfs_root *root, unsigned long transid) { DEFINE_WAIT(wait); int index = transid % 2; @@ -1864,9 +1945,12 @@ static int wait_log_commit(struct btrfs_root *root, unsigned long transid) prepare_to_wait(&root->log_commit_wait[index], &wait, TASK_UNINTERRUPTIBLE); mutex_unlock(&root->log_mutex); - if (root->log_transid < transid + 2 && + + if (root->fs_info->last_trans_log_full_commit != + trans->transid && root->log_transid < transid + 2 && atomic_read(&root->log_commit[index])) schedule(); + finish_wait(&root->log_commit_wait[index], &wait); mutex_lock(&root->log_mutex); } while (root->log_transid < transid + 2 && @@ -1874,14 +1958,16 @@ static int wait_log_commit(struct btrfs_root *root, unsigned long transid) return 0; } -static int wait_for_writer(struct btrfs_root *root) +static int wait_for_writer(struct btrfs_trans_handle *trans, + struct btrfs_root *root) { DEFINE_WAIT(wait); while (atomic_read(&root->log_writers)) { prepare_to_wait(&root->log_writer_wait, &wait, TASK_UNINTERRUPTIBLE); mutex_unlock(&root->log_mutex); - if (atomic_read(&root->log_writers)) + if (root->fs_info->last_trans_log_full_commit != + trans->transid && atomic_read(&root->log_writers)) schedule(); mutex_lock(&root->log_mutex); finish_wait(&root->log_writer_wait, &wait); @@ -1892,7 +1978,14 @@ static int wait_for_writer(struct btrfs_root *root) /* * btrfs_sync_log does sends a given tree log down to the disk and * updates the super blocks to record it. When this call is done, - * you know that any inodes previously logged are safely on disk + * you know that any inodes previously logged are safely on disk only + * if it returns 0. + * + * Any other return value means you need to call btrfs_commit_transaction. + * Some of the edge cases for fsyncing directories that have had unlinks + * or renames done in the past mean that sometimes the only safe + * fsync is to commit the whole FS. When btrfs_sync_log returns -EAGAIN, + * that has happened. */ int btrfs_sync_log(struct btrfs_trans_handle *trans, struct btrfs_root *root) @@ -1906,7 +1999,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, mutex_lock(&root->log_mutex); index1 = root->log_transid % 2; if (atomic_read(&root->log_commit[index1])) { - wait_log_commit(root, root->log_transid); + wait_log_commit(trans, root, root->log_transid); mutex_unlock(&root->log_mutex); return 0; } @@ -1914,18 +2007,26 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, /* wait for previous tree log sync to complete */ if (atomic_read(&root->log_commit[(index1 + 1) % 2])) - wait_log_commit(root, root->log_transid - 1); + wait_log_commit(trans, root, root->log_transid - 1); while (1) { unsigned long batch = root->log_batch; mutex_unlock(&root->log_mutex); schedule_timeout_uninterruptible(1); mutex_lock(&root->log_mutex); - wait_for_writer(root); + + wait_for_writer(trans, root); if (batch == root->log_batch) break; } + /* bail out if we need to do a full commit */ + if (root->fs_info->last_trans_log_full_commit == trans->transid) { + ret = -EAGAIN; + mutex_unlock(&root->log_mutex); + goto out; + } + ret = btrfs_write_and_wait_marked_extents(log, &log->dirty_log_pages); BUG_ON(ret); @@ -1961,16 +2062,29 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, index2 = log_root_tree->log_transid % 2; if (atomic_read(&log_root_tree->log_commit[index2])) { - wait_log_commit(log_root_tree, log_root_tree->log_transid); + wait_log_commit(trans, log_root_tree, + log_root_tree->log_transid); mutex_unlock(&log_root_tree->log_mutex); goto out; } atomic_set(&log_root_tree->log_commit[index2], 1); - if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) - wait_log_commit(log_root_tree, log_root_tree->log_transid - 1); + if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) { + wait_log_commit(trans, log_root_tree, + log_root_tree->log_transid - 1); + } - wait_for_writer(log_root_tree); + wait_for_writer(trans, log_root_tree); + + /* + * now that we've moved on to the tree of log tree roots, + * check the full commit flag again + */ + if (root->fs_info->last_trans_log_full_commit == trans->transid) { + mutex_unlock(&log_root_tree->log_mutex); + ret = -EAGAIN; + goto out_wake_log_root; + } ret = btrfs_write_and_wait_marked_extents(log_root_tree, &log_root_tree->dirty_log_pages); @@ -1995,7 +2109,9 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, * in and cause problems either. */ write_ctree_super(trans, root->fs_info->tree_root, 2); + ret = 0; +out_wake_log_root: atomic_set(&log_root_tree->log_commit[index2], 0); smp_mb(); if (waitqueue_active(&log_root_tree->log_commit_wait[index2])) @@ -2008,7 +2124,8 @@ out: return 0; } -/* * free all the extents used by the tree log. This should be called +/* + * free all the extents used by the tree log. This should be called * at commit time of the full transaction */ int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root) @@ -2142,7 +2259,7 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, btrfs_free_path(path); mutex_unlock(&BTRFS_I(dir)->log_mutex); - end_log_trans(root); + btrfs_end_log_trans(root); return 0; } @@ -2169,7 +2286,7 @@ int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, ret = btrfs_del_inode_ref(trans, log, name, name_len, inode->i_ino, dirid, &index); mutex_unlock(&BTRFS_I(inode)->log_mutex); - end_log_trans(root); + btrfs_end_log_trans(root); return ret; } @@ -2569,7 +2686,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, * * This handles both files and directories. */ -static int __btrfs_log_inode(struct btrfs_trans_handle *trans, +static int btrfs_log_inode(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode, int inode_only) { @@ -2595,28 +2712,17 @@ static int __btrfs_log_inode(struct btrfs_trans_handle *trans, min_key.offset = 0; max_key.objectid = inode->i_ino; + + /* today the code can only do partial logging of directories */ + if (!S_ISDIR(inode->i_mode)) + inode_only = LOG_INODE_ALL; + if (inode_only == LOG_INODE_EXISTS || S_ISDIR(inode->i_mode)) max_key.type = BTRFS_XATTR_ITEM_KEY; else max_key.type = (u8)-1; max_key.offset = (u64)-1; - /* - * if this inode has already been logged and we're in inode_only - * mode, we don't want to delete the things that have already - * been written to the log. - * - * But, if the inode has been through an inode_only log, - * the logged_trans field is not set. This allows us to catch - * any new names for this inode in the backrefs by logging it - * again - */ - if (inode_only == LOG_INODE_EXISTS && - BTRFS_I(inode)->logged_trans == trans->transid) { - btrfs_free_path(path); - btrfs_free_path(dst_path); - goto out; - } mutex_lock(&BTRFS_I(inode)->log_mutex); /* @@ -2703,7 +2809,6 @@ next_slot: if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) { btrfs_release_path(root, path); btrfs_release_path(log, dst_path); - BTRFS_I(inode)->log_dirty_trans = 0; ret = log_directory_changes(trans, root, inode, path, dst_path); BUG_ON(ret); } @@ -2712,19 +2817,58 @@ next_slot: btrfs_free_path(path); btrfs_free_path(dst_path); -out: return 0; } -int btrfs_log_inode(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct inode *inode, - int inode_only) +/* + * follow the dentry parent pointers up the chain and see if any + * of the directories in it require a full commit before they can + * be logged. Returns zero if nothing special needs to be done or 1 if + * a full commit is required. + */ +static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans, + struct inode *inode, + struct dentry *parent, + struct super_block *sb, + u64 last_committed) { - int ret; + int ret = 0; + struct btrfs_root *root; - start_log_trans(trans, root); - ret = __btrfs_log_inode(trans, root, inode, inode_only); - end_log_trans(root); + if (!S_ISDIR(inode->i_mode)) { + if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb) + goto out; + inode = parent->d_inode; + } + + while (1) { + BTRFS_I(inode)->logged_trans = trans->transid; + smp_mb(); + + if (BTRFS_I(inode)->last_unlink_trans > last_committed) { + root = BTRFS_I(inode)->root; + + /* + * make sure any commits to the log are forced + * to be full commits + */ + root->fs_info->last_trans_log_full_commit = + trans->transid; + ret = 1; + break; + } + + if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb) + break; + + if (parent == sb->s_root) + break; + + parent = parent->d_parent; + inode = parent->d_inode; + + } +out: return ret; } @@ -2734,31 +2878,53 @@ int btrfs_log_inode(struct btrfs_trans_handle *trans, * only logging is done of any parent directories that are older than * the last committed transaction */ -int btrfs_log_dentry(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct dentry *dentry) +int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode, + struct dentry *parent, int exists_only) { - int inode_only = LOG_INODE_ALL; + int inode_only = exists_only ? LOG_INODE_EXISTS : LOG_INODE_ALL; struct super_block *sb; - int ret; + int ret = 0; + u64 last_committed = root->fs_info->last_trans_committed; + + sb = inode->i_sb; + + if (root->fs_info->last_trans_log_full_commit > + root->fs_info->last_trans_committed) { + ret = 1; + goto end_no_trans; + } + + ret = check_parent_dirs_for_sync(trans, inode, parent, + sb, last_committed); + if (ret) + goto end_no_trans; start_log_trans(trans, root); - sb = dentry->d_inode->i_sb; + + ret = btrfs_log_inode(trans, root, inode, inode_only); + BUG_ON(ret); + inode_only = LOG_INODE_EXISTS; + while (1) { - ret = __btrfs_log_inode(trans, root, dentry->d_inode, - inode_only); - BUG_ON(ret); - inode_only = LOG_INODE_EXISTS; - - dentry = dentry->d_parent; - if (!dentry || !dentry->d_inode || sb != dentry->d_inode->i_sb) + if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb) break; - if (BTRFS_I(dentry->d_inode)->generation <= - root->fs_info->last_trans_committed) + inode = parent->d_inode; + if (BTRFS_I(inode)->generation > + root->fs_info->last_trans_committed) { + ret = btrfs_log_inode(trans, root, inode, inode_only); + BUG_ON(ret); + } + if (parent == sb->s_root) break; + + parent = parent->d_parent; } - end_log_trans(root); - return 0; + ret = 0; + btrfs_end_log_trans(root); +end_no_trans: + return ret; } /* @@ -2770,12 +2936,8 @@ int btrfs_log_dentry(struct btrfs_trans_handle *trans, int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct dentry *dentry) { - u64 gen; - gen = root->fs_info->last_trans_new_blockgroup; - if (gen > root->fs_info->last_trans_committed) - return 1; - else - return btrfs_log_dentry(trans, root, dentry); + return btrfs_log_inode_parent(trans, root, dentry->d_inode, + dentry->d_parent, 0); } /* @@ -2894,3 +3056,74 @@ again: kfree(log_root_tree); return 0; } + +/* + * there are some corner cases where we want to force a full + * commit instead of allowing a directory to be logged. + * + * They revolve around files there were unlinked from the directory, and + * this function updates the parent directory so that a full commit is + * properly done if it is fsync'd later after the unlinks are done. + */ +void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans, + struct inode *dir, struct inode *inode, + int for_rename) +{ + /* + * if this directory was already logged any new + * names for this file/dir will get recorded + */ + smp_mb(); + if (BTRFS_I(dir)->logged_trans == trans->transid) + return; + + /* + * if the inode we're about to unlink was logged, + * the log will be properly updated for any new names + */ + if (BTRFS_I(inode)->logged_trans == trans->transid) + return; + + /* + * when renaming files across directories, if the directory + * there we're unlinking from gets fsync'd later on, there's + * no way to find the destination directory later and fsync it + * properly. So, we have to be conservative and force commits + * so the new name gets discovered. + */ + if (for_rename) + goto record; + + /* we can safely do the unlink without any special recording */ + return; + +record: + BTRFS_I(dir)->last_unlink_trans = trans->transid; +} + +/* + * Call this after adding a new name for a file and it will properly + * update the log to reflect the new name. + * + * It will return zero if all goes well, and it will return 1 if a + * full transaction commit is required. + */ +int btrfs_log_new_name(struct btrfs_trans_handle *trans, + struct inode *inode, struct inode *old_dir, + struct dentry *parent) +{ + struct btrfs_root * root = BTRFS_I(inode)->root; + + /* + * if this inode hasn't been logged and directory we're renaming it + * from hasn't been logged, we don't need to log it + */ + if (BTRFS_I(inode)->logged_trans <= + root->fs_info->last_trans_committed && + (!old_dir || BTRFS_I(old_dir)->logged_trans <= + root->fs_info->last_trans_committed)) + return 0; + + return btrfs_log_inode_parent(trans, root, inode, parent, 1); +} + diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h index b9409b32ed02..d09c7609e16b 100644 --- a/fs/btrfs/tree-log.h +++ b/fs/btrfs/tree-log.h @@ -22,14 +22,9 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, struct btrfs_root *root); int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root); -int btrfs_log_dentry(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct dentry *dentry); int btrfs_recover_log_trees(struct btrfs_root *tree_root); int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct dentry *dentry); -int btrfs_log_inode(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct inode *inode, - int inode_only); int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, struct btrfs_root *root, const char *name, int name_len, @@ -38,4 +33,16 @@ int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, struct btrfs_root *root, const char *name, int name_len, struct inode *inode, u64 dirid); +int btrfs_join_running_log_trans(struct btrfs_root *root); +int btrfs_end_log_trans(struct btrfs_root *root); +int btrfs_pin_log_trans(struct btrfs_root *root); +int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode, + struct dentry *parent, int exists_only); +void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans, + struct inode *dir, struct inode *inode, + int for_rename); +int btrfs_log_new_name(struct btrfs_trans_handle *trans, + struct inode *inode, struct inode *old_dir, + struct dentry *parent); #endif From af4176b49c5ee15a9c9b10720c92456b28e7aac7 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Tue, 24 Mar 2009 10:24:31 -0400 Subject: [PATCH 159/478] Btrfs: optimize fsyncs on old files The fsync log has code to make sure all of the parents of a file are in the log along with the file. It uses a minimal log of the parent directory inodes, just enough to get the parent directory on disk. If the transaction that originally created a file is fully on disk, and the file hasn't been renamed or linked into other directories, we can safely skip the parent directory walk. We know the file is on disk somewhere and we can go ahead and just log that single file. This is more important now because unrelated unlinks in the parent directory might make us force a commit if we try to log the parent. Signed-off-by: Chris Mason --- fs/btrfs/tree-log.c | 45 ++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 44 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 1b7f04a8f168..fc9b87a7975b 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -2835,6 +2835,17 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans, int ret = 0; struct btrfs_root *root; + /* + * for regular files, if its inode is already on disk, we don't + * have to worry about the parents at all. This is because + * we can use the last_unlink_trans field to record renames + * and other fun in this file. + */ + if (S_ISREG(inode->i_mode) && + BTRFS_I(inode)->generation <= last_committed && + BTRFS_I(inode)->last_unlink_trans <= last_committed) + goto out; + if (!S_ISDIR(inode->i_mode)) { if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb) goto out; @@ -2904,8 +2915,19 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, ret = btrfs_log_inode(trans, root, inode, inode_only); BUG_ON(ret); - inode_only = LOG_INODE_EXISTS; + /* + * for regular files, if its inode is already on disk, we don't + * have to worry about the parents at all. This is because + * we can use the last_unlink_trans field to record renames + * and other fun in this file. + */ + if (S_ISREG(inode->i_mode) && + BTRFS_I(inode)->generation <= last_committed && + BTRFS_I(inode)->last_unlink_trans <= last_committed) + goto no_parent; + + inode_only = LOG_INODE_EXISTS; while (1) { if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb) break; @@ -2921,6 +2943,7 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, parent = parent->d_parent; } +no_parent: ret = 0; btrfs_end_log_trans(root); end_no_trans: @@ -3069,6 +3092,19 @@ void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans, struct inode *dir, struct inode *inode, int for_rename) { + /* + * when we're logging a file, if it hasn't been renamed + * or unlinked, and its inode is fully committed on disk, + * we don't have to worry about walking up the directory chain + * to log its parents. + * + * So, we use the last_unlink_trans field to put this transid + * into the file. When the file is logged we check it and + * don't log the parents if the file is fully on disk. + */ + if (S_ISREG(inode->i_mode)) + BTRFS_I(inode)->last_unlink_trans = trans->transid; + /* * if this directory was already logged any new * names for this file/dir will get recorded @@ -3114,6 +3150,13 @@ int btrfs_log_new_name(struct btrfs_trans_handle *trans, { struct btrfs_root * root = BTRFS_I(inode)->root; + /* + * this will force the logging code to walk the dentry chain + * up for the file + */ + if (S_ISREG(inode->i_mode)) + BTRFS_I(inode)->last_unlink_trans = trans->transid; + /* * if this inode hasn't been logged and directory we're renaming it * from hasn't been logged, we don't need to log it From 1a81af4d1d9c60d4313309f937a1fc5567205a87 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Wed, 25 Mar 2009 09:55:11 -0400 Subject: [PATCH 160/478] Btrfs: make sure btrfs_update_delayed_ref doesn't increase ref_mod btrfs_update_delayed_ref is optimized to add and remove different references in one pass through the delayed ref tree. It is a zero sum on the total number of refs on a given extent. But, the code was recording an extra ref in the head node. This never made it down to the disk but was used when deciding if it was safe to free the extent while dropping snapshots. The fix used here is to make sure the ref_mod count is unchanged on the head ref when btrfs_update_delayed_ref is called. Signed-off-by: Chris Mason --- fs/btrfs/delayed-ref.c | 10 +++++++--- fs/btrfs/delayed-ref.h | 1 + 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 759fa247ced8..cbf7dc8ae3ec 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -450,8 +450,12 @@ static noinline int __btrfs_add_delayed_ref(struct btrfs_trans_handle *trans, * the head node stores the sum of all the mods, so dropping a ref * should drop the sum in the head node by one. */ - if (parent == (u64)-1 && action == BTRFS_DROP_DELAYED_REF) - count_mod = -1; + if (parent == (u64)-1) { + if (action == BTRFS_DROP_DELAYED_REF) + count_mod = -1; + else if (action == BTRFS_UPDATE_DELAYED_HEAD) + count_mod = 0; + } /* * BTRFS_ADD_DELAYED_EXTENT means that we need to update @@ -647,7 +651,7 @@ int btrfs_update_delayed_ref(struct btrfs_trans_handle *trans, */ ret = __btrfs_add_delayed_ref(trans, &head_ref->node, bytenr, num_bytes, (u64)-1, 0, 0, 0, - BTRFS_ADD_DELAYED_REF, 0); + BTRFS_UPDATE_DELAYED_HEAD, 0); BUG_ON(ret); ret = __btrfs_add_delayed_ref(trans, &ref->node, bytenr, num_bytes, diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index 57153fcc347b..3bec2ff0b15c 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -22,6 +22,7 @@ #define BTRFS_ADD_DELAYED_REF 1 /* add one backref to the tree */ #define BTRFS_DROP_DELAYED_REF 2 /* delete one backref from the tree */ #define BTRFS_ADD_DELAYED_EXTENT 3 /* record a full extent allocation */ +#define BTRFS_UPDATE_DELAYED_HEAD 4 /* not changing ref count on head ref */ struct btrfs_delayed_ref_node { struct rb_node rb_node; From ceb93a9ff16612314d757874b6415ffbb2091576 Mon Sep 17 00:00:00 2001 From: Jesse Barnes Date: Wed, 25 Mar 2009 08:54:29 -0700 Subject: [PATCH 161/478] powerpc/PCI: include pci.h in powerpc MSI implementation This file uses PCI MSI defines and so needs pci.h. Tested-by: Stephen Rothwell Signed-off-by: Jesse Barnes --- arch/powerpc/kernel/msi.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/powerpc/kernel/msi.c b/arch/powerpc/kernel/msi.c index 0c16e2a854e5..8bbc12d20f5c 100644 --- a/arch/powerpc/kernel/msi.c +++ b/arch/powerpc/kernel/msi.c @@ -9,6 +9,7 @@ #include #include +#include #include From 7058548cd50e5bda8db086bb2e5c1d82f746d047 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Wed, 25 Mar 2009 23:35:46 -0400 Subject: [PATCH 162/478] ext4: Use WRITE_SYNC for commits which are caused by fsync() If a commit is triggered by fsync(), set a flag indicating the journal blocks associated with the transaction should be flushed out using WRITE_SYNC. Signed-off-by: "Theodore Ts'o" --- fs/jbd2/commit.c | 5 ++++- fs/jbd2/transaction.c | 2 ++ include/linux/jbd2.h | 6 ++++++ 3 files changed, 12 insertions(+), 1 deletion(-) diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 62804e57a44c..4ea72377c7a2 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -367,6 +367,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) int tag_bytes = journal_tag_bytes(journal); struct buffer_head *cbh = NULL; /* For transactional checksums */ __u32 crc32_sum = ~0; + int write_op = WRITE; /* * First job: lock down the current transaction and wait for @@ -401,6 +402,8 @@ void jbd2_journal_commit_transaction(journal_t *journal) spin_lock(&journal->j_state_lock); commit_transaction->t_state = T_LOCKED; + if (commit_transaction->t_synchronous_commit) + write_op = WRITE_SYNC; stats.u.run.rs_wait = commit_transaction->t_max_wait; stats.u.run.rs_locked = jiffies; stats.u.run.rs_running = jbd2_time_diff(commit_transaction->t_start, @@ -680,7 +683,7 @@ start_journal_io: clear_buffer_dirty(bh); set_buffer_uptodate(bh); bh->b_end_io = journal_end_buffer_io_sync; - submit_bh(WRITE, bh); + submit_bh(write_op, bh); } cond_resched(); stats.u.run.rs_blocks_logged += bufs; diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 28ce21d8598e..996ffda06bf3 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -1315,6 +1315,8 @@ int jbd2_journal_stop(handle_t *handle) } } + if (handle->h_sync) + transaction->t_synchronous_commit = 1; current->journal_info = NULL; spin_lock(&journal->j_state_lock); spin_lock(&transaction->t_handle_lock); diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 4d248b3f1323..8815a3456b3b 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -648,6 +648,12 @@ struct transaction_s */ int t_handle_count; + /* + * This transaction is being forced and some process is + * waiting for it to finish. + */ + int t_synchronous_commit:1; + /* * For use by the filesystem to store fs-specific data * structures associated with the transaction From 563bdd61fe4dbd6b58cf7eb06f8d8f14479ae1dc Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Thu, 26 Mar 2009 00:06:19 -0400 Subject: [PATCH 163/478] ext4: Check for an valid i_mode when reading the inode from disk Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index bed4a0abd0d1..c7d9250fbc3f 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4394,7 +4394,8 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) inode->i_op = &ext4_symlink_inode_operations; ext4_set_aops(inode); } - } else { + } else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) || + S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) { inode->i_op = &ext4_special_inode_operations; if (raw_inode->i_block[0]) init_special_inode(inode, inode->i_mode, @@ -4402,6 +4403,13 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) else init_special_inode(inode, inode->i_mode, new_decode_dev(le32_to_cpu(raw_inode->i_block[1]))); + } else { + brelse(bh); + ret = -EIO; + ext4_error(inode->i_sb, __func__, + "bogus i_mode (%o) for inode=%lu", + inode->i_mode, inode->i_ino); + goto bad_inode; } brelse(iloc.bh); ext4_set_inode_flags(inode); From e8c3b42057be44cff9dd225bd9930956c5f34776 Mon Sep 17 00:00:00 2001 From: Isaku Yamahata Date: Wed, 4 Mar 2009 21:05:32 +0900 Subject: [PATCH 164/478] ia64/pv_ops/xen: use __initconst instead of __initdata for const data use __initconst instead of __initdata for const data like ec8148de85a73a3be397a59b6d8f4f32cf2dd254 Signed-off-by: Isaku Yamahata Signed-off-by: Tony Luck --- arch/ia64/xen/xen_pv_ops.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/ia64/xen/xen_pv_ops.c b/arch/ia64/xen/xen_pv_ops.c index 936cff3c96e0..fa3b967e69cb 100644 --- a/arch/ia64/xen/xen_pv_ops.c +++ b/arch/ia64/xen/xen_pv_ops.c @@ -260,7 +260,7 @@ xen_intrin_local_irq_restore(unsigned long mask) xen_rsm_i(); } -static const struct pv_cpu_ops xen_cpu_ops __initdata = { +static const struct pv_cpu_ops xen_cpu_ops __initconst = { .fc = xen_fc, .thash = xen_thash, .get_cpuid = xen_get_cpuid, From ac93925acbf841d70a95ab576b76b15a34d194eb Mon Sep 17 00:00:00 2001 From: Isaku Yamahata Date: Wed, 4 Mar 2009 21:05:33 +0900 Subject: [PATCH 165/478] ia64/xen: short-circuit tests for dom0 This patch is ia64 counter part of clean up of the xen predicates. Signed-off-by: Isaku Yamahata Signed-off-by: Tony Luck --- arch/ia64/include/asm/xen/hypervisor.h | 39 ++++++++++++-------------- 1 file changed, 18 insertions(+), 21 deletions(-) diff --git a/arch/ia64/include/asm/xen/hypervisor.h b/arch/ia64/include/asm/xen/hypervisor.h index 7a804e80fc67..e425227a418e 100644 --- a/arch/ia64/include/asm/xen/hypervisor.h +++ b/arch/ia64/include/asm/xen/hypervisor.h @@ -33,9 +33,6 @@ #ifndef _ASM_IA64_XEN_HYPERVISOR_H #define _ASM_IA64_XEN_HYPERVISOR_H -#ifdef CONFIG_XEN - -#include #include #include /* to compile feature.c */ #include /* to comiple xen-netfront.c */ @@ -43,22 +40,32 @@ /* xen_domain_type is set before executing any C code by early_xen_setup */ enum xen_domain_type { - XEN_NATIVE, - XEN_PV_DOMAIN, - XEN_HVM_DOMAIN, + XEN_NATIVE, /* running on bare hardware */ + XEN_PV_DOMAIN, /* running in a PV domain */ + XEN_HVM_DOMAIN, /* running in a Xen hvm domain*/ }; +#ifdef CONFIG_XEN extern enum xen_domain_type xen_domain_type; +#else +#define xen_domain_type XEN_NATIVE +#endif #define xen_domain() (xen_domain_type != XEN_NATIVE) -#define xen_pv_domain() (xen_domain_type == XEN_PV_DOMAIN) -#define xen_initial_domain() (xen_pv_domain() && \ +#define xen_pv_domain() (xen_domain() && \ + xen_domain_type == XEN_PV_DOMAIN) +#define xen_hvm_domain() (xen_domain() && \ + xen_domain_type == XEN_HVM_DOMAIN) + +#ifdef CONFIG_XEN_DOM0 +#define xen_initial_domain() (xen_pv_domain() && \ (xen_start_info->flags & SIF_INITDOMAIN)) -#define xen_hvm_domain() (xen_domain_type == XEN_HVM_DOMAIN) +#else +#define xen_initial_domain() (0) +#endif -/* deprecated. remove this */ -#define is_running_on_xen() (xen_domain_type == XEN_PV_DOMAIN) +#ifdef CONFIG_XEN extern struct shared_info *HYPERVISOR_shared_info; extern struct start_info *xen_start_info; @@ -74,16 +81,6 @@ void force_evtchn_callback(void); /* For setup_arch() in arch/ia64/kernel/setup.c */ void xen_ia64_enable_opt_feature(void); - -#else /* CONFIG_XEN */ - -#define xen_domain() (0) -#define xen_pv_domain() (0) -#define xen_initial_domain() (0) -#define xen_hvm_domain() (0) -#define is_running_on_xen() (0) /* deprecated. remove this */ #endif -#define is_initial_xendomain() (0) /* deprecated. remove this */ - #endif /* _ASM_IA64_XEN_HYPERVISOR_H */ From dd97d5cb540939602cba9af6f88e883a6fe451f0 Mon Sep 17 00:00:00 2001 From: Isaku Yamahata Date: Wed, 4 Mar 2009 21:05:34 +0900 Subject: [PATCH 166/478] ia64/pv_ops: add hooks to paravirtualize fsyscall implementation. Add two hooks, paravirt_get_fsyscall_table() and paravirt_get_fsys_bubble_doen() to paravirtualize fsyscall implementation. This patch just add the hooks fsyscall and don't paravirtualize it. Signed-off-by: Isaku Yamahata Signed-off-by: Tony Luck --- arch/ia64/include/asm/native/inst.h | 3 +++ arch/ia64/include/asm/paravirt.h | 15 +++++++++++++++ arch/ia64/kernel/Makefile | 4 ++-- arch/ia64/kernel/fsys.S | 17 +++++++++-------- arch/ia64/kernel/patch.c | 26 +++++++++++++++++++++++--- arch/ia64/mm/init.c | 3 ++- 6 files changed, 54 insertions(+), 14 deletions(-) diff --git a/arch/ia64/include/asm/native/inst.h b/arch/ia64/include/asm/native/inst.h index 0a1026cca4fa..5e4e15133f41 100644 --- a/arch/ia64/include/asm/native/inst.h +++ b/arch/ia64/include/asm/native/inst.h @@ -30,6 +30,9 @@ #define __paravirt_work_processed_syscall_target \ ia64_work_processed_syscall +#define paravirt_fsyscall_table ia64_native_fsyscall_table +#define paravirt_fsys_bubble_down ia64_native_fsys_bubble_down + #ifdef CONFIG_PARAVIRT_GUEST_ASM_CLOBBER_CHECK # define PARAVIRT_POISON 0xdeadbeefbaadf00d # define CLOBBER(clob) \ diff --git a/arch/ia64/include/asm/paravirt.h b/arch/ia64/include/asm/paravirt.h index 2bf3636473fe..56f69f938cca 100644 --- a/arch/ia64/include/asm/paravirt.h +++ b/arch/ia64/include/asm/paravirt.h @@ -22,6 +22,21 @@ #ifndef __ASM_PARAVIRT_H #define __ASM_PARAVIRT_H +#ifndef __ASSEMBLY__ +/****************************************************************************** + * fsys related addresses + */ +struct pv_fsys_data { + unsigned long *fsyscall_table; + void *fsys_bubble_down; +}; + +extern struct pv_fsys_data pv_fsys_data; + +unsigned long *paravirt_get_fsyscall_table(void); +char *paravirt_get_fsys_bubble_down(void); +#endif + #ifdef CONFIG_PARAVIRT_GUEST #define PARAVIRT_HYPERVISOR_TYPE_DEFAULT 0 diff --git a/arch/ia64/kernel/Makefile b/arch/ia64/kernel/Makefile index c381ea954892..1ab150ec8cea 100644 --- a/arch/ia64/kernel/Makefile +++ b/arch/ia64/kernel/Makefile @@ -111,9 +111,9 @@ include/asm-ia64/nr-irqs.h: arch/$(SRCARCH)/kernel/nr-irqs.s clean-files += $(objtree)/include/asm-ia64/nr-irqs.h # -# native ivt.S and entry.S +# native ivt.S, entry.S and fsys.S # -ASM_PARAVIRT_OBJS = ivt.o entry.o +ASM_PARAVIRT_OBJS = ivt.o entry.o fsys.o define paravirtualized_native AFLAGS_$(1) += -D__IA64_ASM_PARAVIRTUALIZED_NATIVE AFLAGS_pvchk-sed-$(1) += -D__IA64_ASM_PARAVIRTUALIZED_PVCHECK diff --git a/arch/ia64/kernel/fsys.S b/arch/ia64/kernel/fsys.S index c1625c7e1779..788319f121ab 100644 --- a/arch/ia64/kernel/fsys.S +++ b/arch/ia64/kernel/fsys.S @@ -25,6 +25,7 @@ #include #include "entry.h" +#include "paravirt_inst.h" /* * See Documentation/ia64/fsys.txt for details on fsyscalls. @@ -602,7 +603,7 @@ ENTRY(fsys_fallback_syscall) mov r26=ar.pfs END(fsys_fallback_syscall) /* FALL THROUGH */ -GLOBAL_ENTRY(fsys_bubble_down) +GLOBAL_ENTRY(paravirt_fsys_bubble_down) .prologue .altrp b6 .body @@ -640,7 +641,7 @@ GLOBAL_ENTRY(fsys_bubble_down) * * PSR.BE : already is turned off in __kernel_syscall_via_epc() * PSR.AC : don't care (kernel normally turns PSR.AC on) - * PSR.I : already turned off by the time fsys_bubble_down gets + * PSR.I : already turned off by the time paravirt_fsys_bubble_down gets * invoked * PSR.DFL: always 0 (kernel never turns it on) * PSR.DFH: don't care --- kernel never touches f32-f127 on its own @@ -650,7 +651,7 @@ GLOBAL_ENTRY(fsys_bubble_down) * PSR.DB : don't care --- kernel never enables kernel-level * breakpoints * PSR.TB : must be 0 already; if it wasn't zero on entry to - * __kernel_syscall_via_epc, the branch to fsys_bubble_down + * __kernel_syscall_via_epc, the branch to paravirt_fsys_bubble_down * will trigger a taken branch; the taken-trap-handler then * converts the syscall into a break-based system-call. */ @@ -741,14 +742,14 @@ GLOBAL_ENTRY(fsys_bubble_down) nop.m 0 (p8) br.call.sptk.many b6=b6 // B (ignore return address) br.cond.spnt ia64_trace_syscall // B -END(fsys_bubble_down) +END(paravirt_fsys_bubble_down) .rodata .align 8 - .globl fsyscall_table + .globl paravirt_fsyscall_table - data8 fsys_bubble_down -fsyscall_table: + data8 paravirt_fsys_bubble_down +paravirt_fsyscall_table: data8 fsys_ni_syscall data8 0 // exit // 1025 data8 0 // read @@ -1033,4 +1034,4 @@ fsyscall_table: // fill in zeros for the remaining entries .zero: - .space fsyscall_table + 8*NR_syscalls - .zero, 0 + .space paravirt_fsyscall_table + 8*NR_syscalls - .zero, 0 diff --git a/arch/ia64/kernel/patch.c b/arch/ia64/kernel/patch.c index b83b2c516008..02dd977436fc 100644 --- a/arch/ia64/kernel/patch.c +++ b/arch/ia64/kernel/patch.c @@ -7,6 +7,7 @@ #include #include +#include #include #include #include @@ -169,16 +170,35 @@ ia64_patch_mckinley_e9 (unsigned long start, unsigned long end) ia64_srlz_i(); } +extern unsigned long ia64_native_fsyscall_table[NR_syscalls]; +extern char ia64_native_fsys_bubble_down[]; +struct pv_fsys_data pv_fsys_data __initdata = { + .fsyscall_table = (unsigned long *)ia64_native_fsyscall_table, + .fsys_bubble_down = (void *)ia64_native_fsys_bubble_down, +}; + +unsigned long * __init +paravirt_get_fsyscall_table(void) +{ + return pv_fsys_data.fsyscall_table; +} + +char * __init +paravirt_get_fsys_bubble_down(void) +{ + return pv_fsys_data.fsys_bubble_down; +} + static void __init patch_fsyscall_table (unsigned long start, unsigned long end) { - extern unsigned long fsyscall_table[NR_syscalls]; + u64 fsyscall_table = (u64)paravirt_get_fsyscall_table(); s32 *offp = (s32 *) start; u64 ip; while (offp < (s32 *) end) { ip = (u64) ia64_imva((char *) offp + *offp); - ia64_patch_imm64(ip, (u64) fsyscall_table); + ia64_patch_imm64(ip, fsyscall_table); ia64_fc((void *) ip); ++offp; } @@ -189,7 +209,7 @@ patch_fsyscall_table (unsigned long start, unsigned long end) static void __init patch_brl_fsys_bubble_down (unsigned long start, unsigned long end) { - extern char fsys_bubble_down[]; + u64 fsys_bubble_down = (u64)paravirt_get_fsys_bubble_down(); s32 *offp = (s32 *) start; u64 ip; diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c index 56e12903973c..c9bc5b305ffa 100644 --- a/arch/ia64/mm/init.c +++ b/arch/ia64/mm/init.c @@ -35,6 +35,7 @@ #include #include #include +#include DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); @@ -667,8 +668,8 @@ mem_init (void) * code can tell them apart. */ for (i = 0; i < NR_syscalls; ++i) { - extern unsigned long fsyscall_table[NR_syscalls]; extern unsigned long sys_call_table[NR_syscalls]; + unsigned long *fsyscall_table = paravirt_get_fsyscall_table(); if (!fsyscall_table[i] || nolwsys) fsyscall_table[i] = sys_call_table[i] | 1; From 533bd156231eec4b399c36579e7c30b6f52cfd29 Mon Sep 17 00:00:00 2001 From: Isaku Yamahata Date: Wed, 4 Mar 2009 21:05:35 +0900 Subject: [PATCH 167/478] ia64/pv_ops/xen: preliminary to paravirtualizing fsys.S for xen. This is a preliminary patch to paravirtualizing fsys.S. compile fsys.S twice one for native and one for xen, and switch them at run tine. Later fsys.S will be paravirtualized. Signed-off-by: Isaku Yamahata Signed-off-by: Tony Luck --- arch/ia64/include/asm/xen/inst.h | 3 +++ arch/ia64/xen/Makefile | 2 +- arch/ia64/xen/xen_pv_ops.c | 14 ++++++++++++++ 3 files changed, 18 insertions(+), 1 deletion(-) diff --git a/arch/ia64/include/asm/xen/inst.h b/arch/ia64/include/asm/xen/inst.h index 19c2ae1d878a..e8e01b28d2ae 100644 --- a/arch/ia64/include/asm/xen/inst.h +++ b/arch/ia64/include/asm/xen/inst.h @@ -33,6 +33,9 @@ #define __paravirt_work_processed_syscall_target \ xen_work_processed_syscall +#define paravirt_fsyscall_table xen_fsyscall_table +#define paravirt_fsys_bubble_down xen_fsys_bubble_down + #define MOV_FROM_IFA(reg) \ movl reg = XSI_IFA; \ ;; \ diff --git a/arch/ia64/xen/Makefile b/arch/ia64/xen/Makefile index 0ad0224693d9..b4ca2e6c0ead 100644 --- a/arch/ia64/xen/Makefile +++ b/arch/ia64/xen/Makefile @@ -10,7 +10,7 @@ obj-$(CONFIG_IA64_GENERIC) += machvec.o AFLAGS_xenivt.o += -D__IA64_ASM_PARAVIRTUALIZED_XEN # xen multi compile -ASM_PARAVIRT_MULTI_COMPILE_SRCS = ivt.S entry.S +ASM_PARAVIRT_MULTI_COMPILE_SRCS = ivt.S entry.S fsys.S ASM_PARAVIRT_OBJS = $(addprefix xen-,$(ASM_PARAVIRT_MULTI_COMPILE_SRCS:.S=.o)) obj-y += $(ASM_PARAVIRT_OBJS) define paravirtualized_xen diff --git a/arch/ia64/xen/xen_pv_ops.c b/arch/ia64/xen/xen_pv_ops.c index fa3b967e69cb..fe72308321b6 100644 --- a/arch/ia64/xen/xen_pv_ops.c +++ b/arch/ia64/xen/xen_pv_ops.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include @@ -165,6 +166,18 @@ static const struct pv_init_ops xen_init_ops __initconst = { .post_smp_prepare_boot_cpu = xen_post_smp_prepare_boot_cpu, }; +/*************************************************************************** + * pv_fsys_data + * addresses for fsys + */ + +extern unsigned long xen_fsyscall_table[NR_syscalls]; +extern char xen_fsys_bubble_down[]; +struct pv_fsys_data xen_fsys_data __initdata = { + .fsyscall_table = (unsigned long *)xen_fsyscall_table, + .fsys_bubble_down = (void *)xen_fsys_bubble_down, +}; + /*************************************************************************** * pv_cpu_ops * intrinsics hooks. @@ -355,6 +368,7 @@ xen_setup_pv_ops(void) xen_info_init(); pv_info = xen_info; pv_init_ops = xen_init_ops; + pv_fsys_data = xen_fsys_data; pv_cpu_ops = xen_cpu_ops; pv_iosapic_ops = xen_iosapic_ops; pv_irq_ops = xen_irq_ops; From 84b8857a038c060535dafdc8732a1ed60d0e98fc Mon Sep 17 00:00:00 2001 From: Isaku Yamahata Date: Wed, 4 Mar 2009 21:05:36 +0900 Subject: [PATCH 168/478] ia64/pv_ops: paravirtualize fsys.S. paravirtualize fsys.S. Signed-off-by: Isaku Yamahata Signed-off-by: Tony Luck --- arch/ia64/kernel/fsys.S | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/arch/ia64/kernel/fsys.S b/arch/ia64/kernel/fsys.S index 788319f121ab..3544d75e7cbd 100644 --- a/arch/ia64/kernel/fsys.S +++ b/arch/ia64/kernel/fsys.S @@ -419,7 +419,7 @@ EX(.fail_efault, ld8 r14=[r33]) // r14 <- *set mov r17=(1 << (SIGKILL - 1)) | (1 << (SIGSTOP - 1)) ;; - rsm psr.i // mask interrupt delivery + RSM_PSR_I(p0, r18, r19) // mask interrupt delivery mov ar.ccv=0 andcm r14=r14,r17 // filter out SIGKILL & SIGSTOP @@ -492,7 +492,7 @@ EX(.fail_efault, ld8 r14=[r33]) // r14 <- *set #ifdef CONFIG_SMP st4.rel [r31]=r0 // release the lock #endif - ssm psr.i + SSM_PSR_I(p0, p9, r31) ;; srlz.d // ensure psr.i is set again @@ -514,7 +514,7 @@ EX(.fail_efault, (p15) st8 [r34]=r3) #ifdef CONFIG_SMP st4.rel [r31]=r0 // release the lock #endif - ssm psr.i + SSM_PSR_I(p0, p9, r17) ;; srlz.d br.sptk.many fsys_fallback_syscall // with signal pending, do the heavy-weight syscall @@ -522,7 +522,7 @@ EX(.fail_efault, (p15) st8 [r34]=r3) #ifdef CONFIG_SMP .lock_contention: /* Rather than spinning here, fall back on doing a heavy-weight syscall. */ - ssm psr.i + SSM_PSR_I(p0, p9, r17) ;; srlz.d br.sptk.many fsys_fallback_syscall @@ -593,11 +593,11 @@ ENTRY(fsys_fallback_syscall) adds r17=-1024,r15 movl r14=sys_call_table ;; - rsm psr.i + RSM_PSR_I(p0, r26, r27) shladd r18=r17,3,r14 ;; ld8 r18=[r18] // load normal (heavy-weight) syscall entry-point - mov r29=psr // read psr (12 cyc load latency) + MOV_FROM_PSR(p0, r29, r26) // read psr (12 cyc load latency) mov r27=ar.rsc mov r21=ar.fpsr mov r26=ar.pfs @@ -735,7 +735,7 @@ GLOBAL_ENTRY(paravirt_fsys_bubble_down) mov rp=r14 // I0 set the real return addr and r3=_TIF_SYSCALL_TRACEAUDIT,r3 // A ;; - ssm psr.i // M2 we're on kernel stacks now, reenable irqs + SSM_PSR_I(p0, p6, r22) // M2 we're on kernel stacks now, reenable irqs cmp.eq p8,p0=r3,r0 // A (p10) br.cond.spnt.many ia64_ret_from_syscall // B return if bad call-frame or r15 is a NaT From 9d1964f25c3c43dab5a5f4761477bcfc8402250e Mon Sep 17 00:00:00 2001 From: Isaku Yamahata Date: Wed, 4 Mar 2009 21:05:37 +0900 Subject: [PATCH 169/478] ia64/pv_ops/pvchecker: support mov = ar.itc paravirtualization add suport for mov = ar.itc to pvchecker. Signed-off-by: Isaku Yamahata Signed-off-by: Tony Luck --- arch/ia64/include/asm/native/pvchk_inst.h | 5 +++++ arch/ia64/scripts/pvcheck.sed | 1 + 2 files changed, 6 insertions(+) diff --git a/arch/ia64/include/asm/native/pvchk_inst.h b/arch/ia64/include/asm/native/pvchk_inst.h index b8e6eb1090d7..13b289ea6173 100644 --- a/arch/ia64/include/asm/native/pvchk_inst.h +++ b/arch/ia64/include/asm/native/pvchk_inst.h @@ -180,6 +180,11 @@ IS_PRED_IN(pred) \ IS_RREG_OUT(reg) \ IS_RREG_CLOB(clob) +#define MOV_FROM_ITC(pred, pred_clob, reg, clob) \ + IS_PRED_IN(pred) \ + IS_PRED_CLOB(pred_clob) \ + IS_RREG_OUT(reg) \ + IS_RREG_CLOB(clob) #define MOV_TO_IFA(reg, clob) \ IS_RREG_IN(reg) \ IS_RREG_CLOB(clob) diff --git a/arch/ia64/scripts/pvcheck.sed b/arch/ia64/scripts/pvcheck.sed index ba66ac2e4c60..e59809a3fc01 100644 --- a/arch/ia64/scripts/pvcheck.sed +++ b/arch/ia64/scripts/pvcheck.sed @@ -17,6 +17,7 @@ s/mov.*=.*cr\.iip/.warning \"cr.iip should not used directly\"/g s/mov.*=.*cr\.ivr/.warning \"cr.ivr should not used directly\"/g s/mov.*=[^\.]*psr/.warning \"psr should not used directly\"/g # avoid ar.fpsr s/mov.*=.*ar\.eflags/.warning \"ar.eflags should not used directly\"/g +s/mov.*=.*ar\.itc.*/.warning \"ar.itc should not used directly\"/g s/mov.*cr\.ifa.*=.*/.warning \"cr.ifa should not used directly\"/g s/mov.*cr\.itir.*=.*/.warning \"cr.itir should not used directly\"/g s/mov.*cr\.iha.*=.*/.warning \"cr.iha should not used directly\"/g From 94752a794ddfdef65289a16627faefa7e2e62d58 Mon Sep 17 00:00:00 2001 From: Isaku Yamahata Date: Wed, 4 Mar 2009 21:05:38 +0900 Subject: [PATCH 170/478] ia64/pv_ops: paravirtualize mov = ar.itc. paravirtualize mov reg = ar.itc. Signed-off-by: Isaku Yamahata Signed-off-by: Tony Luck --- arch/ia64/include/asm/native/inst.h | 5 +++++ arch/ia64/kernel/entry.S | 4 ++-- arch/ia64/kernel/fsys.S | 4 ++-- arch/ia64/kernel/ivt.S | 2 +- 4 files changed, 10 insertions(+), 5 deletions(-) diff --git a/arch/ia64/include/asm/native/inst.h b/arch/ia64/include/asm/native/inst.h index 5e4e15133f41..ad59fc6a6134 100644 --- a/arch/ia64/include/asm/native/inst.h +++ b/arch/ia64/include/asm/native/inst.h @@ -77,6 +77,11 @@ (pred) mov reg = psr \ CLOBBER(clob) +#define MOV_FROM_ITC(pred, pred_clob, reg, clob) \ +(pred) mov reg = ar.itc \ + CLOBBER(clob) \ + CLOBBER_PRED(pred_clob) + #define MOV_TO_IFA(reg, clob) \ mov cr.ifa = reg \ CLOBBER(clob) diff --git a/arch/ia64/kernel/entry.S b/arch/ia64/kernel/entry.S index e5341e2c1175..ccfdeee9d89f 100644 --- a/arch/ia64/kernel/entry.S +++ b/arch/ia64/kernel/entry.S @@ -735,7 +735,7 @@ GLOBAL_ENTRY(__paravirt_leave_syscall) __paravirt_work_processed_syscall: #ifdef CONFIG_VIRT_CPU_ACCOUNTING adds r2=PT(LOADRS)+16,r12 -(pUStk) mov.m r22=ar.itc // fetch time at leave + MOV_FROM_ITC(pUStk, p9, r22, r19) // fetch time at leave adds r18=TI_FLAGS+IA64_TASK_SIZE,r13 ;; (p6) ld4 r31=[r18] // load current_thread_info()->flags @@ -984,7 +984,7 @@ GLOBAL_ENTRY(__paravirt_leave_kernel) #ifdef CONFIG_VIRT_CPU_ACCOUNTING .pred.rel.mutex pUStk,pKStk MOV_FROM_PSR(pKStk, r22, r29) // M2 read PSR now that interrupts are disabled -(pUStk) mov.m r22=ar.itc // M fetch time at leave + MOV_FROM_ITC(pUStk, p9, r22, r29) // M fetch time at leave nop.i 0 ;; #else diff --git a/arch/ia64/kernel/fsys.S b/arch/ia64/kernel/fsys.S index 3544d75e7cbd..3567d54f8cee 100644 --- a/arch/ia64/kernel/fsys.S +++ b/arch/ia64/kernel/fsys.S @@ -280,7 +280,7 @@ ENTRY(fsys_gettimeofday) (p9) cmp.eq p13,p0 = 0,r30 // if mmio_ptr, clear p13 jitter control ;; .pred.rel.mutex p8,p9 -(p8) mov r2 = ar.itc // CPU_TIMER. 36 clocks latency!!! + MOV_FROM_ITC(p8, p6, r2, r10) // CPU_TIMER. 36 clocks latency!!! (p9) ld8 r2 = [r30] // MMIO_TIMER. Could also have latency issues.. (p13) ld8 r25 = [r19] // get itc_lastcycle value ld8 r9 = [r22],IA64_TIMESPEC_TV_NSEC_OFFSET // tv_sec @@ -684,7 +684,7 @@ GLOBAL_ENTRY(paravirt_fsys_bubble_down) ;; mov ar.rsc=0 // M2 set enforced lazy mode, pl 0, LE, loadrs=0 #ifdef CONFIG_VIRT_CPU_ACCOUNTING - mov.m r30=ar.itc // M get cycle for accounting + MOV_FROM_ITC(p0, p6, r30, r23) // M get cycle for accounting #else nop.m 0 #endif diff --git a/arch/ia64/kernel/ivt.S b/arch/ia64/kernel/ivt.S index f675d8e33853..ec9a5fdfa1b9 100644 --- a/arch/ia64/kernel/ivt.S +++ b/arch/ia64/kernel/ivt.S @@ -804,7 +804,7 @@ ENTRY(break_fault) /////////////////////////////////////////////////////////////////////// st1 [r16]=r0 // M2|3 clear current->thread.on_ustack flag #ifdef CONFIG_VIRT_CPU_ACCOUNTING - mov.m r30=ar.itc // M get cycle for accounting + MOV_FROM_ITC(p0, p14, r30, r18) // M get cycle for accounting #else mov b6=r30 // I0 setup syscall handler branch reg early #endif From 496203b15b7249599712525c2b6aafe231b4628d Mon Sep 17 00:00:00 2001 From: Isaku Yamahata Date: Wed, 4 Mar 2009 21:05:39 +0900 Subject: [PATCH 171/478] ia64/pv_ops/xen: paravirtualize read/write ar.itc and ar.itm paravirtualize ar.itc and ar.itm in order to support save/restore. Signed-off-by: Isaku Yamahata Signed-off-by: Tony Luck --- arch/ia64/include/asm/xen/inst.h | 21 +++++++ arch/ia64/include/asm/xen/interface.h | 9 +++ arch/ia64/include/asm/xen/minstate.h | 11 +++- arch/ia64/include/asm/xen/privop.h | 2 + arch/ia64/kernel/asm-offsets.c | 2 + arch/ia64/xen/xen_pv_ops.c | 80 ++++++++++++++++++++++++++- 6 files changed, 123 insertions(+), 2 deletions(-) diff --git a/arch/ia64/include/asm/xen/inst.h b/arch/ia64/include/asm/xen/inst.h index e8e01b28d2ae..90537dc15efe 100644 --- a/arch/ia64/include/asm/xen/inst.h +++ b/arch/ia64/include/asm/xen/inst.h @@ -113,6 +113,27 @@ .endm #define MOV_FROM_PSR(pred, reg, clob) __MOV_FROM_PSR pred, reg, clob +/* assuming ar.itc is read with interrupt disabled. */ +#define MOV_FROM_ITC(pred, pred_clob, reg, clob) \ +(pred) movl clob = XSI_ITC_OFFSET; \ + ;; \ +(pred) ld8 clob = [clob]; \ +(pred) mov reg = ar.itc; \ + ;; \ +(pred) add reg = reg, clob; \ + ;; \ +(pred) movl clob = XSI_ITC_LAST; \ + ;; \ +(pred) ld8 clob = [clob]; \ + ;; \ +(pred) cmp.geu.unc pred_clob, p0 = clob, reg; \ + ;; \ +(pred_clob) add reg = 1, clob; \ + ;; \ +(pred) movl clob = XSI_ITC_LAST; \ + ;; \ +(pred) st8 [clob] = reg + #define MOV_TO_IFA(reg, clob) \ movl clob = XSI_IFA; \ diff --git a/arch/ia64/include/asm/xen/interface.h b/arch/ia64/include/asm/xen/interface.h index f00fab40854d..e951e740bdf2 100644 --- a/arch/ia64/include/asm/xen/interface.h +++ b/arch/ia64/include/asm/xen/interface.h @@ -209,6 +209,15 @@ struct mapped_regs { unsigned long krs[8]; /* kernel registers */ unsigned long tmp[16]; /* temp registers (e.g. for hyperprivops) */ + + /* itc paravirtualization + * vAR.ITC = mAR.ITC + itc_offset + * itc_last is one which was lastly passed to + * the guest OS in order to prevent it from + * going backwords. + */ + unsigned long itc_offset; + unsigned long itc_last; }; }; }; diff --git a/arch/ia64/include/asm/xen/minstate.h b/arch/ia64/include/asm/xen/minstate.h index 4d92d9bbda7b..c57fa910f2c9 100644 --- a/arch/ia64/include/asm/xen/minstate.h +++ b/arch/ia64/include/asm/xen/minstate.h @@ -1,3 +1,12 @@ + +#ifdef CONFIG_VIRT_CPU_ACCOUNTING +/* read ar.itc in advance, and use it before leaving bank 0 */ +#define XEN_ACCOUNT_GET_STAMP \ + MOV_FROM_ITC(pUStk, p6, r20, r2); +#else +#define XEN_ACCOUNT_GET_STAMP +#endif + /* * DO_SAVE_MIN switches to the kernel stacks (if necessary) and saves * the minimum state necessary that allows us to turn psr.ic back @@ -123,7 +132,7 @@ ;; \ .mem.offset 0,0; st8.spill [r16]=r2,16; \ .mem.offset 8,0; st8.spill [r17]=r3,16; \ - ACCOUNT_GET_STAMP \ + XEN_ACCOUNT_GET_STAMP \ adds r2=IA64_PT_REGS_R16_OFFSET,r1; \ ;; \ EXTRA; \ diff --git a/arch/ia64/include/asm/xen/privop.h b/arch/ia64/include/asm/xen/privop.h index 71ec7546e100..2261dda578ff 100644 --- a/arch/ia64/include/asm/xen/privop.h +++ b/arch/ia64/include/asm/xen/privop.h @@ -55,6 +55,8 @@ #define XSI_BANK1_R16 (XSI_BASE + XSI_BANK1_R16_OFS) #define XSI_BANKNUM (XSI_BASE + XSI_BANKNUM_OFS) #define XSI_IHA (XSI_BASE + XSI_IHA_OFS) +#define XSI_ITC_OFFSET (XSI_BASE + XSI_ITC_OFFSET_OFS) +#define XSI_ITC_LAST (XSI_BASE + XSI_ITC_LAST_OFS) #endif #ifndef __ASSEMBLY__ diff --git a/arch/ia64/kernel/asm-offsets.c b/arch/ia64/kernel/asm-offsets.c index 742dbb1d5a4f..af5650169043 100644 --- a/arch/ia64/kernel/asm-offsets.c +++ b/arch/ia64/kernel/asm-offsets.c @@ -316,5 +316,7 @@ void foo(void) DEFINE_MAPPED_REG_OFS(XSI_BANK1_R16_OFS, bank1_regs[0]); DEFINE_MAPPED_REG_OFS(XSI_B0NATS_OFS, vbnat); DEFINE_MAPPED_REG_OFS(XSI_B1NATS_OFS, vnat); + DEFINE_MAPPED_REG_OFS(XSI_ITC_OFFSET_OFS, itc_offset); + DEFINE_MAPPED_REG_OFS(XSI_ITC_LAST_OFS, itc_last); #endif /* CONFIG_XEN */ } diff --git a/arch/ia64/xen/xen_pv_ops.c b/arch/ia64/xen/xen_pv_ops.c index fe72308321b6..d91336114344 100644 --- a/arch/ia64/xen/xen_pv_ops.c +++ b/arch/ia64/xen/xen_pv_ops.c @@ -183,6 +183,75 @@ struct pv_fsys_data xen_fsys_data __initdata = { * intrinsics hooks. */ +static void +xen_set_itm_with_offset(unsigned long val) +{ + /* ia64_cpu_local_tick() calls this with interrupt enabled. */ + /* WARN_ON(!irqs_disabled()); */ + xen_set_itm(val - XEN_MAPPEDREGS->itc_offset); +} + +static unsigned long +xen_get_itm_with_offset(void) +{ + /* unused at this moment */ + printk(KERN_DEBUG "%s is called.\n", __func__); + + WARN_ON(!irqs_disabled()); + return ia64_native_getreg(_IA64_REG_CR_ITM) + + XEN_MAPPEDREGS->itc_offset; +} + +/* ia64_set_itc() is only called by + * cpu_init() with ia64_set_itc(0) and ia64_sync_itc(). + * So XEN_MAPPEDRESG->itc_offset cal be considered as almost constant. + */ +static void +xen_set_itc(unsigned long val) +{ + unsigned long mitc; + + WARN_ON(!irqs_disabled()); + mitc = ia64_native_getreg(_IA64_REG_AR_ITC); + XEN_MAPPEDREGS->itc_offset = val - mitc; + XEN_MAPPEDREGS->itc_last = val; +} + +static unsigned long +xen_get_itc(void) +{ + unsigned long res; + unsigned long itc_offset; + unsigned long itc_last; + unsigned long ret_itc_last; + + itc_offset = XEN_MAPPEDREGS->itc_offset; + do { + itc_last = XEN_MAPPEDREGS->itc_last; + res = ia64_native_getreg(_IA64_REG_AR_ITC); + res += itc_offset; + if (itc_last >= res) + res = itc_last + 1; + ret_itc_last = cmpxchg(&XEN_MAPPEDREGS->itc_last, + itc_last, res); + } while (unlikely(ret_itc_last != itc_last)); + return res; + +#if 0 + /* ia64_itc_udelay() calls ia64_get_itc() with interrupt enabled. + Should it be paravirtualized instead? */ + WARN_ON(!irqs_disabled()); + itc_offset = XEN_MAPPEDREGS->itc_offset; + itc_last = XEN_MAPPEDREGS->itc_last; + res = ia64_native_getreg(_IA64_REG_AR_ITC); + res += itc_offset; + if (itc_last >= res) + res = itc_last + 1; + XEN_MAPPEDREGS->itc_last = res; + return res; +#endif +} + static void xen_setreg(int regnum, unsigned long val) { switch (regnum) { @@ -194,11 +263,14 @@ static void xen_setreg(int regnum, unsigned long val) xen_set_eflag(val); break; #endif + case _IA64_REG_AR_ITC: + xen_set_itc(val); + break; case _IA64_REG_CR_TPR: xen_set_tpr(val); break; case _IA64_REG_CR_ITM: - xen_set_itm(val); + xen_set_itm_with_offset(val); break; case _IA64_REG_CR_EOI: xen_eoi(val); @@ -222,6 +294,12 @@ static unsigned long xen_getreg(int regnum) res = xen_get_eflag(); break; #endif + case _IA64_REG_AR_ITC: + res = xen_get_itc(); + break; + case _IA64_REG_CR_ITM: + res = xen_get_itm_with_offset(); + break; case _IA64_REG_CR_IVR: res = xen_get_ivr(); break; From f927da178671a824cf6c530f0623544206387e57 Mon Sep 17 00:00:00 2001 From: Isaku Yamahata Date: Wed, 4 Mar 2009 21:05:40 +0900 Subject: [PATCH 172/478] ia64/pv_ops/pv_time_ops: add sched_clock hook. add sched_clock() hook to paravirtualize sched_clock(). ia64 sched_clock() is based on ar.itc which isn't stable on virtualized environment because vcpu may move around on pcpus. So it needs paravirtualization. Signed-off-by: Isaku Yamahata Signed-off-by: Tony Luck --- arch/ia64/include/asm/paravirt.h | 7 +++++++ arch/ia64/include/asm/timex.h | 1 + arch/ia64/kernel/head.S | 10 ++++++++-- arch/ia64/kernel/paravirt.c | 1 + arch/ia64/kernel/time.c | 9 +++++++++ 5 files changed, 26 insertions(+), 2 deletions(-) diff --git a/arch/ia64/include/asm/paravirt.h b/arch/ia64/include/asm/paravirt.h index 56f69f938cca..a73e77add7e2 100644 --- a/arch/ia64/include/asm/paravirt.h +++ b/arch/ia64/include/asm/paravirt.h @@ -225,6 +225,8 @@ struct pv_time_ops { int (*do_steal_accounting)(unsigned long *new_itm); void (*clocksource_resume)(void); + + unsigned long long (*sched_clock)(void); }; extern struct pv_time_ops pv_time_ops; @@ -242,6 +244,11 @@ paravirt_do_steal_accounting(unsigned long *new_itm) return pv_time_ops.do_steal_accounting(new_itm); } +static inline unsigned long long paravirt_sched_clock(void) +{ + return pv_time_ops.sched_clock(); +} + #endif /* !__ASSEMBLY__ */ #else diff --git a/arch/ia64/include/asm/timex.h b/arch/ia64/include/asm/timex.h index 4e03cfe74a0c..86c7db861180 100644 --- a/arch/ia64/include/asm/timex.h +++ b/arch/ia64/include/asm/timex.h @@ -40,5 +40,6 @@ get_cycles (void) } extern void ia64_cpu_local_tick (void); +extern unsigned long long ia64_native_sched_clock (void); #endif /* _ASM_IA64_TIMEX_H */ diff --git a/arch/ia64/kernel/head.S b/arch/ia64/kernel/head.S index 59301c472800..23f846de62d5 100644 --- a/arch/ia64/kernel/head.S +++ b/arch/ia64/kernel/head.S @@ -1050,7 +1050,7 @@ END(ia64_delay_loop) * except that the multiplication and the shift are done with 128-bit * intermediate precision so that we can produce a full 64-bit result. */ -GLOBAL_ENTRY(sched_clock) +GLOBAL_ENTRY(ia64_native_sched_clock) addl r8=THIS_CPU(cpu_info) + IA64_CPUINFO_NSEC_PER_CYC_OFFSET,r0 mov.m r9=ar.itc // fetch cycle-counter (35 cyc) ;; @@ -1066,7 +1066,13 @@ GLOBAL_ENTRY(sched_clock) ;; shrp r8=r9,r8,IA64_NSEC_PER_CYC_SHIFT br.ret.sptk.many rp -END(sched_clock) +END(ia64_native_sched_clock) +#ifndef CONFIG_PARAVIRT + //unsigned long long + //sched_clock(void) __attribute__((alias("ia64_native_sched_clock"))); + .global sched_clock +sched_clock = ia64_native_sched_clock +#endif #ifdef CONFIG_VIRT_CPU_ACCOUNTING GLOBAL_ENTRY(cycle_to_cputime) diff --git a/arch/ia64/kernel/paravirt.c b/arch/ia64/kernel/paravirt.c index 9f14c16f6369..6bc33a6db755 100644 --- a/arch/ia64/kernel/paravirt.c +++ b/arch/ia64/kernel/paravirt.c @@ -366,4 +366,5 @@ ia64_native_do_steal_accounting(unsigned long *new_itm) struct pv_time_ops pv_time_ops = { .do_steal_accounting = ia64_native_do_steal_accounting, + .sched_clock = ia64_native_sched_clock, }; diff --git a/arch/ia64/kernel/time.c b/arch/ia64/kernel/time.c index f0ebb342409d..c323c7b9c775 100644 --- a/arch/ia64/kernel/time.c +++ b/arch/ia64/kernel/time.c @@ -49,6 +49,15 @@ EXPORT_SYMBOL(last_cli_ip); #endif +#ifdef CONFIG_PARAVIRT +/* We need to define a real function for sched_clock, to override the + weak default version */ +unsigned long long sched_clock(void) +{ + return paravirt_sched_clock(); +} +#endif + #ifdef CONFIG_PARAVIRT static void paravirt_clocksource_resume(void) From 1aec1c558a797512e922581b21a178a05438bfc9 Mon Sep 17 00:00:00 2001 From: Isaku Yamahata Date: Wed, 4 Mar 2009 21:05:41 +0900 Subject: [PATCH 173/478] ia64/pv_ops/xen/pv_time_ops: implement sched_clock. paravirtualize sched_clock. Signed-off-by: Isaku Yamahata Signed-off-by: Tony Luck --- arch/ia64/xen/time.c | 48 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/arch/ia64/xen/time.c b/arch/ia64/xen/time.c index 68d6204c3f16..fb8332690179 100644 --- a/arch/ia64/xen/time.c +++ b/arch/ia64/xen/time.c @@ -175,10 +175,58 @@ static void xen_itc_jitter_data_reset(void) } while (unlikely(ret != lcycle)); } +/* based on xen_sched_clock() in arch/x86/xen/time.c. */ +/* + * This relies on HAVE_UNSTABLE_SCHED_CLOCK. If it can't be defined, + * something similar logic should be implemented here. + */ +/* + * Xen sched_clock implementation. Returns the number of unstolen + * nanoseconds, which is nanoseconds the VCPU spent in RUNNING+BLOCKED + * states. + */ +static unsigned long long xen_sched_clock(void) +{ + struct vcpu_runstate_info runstate; + + unsigned long long now; + unsigned long long offset; + unsigned long long ret; + + /* + * Ideally sched_clock should be called on a per-cpu basis + * anyway, so preempt should already be disabled, but that's + * not current practice at the moment. + */ + preempt_disable(); + + /* + * both ia64_native_sched_clock() and xen's runstate are + * based on mAR.ITC. So difference of them makes sense. + */ + now = ia64_native_sched_clock(); + + get_runstate_snapshot(&runstate); + + WARN_ON(runstate.state != RUNSTATE_running); + + offset = 0; + if (now > runstate.state_entry_time) + offset = now - runstate.state_entry_time; + ret = runstate.time[RUNSTATE_blocked] + + runstate.time[RUNSTATE_running] + + offset; + + preempt_enable(); + + return ret; +} + struct pv_time_ops xen_time_ops __initdata = { .init_missing_ticks_accounting = xen_init_missing_ticks_accounting, .do_steal_accounting = xen_do_steal_accounting, .clocksource_resume = xen_itc_jitter_data_reset, + .sched_clock = xen_sched_clock, }; /* Called after suspend, to resume time. */ From e4ff5b8f545811008123dd9556a51d814f562fcf Mon Sep 17 00:00:00 2001 From: Isaku Yamahata Date: Wed, 4 Mar 2009 21:05:42 +0900 Subject: [PATCH 174/478] ia64/pv_ops: gate page paravirtualization. paravirtualize gate page by allowing each pv_ops instances to define its own gate page. Signed-off-by: Isaku Yamahata Signed-off-by: Tony Luck --- arch/ia64/include/asm/native/patchlist.h | 38 ++++++++++++ arch/ia64/include/asm/paravirt.h | 35 +++++++++++ arch/ia64/kernel/Makefile | 32 ++-------- arch/ia64/kernel/Makefile.gate | 27 ++++++++ arch/ia64/kernel/gate.lds.S | 17 ++--- arch/ia64/kernel/paravirt_patchlist.c | 79 ++++++++++++++++++++++++ arch/ia64/kernel/paravirt_patchlist.h | 28 +++++++++ arch/ia64/kernel/patch.c | 12 ++-- arch/ia64/mm/init.c | 6 +- 9 files changed, 231 insertions(+), 43 deletions(-) create mode 100644 arch/ia64/include/asm/native/patchlist.h create mode 100644 arch/ia64/kernel/Makefile.gate create mode 100644 arch/ia64/kernel/paravirt_patchlist.c create mode 100644 arch/ia64/kernel/paravirt_patchlist.h diff --git a/arch/ia64/include/asm/native/patchlist.h b/arch/ia64/include/asm/native/patchlist.h new file mode 100644 index 000000000000..be16ca9311bf --- /dev/null +++ b/arch/ia64/include/asm/native/patchlist.h @@ -0,0 +1,38 @@ +/****************************************************************************** + * arch/ia64/include/asm/native/inst.h + * + * Copyright (c) 2008 Isaku Yamahata + * VA Linux Systems Japan K.K. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + */ + +#define __paravirt_start_gate_fsyscall_patchlist \ + __ia64_native_start_gate_fsyscall_patchlist +#define __paravirt_end_gate_fsyscall_patchlist \ + __ia64_native_end_gate_fsyscall_patchlist +#define __paravirt_start_gate_brl_fsys_bubble_down_patchlist \ + __ia64_native_start_gate_brl_fsys_bubble_down_patchlist +#define __paravirt_end_gate_brl_fsys_bubble_down_patchlist \ + __ia64_native_end_gate_brl_fsys_bubble_down_patchlist +#define __paravirt_start_gate_vtop_patchlist \ + __ia64_native_start_gate_vtop_patchlist +#define __paravirt_end_gate_vtop_patchlist \ + __ia64_native_end_gate_vtop_patchlist +#define __paravirt_start_gate_mckinley_e9_patchlist \ + __ia64_native_start_gate_mckinley_e9_patchlist +#define __paravirt_end_gate_mckinley_e9_patchlist \ + __ia64_native_end_gate_mckinley_e9_patchlist diff --git a/arch/ia64/include/asm/paravirt.h b/arch/ia64/include/asm/paravirt.h index a73e77add7e2..fc433f6c3275 100644 --- a/arch/ia64/include/asm/paravirt.h +++ b/arch/ia64/include/asm/paravirt.h @@ -35,6 +35,41 @@ extern struct pv_fsys_data pv_fsys_data; unsigned long *paravirt_get_fsyscall_table(void); char *paravirt_get_fsys_bubble_down(void); + +/****************************************************************************** + * patchlist addresses for gate page + */ +enum pv_gate_patchlist { + PV_GATE_START_FSYSCALL, + PV_GATE_END_FSYSCALL, + + PV_GATE_START_BRL_FSYS_BUBBLE_DOWN, + PV_GATE_END_BRL_FSYS_BUBBLE_DOWN, + + PV_GATE_START_VTOP, + PV_GATE_END_VTOP, + + PV_GATE_START_MCKINLEY_E9, + PV_GATE_END_MCKINLEY_E9, +}; + +struct pv_patchdata { + unsigned long start_fsyscall_patchlist; + unsigned long end_fsyscall_patchlist; + unsigned long start_brl_fsys_bubble_down_patchlist; + unsigned long end_brl_fsys_bubble_down_patchlist; + unsigned long start_vtop_patchlist; + unsigned long end_vtop_patchlist; + unsigned long start_mckinley_e9_patchlist; + unsigned long end_mckinley_e9_patchlist; + + void *gate_section; +}; + +extern struct pv_patchdata pv_patchdata; + +unsigned long paravirt_get_gate_patchlist(enum pv_gate_patchlist type); +void *paravirt_get_gate_section(void); #endif #ifdef CONFIG_PARAVIRT_GUEST diff --git a/arch/ia64/kernel/Makefile b/arch/ia64/kernel/Makefile index 1ab150ec8cea..8dc9df8a87a5 100644 --- a/arch/ia64/kernel/Makefile +++ b/arch/ia64/kernel/Makefile @@ -5,7 +5,7 @@ extra-y := head.o init_task.o vmlinux.lds obj-y := acpi.o entry.o efi.o efi_stub.o gate-data.o fsys.o ia64_ksyms.o irq.o irq_ia64.o \ - irq_lsapic.o ivt.o machvec.o pal.o patch.o process.o perfmon.o ptrace.o sal.o \ + irq_lsapic.o ivt.o machvec.o pal.o paravirt_patchlist.o patch.o process.o perfmon.o ptrace.o sal.o \ salinfo.o setup.o signal.o sys_ia64.o time.o traps.o unaligned.o \ unwind.o mca.o mca_asm.o topology.o @@ -47,35 +47,13 @@ ifeq ($(CONFIG_DMAR), y) obj-$(CONFIG_SWIOTLB) += pci-swiotlb.o endif -# The gate DSO image is built using a special linker script. -targets += gate.so gate-syms.o - -extra-y += gate.so gate-syms.o gate.lds gate.o - # fp_emulate() expects f2-f5,f16-f31 to contain the user-level state. CFLAGS_traps.o += -mfixed-range=f2-f5,f16-f31 -CPPFLAGS_gate.lds := -P -C -U$(ARCH) - -quiet_cmd_gate = GATE $@ - cmd_gate = $(CC) -nostdlib $(GATECFLAGS_$(@F)) -Wl,-T,$(filter-out FORCE,$^) -o $@ - -GATECFLAGS_gate.so = -shared -s -Wl,-soname=linux-gate.so.1 \ - $(call ld-option, -Wl$(comma)--hash-style=sysv) -$(obj)/gate.so: $(obj)/gate.lds $(obj)/gate.o FORCE - $(call if_changed,gate) - -$(obj)/built-in.o: $(obj)/gate-syms.o -$(obj)/built-in.o: ld_flags += -R $(obj)/gate-syms.o - -GATECFLAGS_gate-syms.o = -r -$(obj)/gate-syms.o: $(obj)/gate.lds $(obj)/gate.o FORCE - $(call if_changed,gate) - -# gate-data.o contains the gate DSO image as data in section .data.gate. -# We must build gate.so before we can assemble it. -# Note: kbuild does not track this dependency due to usage of .incbin -$(obj)/gate-data.o: $(obj)/gate.so +# The gate DSO image is built using a special linker script. +include $(srctree)/arch/ia64/kernel/Makefile.gate +# tell compiled for native +CPPFLAGS_gate.lds += -D__IA64_GATE_PARAVIRTUALIZED_NATIVE # Calculate NR_IRQ = max(IA64_NATIVE_NR_IRQS, XEN_NR_IRQS, ...) based on config define sed-y diff --git a/arch/ia64/kernel/Makefile.gate b/arch/ia64/kernel/Makefile.gate new file mode 100644 index 000000000000..1d87f84069b3 --- /dev/null +++ b/arch/ia64/kernel/Makefile.gate @@ -0,0 +1,27 @@ +# The gate DSO image is built using a special linker script. + +targets += gate.so gate-syms.o + +extra-y += gate.so gate-syms.o gate.lds gate.o + +CPPFLAGS_gate.lds := -P -C -U$(ARCH) + +quiet_cmd_gate = GATE $@ + cmd_gate = $(CC) -nostdlib $(GATECFLAGS_$(@F)) -Wl,-T,$(filter-out FORCE,$^) -o $@ + +GATECFLAGS_gate.so = -shared -s -Wl,-soname=linux-gate.so.1 \ + $(call ld-option, -Wl$(comma)--hash-style=sysv) +$(obj)/gate.so: $(obj)/gate.lds $(obj)/gate.o FORCE + $(call if_changed,gate) + +$(obj)/built-in.o: $(obj)/gate-syms.o +$(obj)/built-in.o: ld_flags += -R $(obj)/gate-syms.o + +GATECFLAGS_gate-syms.o = -r +$(obj)/gate-syms.o: $(obj)/gate.lds $(obj)/gate.o FORCE + $(call if_changed,gate) + +# gate-data.o contains the gate DSO image as data in section .data.gate. +# We must build gate.so before we can assemble it. +# Note: kbuild does not track this dependency due to usage of .incbin +$(obj)/gate-data.o: $(obj)/gate.so diff --git a/arch/ia64/kernel/gate.lds.S b/arch/ia64/kernel/gate.lds.S index 3cb1abc00e24..88c64ed47c36 100644 --- a/arch/ia64/kernel/gate.lds.S +++ b/arch/ia64/kernel/gate.lds.S @@ -7,6 +7,7 @@ #include +#include "paravirt_patchlist.h" SECTIONS { @@ -33,21 +34,21 @@ SECTIONS . = GATE_ADDR + 0x600; .data.patch : { - __start_gate_mckinley_e9_patchlist = .; + __paravirt_start_gate_mckinley_e9_patchlist = .; *(.data.patch.mckinley_e9) - __end_gate_mckinley_e9_patchlist = .; + __paravirt_end_gate_mckinley_e9_patchlist = .; - __start_gate_vtop_patchlist = .; + __paravirt_start_gate_vtop_patchlist = .; *(.data.patch.vtop) - __end_gate_vtop_patchlist = .; + __paravirt_end_gate_vtop_patchlist = .; - __start_gate_fsyscall_patchlist = .; + __paravirt_start_gate_fsyscall_patchlist = .; *(.data.patch.fsyscall_table) - __end_gate_fsyscall_patchlist = .; + __paravirt_end_gate_fsyscall_patchlist = .; - __start_gate_brl_fsys_bubble_down_patchlist = .; + __paravirt_start_gate_brl_fsys_bubble_down_patchlist = .; *(.data.patch.brl_fsys_bubble_down) - __end_gate_brl_fsys_bubble_down_patchlist = .; + __paravirt_end_gate_brl_fsys_bubble_down_patchlist = .; } :readable .IA_64.unwind_info : { *(.IA_64.unwind_info*) } diff --git a/arch/ia64/kernel/paravirt_patchlist.c b/arch/ia64/kernel/paravirt_patchlist.c new file mode 100644 index 000000000000..b28082a95d45 --- /dev/null +++ b/arch/ia64/kernel/paravirt_patchlist.c @@ -0,0 +1,79 @@ +/****************************************************************************** + * Copyright (c) 2008 Isaku Yamahata + * VA Linux Systems Japan K.K. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + */ + +#include +#include + +#define DECLARE(name) \ + extern unsigned long \ + __ia64_native_start_gate_##name##_patchlist[]; \ + extern unsigned long \ + __ia64_native_end_gate_##name##_patchlist[] + +DECLARE(fsyscall); +DECLARE(brl_fsys_bubble_down); +DECLARE(vtop); +DECLARE(mckinley_e9); + +extern unsigned long __start_gate_section[]; + +#define ASSIGN(name) \ + .start_##name##_patchlist = \ + (unsigned long)__ia64_native_start_gate_##name##_patchlist, \ + .end_##name##_patchlist = \ + (unsigned long)__ia64_native_end_gate_##name##_patchlist + +struct pv_patchdata pv_patchdata __initdata = { + ASSIGN(fsyscall), + ASSIGN(brl_fsys_bubble_down), + ASSIGN(vtop), + ASSIGN(mckinley_e9), + + .gate_section = (void*)__start_gate_section, +}; + + +unsigned long __init +paravirt_get_gate_patchlist(enum pv_gate_patchlist type) +{ + +#define CASE(NAME, name) \ + case PV_GATE_START_##NAME: \ + return pv_patchdata.start_##name##_patchlist; \ + case PV_GATE_END_##NAME: \ + return pv_patchdata.end_##name##_patchlist; \ + + switch (type) { + CASE(FSYSCALL, fsyscall); + CASE(BRL_FSYS_BUBBLE_DOWN, brl_fsys_bubble_down); + CASE(VTOP, vtop); + CASE(MCKINLEY_E9, mckinley_e9); + default: + BUG(); + break; + } + return 0; +} + +void * __init +paravirt_get_gate_section(void) +{ + return pv_patchdata.gate_section; +} diff --git a/arch/ia64/kernel/paravirt_patchlist.h b/arch/ia64/kernel/paravirt_patchlist.h new file mode 100644 index 000000000000..0684aa6c6507 --- /dev/null +++ b/arch/ia64/kernel/paravirt_patchlist.h @@ -0,0 +1,28 @@ +/****************************************************************************** + * linux/arch/ia64/xen/paravirt_patchlist.h + * + * Copyright (c) 2008 Isaku Yamahata + * VA Linux Systems Japan K.K. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + */ + +#if defined(__IA64_GATE_PARAVIRTUALIZED_XEN) +#include +#else +#include +#endif + diff --git a/arch/ia64/kernel/patch.c b/arch/ia64/kernel/patch.c index 02dd977436fc..64c6f95daa34 100644 --- a/arch/ia64/kernel/patch.c +++ b/arch/ia64/kernel/patch.c @@ -227,13 +227,13 @@ patch_brl_fsys_bubble_down (unsigned long start, unsigned long end) void __init ia64_patch_gate (void) { -# define START(name) ((unsigned long) __start_gate_##name##_patchlist) -# define END(name) ((unsigned long)__end_gate_##name##_patchlist) +# define START(name) paravirt_get_gate_patchlist(PV_GATE_START_##name) +# define END(name) paravirt_get_gate_patchlist(PV_GATE_END_##name) - patch_fsyscall_table(START(fsyscall), END(fsyscall)); - patch_brl_fsys_bubble_down(START(brl_fsys_bubble_down), END(brl_fsys_bubble_down)); - ia64_patch_vtop(START(vtop), END(vtop)); - ia64_patch_mckinley_e9(START(mckinley_e9), END(mckinley_e9)); + patch_fsyscall_table(START(FSYSCALL), END(FSYSCALL)); + patch_brl_fsys_bubble_down(START(BRL_FSYS_BUBBLE_DOWN), END(BRL_FSYS_BUBBLE_DOWN)); + ia64_patch_vtop(START(VTOP), END(VTOP)); + ia64_patch_mckinley_e9(START(MCKINLEY_E9), END(MCKINLEY_E9)); } void ia64_patch_phys_stack_reg(unsigned long val) diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c index c9bc5b305ffa..8503d534794f 100644 --- a/arch/ia64/mm/init.c +++ b/arch/ia64/mm/init.c @@ -260,6 +260,7 @@ put_kernel_page (struct page *page, unsigned long address, pgprot_t pgprot) static void __init setup_gate (void) { + void *gate_section; struct page *page; /* @@ -267,10 +268,11 @@ setup_gate (void) * headers etc. and once execute-only page to enable * privilege-promotion via "epc": */ - page = virt_to_page(ia64_imva(__start_gate_section)); + gate_section = paravirt_get_gate_section(); + page = virt_to_page(ia64_imva(gate_section)); put_kernel_page(page, GATE_ADDR, PAGE_READONLY); #ifdef HAVE_BUGGY_SEGREL - page = virt_to_page(ia64_imva(__start_gate_section + PAGE_SIZE)); + page = virt_to_page(ia64_imva(gate_section + PAGE_SIZE)); put_kernel_page(page, GATE_ADDR + PAGE_SIZE, PAGE_GATE); #else put_kernel_page(page, GATE_ADDR + PERCPU_PAGE_SIZE, PAGE_GATE); From b937dd76d07f2347684d6cc1e1ec4e2746417357 Mon Sep 17 00:00:00 2001 From: Isaku Yamahata Date: Wed, 4 Mar 2009 21:05:43 +0900 Subject: [PATCH 175/478] ia64/pv_ops/xen: define xen specific gate page. define xen specific gate page. At this phase bits in the gate page is same to native. At the next phase, it will be paravirtualized. Signed-off-by: Isaku Yamahata Signed-off-by: Tony Luck --- arch/ia64/include/asm/xen/patchlist.h | 38 +++++++++++++++++++++++++++ arch/ia64/kernel/vmlinux.lds.S | 6 +++++ arch/ia64/xen/Makefile | 16 ++++++++++- arch/ia64/xen/gate-data.S | 3 +++ arch/ia64/xen/xen_pv_ops.c | 32 ++++++++++++++++++++++ 5 files changed, 94 insertions(+), 1 deletion(-) create mode 100644 arch/ia64/include/asm/xen/patchlist.h create mode 100644 arch/ia64/xen/gate-data.S diff --git a/arch/ia64/include/asm/xen/patchlist.h b/arch/ia64/include/asm/xen/patchlist.h new file mode 100644 index 000000000000..eae944e88846 --- /dev/null +++ b/arch/ia64/include/asm/xen/patchlist.h @@ -0,0 +1,38 @@ +/****************************************************************************** + * arch/ia64/include/asm/xen/patchlist.h + * + * Copyright (c) 2008 Isaku Yamahata + * VA Linux Systems Japan K.K. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + */ + +#define __paravirt_start_gate_fsyscall_patchlist \ + __xen_start_gate_fsyscall_patchlist +#define __paravirt_end_gate_fsyscall_patchlist \ + __xen_end_gate_fsyscall_patchlist +#define __paravirt_start_gate_brl_fsys_bubble_down_patchlist \ + __xen_start_gate_brl_fsys_bubble_down_patchlist +#define __paravirt_end_gate_brl_fsys_bubble_down_patchlist \ + __xen_end_gate_brl_fsys_bubble_down_patchlist +#define __paravirt_start_gate_vtop_patchlist \ + __xen_start_gate_vtop_patchlist +#define __paravirt_end_gate_vtop_patchlist \ + __xen_end_gate_vtop_patchlist +#define __paravirt_start_gate_mckinley_e9_patchlist \ + __xen_start_gate_mckinley_e9_patchlist +#define __paravirt_end_gate_mckinley_e9_patchlist \ + __xen_end_gate_mckinley_e9_patchlist diff --git a/arch/ia64/kernel/vmlinux.lds.S b/arch/ia64/kernel/vmlinux.lds.S index 10a7d47e8510..92ae7e8f0142 100644 --- a/arch/ia64/kernel/vmlinux.lds.S +++ b/arch/ia64/kernel/vmlinux.lds.S @@ -201,6 +201,12 @@ SECTIONS __start_gate_section = .; *(.data.gate) __stop_gate_section = .; +#ifdef CONFIG_XEN + . = ALIGN(PAGE_SIZE); + __xen_start_gate_section = .; + *(.data.gate.xen) + __xen_stop_gate_section = .; +#endif } . = ALIGN(PAGE_SIZE); /* make sure the gate page doesn't expose * kernel data diff --git a/arch/ia64/xen/Makefile b/arch/ia64/xen/Makefile index b4ca2e6c0ead..94f0d8e7d9de 100644 --- a/arch/ia64/xen/Makefile +++ b/arch/ia64/xen/Makefile @@ -3,10 +3,24 @@ # obj-y := hypercall.o xenivt.o xensetup.o xen_pv_ops.o irq_xen.o \ - hypervisor.o xencomm.o xcom_hcall.o grant-table.o time.o suspend.o + hypervisor.o xencomm.o xcom_hcall.o grant-table.o time.o suspend.o \ + gate-data.o obj-$(CONFIG_IA64_GENERIC) += machvec.o +# The gate DSO image is built using a special linker script. +include $(srctree)/arch/ia64/kernel/Makefile.gate + +# tell compiled for xen +CPPFLAGS_gate.lds += -D__IA64_GATE_PARAVIRTUALIZED_XEN + +# use same file of native. +$(obj)/gate.o: $(src)/../kernel/gate.S FORCE + $(call if_changed_dep,as_o_S) +$(obj)/gate.lds: $(src)/../kernel/gate.lds.S FORCE + $(call if_changed_dep,cpp_lds_S) + + AFLAGS_xenivt.o += -D__IA64_ASM_PARAVIRTUALIZED_XEN # xen multi compile diff --git a/arch/ia64/xen/gate-data.S b/arch/ia64/xen/gate-data.S new file mode 100644 index 000000000000..7d4830afc91d --- /dev/null +++ b/arch/ia64/xen/gate-data.S @@ -0,0 +1,3 @@ + .section .data.gate.xen, "aw" + + .incbin "arch/ia64/xen/gate.so" diff --git a/arch/ia64/xen/xen_pv_ops.c b/arch/ia64/xen/xen_pv_ops.c index d91336114344..bdf1acbce81c 100644 --- a/arch/ia64/xen/xen_pv_ops.c +++ b/arch/ia64/xen/xen_pv_ops.c @@ -178,6 +178,37 @@ struct pv_fsys_data xen_fsys_data __initdata = { .fsys_bubble_down = (void *)xen_fsys_bubble_down, }; +/*************************************************************************** + * pv_patchdata + * patchdata addresses + */ + +#define DECLARE(name) \ + extern unsigned long __xen_start_gate_##name##_patchlist[]; \ + extern unsigned long __xen_end_gate_##name##_patchlist[] + +DECLARE(fsyscall); +DECLARE(brl_fsys_bubble_down); +DECLARE(vtop); +DECLARE(mckinley_e9); + +extern unsigned long __xen_start_gate_section[]; + +#define ASSIGN(name) \ + .start_##name##_patchlist = \ + (unsigned long)__xen_start_gate_##name##_patchlist, \ + .end_##name##_patchlist = \ + (unsigned long)__xen_end_gate_##name##_patchlist + +static struct pv_patchdata xen_patchdata __initdata = { + ASSIGN(fsyscall), + ASSIGN(brl_fsys_bubble_down), + ASSIGN(vtop), + ASSIGN(mckinley_e9), + + .gate_section = (void*)__xen_start_gate_section, +}; + /*************************************************************************** * pv_cpu_ops * intrinsics hooks. @@ -447,6 +478,7 @@ xen_setup_pv_ops(void) pv_info = xen_info; pv_init_ops = xen_init_ops; pv_fsys_data = xen_fsys_data; + pv_patchdata = xen_patchdata; pv_cpu_ops = xen_cpu_ops; pv_iosapic_ops = xen_iosapic_ops; pv_irq_ops = xen_irq_ops; From 53129c5c553f8d0c45f12f15742ac112e8605ab5 Mon Sep 17 00:00:00 2001 From: Isaku Yamahata Date: Wed, 4 Mar 2009 21:05:44 +0900 Subject: [PATCH 176/478] ia64/pv_ops: move down __kernel_syscall_via_epc. Move down __kernel_syscall_via_epc to the end of the page. We want to paravirtualize only __kernel_syscall_via_epc because it includes privileged instructions. Its paravirtualization increases its symbols size. On the other hand, each paravirtualized gate must have e symbols of same value and size to native's because the page is mapped to GATE_ADDR and GATE_ADDR + PERCPU_PAGE_SIZE and vmlinux is linked to those symbols. Later to have the same symbol size, we pads NOPs at the end of __kernel_syscall_via_epc. Move it after other functions to keep symbols of other functions have same values and sizes. Signed-off-by: Isaku Yamahata Signed-off-by: Tony Luck --- arch/ia64/kernel/gate.S | 162 ++++++++++++++++++++-------------------- 1 file changed, 81 insertions(+), 81 deletions(-) diff --git a/arch/ia64/kernel/gate.S b/arch/ia64/kernel/gate.S index 74b1ccce4e84..c957228e3f1d 100644 --- a/arch/ia64/kernel/gate.S +++ b/arch/ia64/kernel/gate.S @@ -48,87 +48,6 @@ GLOBAL_ENTRY(__kernel_syscall_via_break) } END(__kernel_syscall_via_break) -/* - * On entry: - * r11 = saved ar.pfs - * r15 = system call # - * b0 = saved return address - * b6 = return address - * On exit: - * r11 = saved ar.pfs - * r15 = system call # - * b0 = saved return address - * all other "scratch" registers: undefined - * all "preserved" registers: same as on entry - */ - -GLOBAL_ENTRY(__kernel_syscall_via_epc) - .prologue - .altrp b6 - .body -{ - /* - * Note: the kernel cannot assume that the first two instructions in this - * bundle get executed. The remaining code must be safe even if - * they do not get executed. - */ - adds r17=-1024,r15 // A - mov r10=0 // A default to successful syscall execution - epc // B causes split-issue -} - ;; - rsm psr.be | psr.i // M2 (5 cyc to srlz.d) - LOAD_FSYSCALL_TABLE(r14) // X - ;; - mov r16=IA64_KR(CURRENT) // M2 (12 cyc) - shladd r18=r17,3,r14 // A - mov r19=NR_syscalls-1 // A - ;; - lfetch [r18] // M0|1 - mov r29=psr // M2 (12 cyc) - // If r17 is a NaT, p6 will be zero - cmp.geu p6,p7=r19,r17 // A (sysnr > 0 && sysnr < 1024+NR_syscalls)? - ;; - mov r21=ar.fpsr // M2 (12 cyc) - tnat.nz p10,p9=r15 // I0 - mov.i r26=ar.pfs // I0 (would stall anyhow due to srlz.d...) - ;; - srlz.d // M0 (forces split-issue) ensure PSR.BE==0 -(p6) ld8 r18=[r18] // M0|1 - nop.i 0 - ;; - nop.m 0 -(p6) tbit.z.unc p8,p0=r18,0 // I0 (dual-issues with "mov b7=r18"!) - nop.i 0 - ;; -(p8) ssm psr.i -(p6) mov b7=r18 // I0 -(p8) br.dptk.many b7 // B - - mov r27=ar.rsc // M2 (12 cyc) -/* - * brl.cond doesn't work as intended because the linker would convert this branch - * into a branch to a PLT. Perhaps there will be a way to avoid this with some - * future version of the linker. In the meantime, we just use an indirect branch - * instead. - */ -#ifdef CONFIG_ITANIUM -(p6) add r14=-8,r14 // r14 <- addr of fsys_bubble_down entry - ;; -(p6) ld8 r14=[r14] // r14 <- fsys_bubble_down - ;; -(p6) mov b7=r14 -(p6) br.sptk.many b7 -#else - BRL_COND_FSYS_BUBBLE_DOWN(p6) -#endif - ssm psr.i - mov r10=-1 -(p10) mov r8=EINVAL -(p9) mov r8=ENOSYS - FSYS_RETURN -END(__kernel_syscall_via_epc) - # define ARG0_OFF (16 + IA64_SIGFRAME_ARG0_OFFSET) # define ARG1_OFF (16 + IA64_SIGFRAME_ARG1_OFFSET) # define ARG2_OFF (16 + IA64_SIGFRAME_ARG2_OFFSET) @@ -374,3 +293,84 @@ restore_rbs: // invala not necessary as that will happen when returning to user-mode br.cond.sptk back_from_restore_rbs END(__kernel_sigtramp) + +/* + * On entry: + * r11 = saved ar.pfs + * r15 = system call # + * b0 = saved return address + * b6 = return address + * On exit: + * r11 = saved ar.pfs + * r15 = system call # + * b0 = saved return address + * all other "scratch" registers: undefined + * all "preserved" registers: same as on entry + */ + +GLOBAL_ENTRY(__kernel_syscall_via_epc) + .prologue + .altrp b6 + .body +{ + /* + * Note: the kernel cannot assume that the first two instructions in this + * bundle get executed. The remaining code must be safe even if + * they do not get executed. + */ + adds r17=-1024,r15 // A + mov r10=0 // A default to successful syscall execution + epc // B causes split-issue +} + ;; + rsm psr.be | psr.i // M2 (5 cyc to srlz.d) + LOAD_FSYSCALL_TABLE(r14) // X + ;; + mov r16=IA64_KR(CURRENT) // M2 (12 cyc) + shladd r18=r17,3,r14 // A + mov r19=NR_syscalls-1 // A + ;; + lfetch [r18] // M0|1 + mov r29=psr // M2 (12 cyc) + // If r17 is a NaT, p6 will be zero + cmp.geu p6,p7=r19,r17 // A (sysnr > 0 && sysnr < 1024+NR_syscalls)? + ;; + mov r21=ar.fpsr // M2 (12 cyc) + tnat.nz p10,p9=r15 // I0 + mov.i r26=ar.pfs // I0 (would stall anyhow due to srlz.d...) + ;; + srlz.d // M0 (forces split-issue) ensure PSR.BE==0 +(p6) ld8 r18=[r18] // M0|1 + nop.i 0 + ;; + nop.m 0 +(p6) tbit.z.unc p8,p0=r18,0 // I0 (dual-issues with "mov b7=r18"!) + nop.i 0 + ;; +(p8) ssm psr.i +(p6) mov b7=r18 // I0 +(p8) br.dptk.many b7 // B + + mov r27=ar.rsc // M2 (12 cyc) +/* + * brl.cond doesn't work as intended because the linker would convert this branch + * into a branch to a PLT. Perhaps there will be a way to avoid this with some + * future version of the linker. In the meantime, we just use an indirect branch + * instead. + */ +#ifdef CONFIG_ITANIUM +(p6) add r14=-8,r14 // r14 <- addr of fsys_bubble_down entry + ;; +(p6) ld8 r14=[r14] // r14 <- fsys_bubble_down + ;; +(p6) mov b7=r14 +(p6) br.sptk.many b7 +#else + BRL_COND_FSYS_BUBBLE_DOWN(p6) +#endif + ssm psr.i + mov r10=-1 +(p10) mov r8=EINVAL +(p9) mov r8=ENOSYS + FSYS_RETURN +END(__kernel_syscall_via_epc) From c4312511ba1f3a08f2f64ca8335882ef56ff9bdd Mon Sep 17 00:00:00 2001 From: Isaku Yamahata Date: Wed, 4 Mar 2009 21:05:45 +0900 Subject: [PATCH 177/478] ia64/pv_ops: paravirtualize gate.S. paravirtualize gate.S. Signed-off-by: Isaku Yamahata Signed-off-by: Tony Luck --- arch/ia64/include/asm/native/inst.h | 5 +++++ arch/ia64/include/asm/native/pvchk_inst.h | 3 +++ arch/ia64/kernel/gate.S | 17 +++++++++++++---- 3 files changed, 21 insertions(+), 4 deletions(-) diff --git a/arch/ia64/include/asm/native/inst.h b/arch/ia64/include/asm/native/inst.h index ad59fc6a6134..d2d46efb3e6e 100644 --- a/arch/ia64/include/asm/native/inst.h +++ b/arch/ia64/include/asm/native/inst.h @@ -166,6 +166,11 @@ #define RSM_PSR_DT \ rsm psr.dt +#define RSM_PSR_BE_I(clob0, clob1) \ + rsm psr.be | psr.i \ + CLOBBER(clob0) \ + CLOBBER(clob1) + #define SSM_PSR_DT_AND_SRLZ_I \ ssm psr.dt \ ;; \ diff --git a/arch/ia64/include/asm/native/pvchk_inst.h b/arch/ia64/include/asm/native/pvchk_inst.h index 13b289ea6173..8d72962ec838 100644 --- a/arch/ia64/include/asm/native/pvchk_inst.h +++ b/arch/ia64/include/asm/native/pvchk_inst.h @@ -251,6 +251,9 @@ IS_RREG_CLOB(clob2) #define RSM_PSR_DT \ nop 0 +#define RSM_PSR_BE_I(clob0, clob1) \ + IS_RREG_CLOB(clob0) \ + IS_RREG_CLOB(clob1) #define SSM_PSR_DT_AND_SRLZ_I \ nop 0 #define BSW_0(clob0, clob1, clob2) \ diff --git a/arch/ia64/kernel/gate.S b/arch/ia64/kernel/gate.S index c957228e3f1d..cf5e0a105e16 100644 --- a/arch/ia64/kernel/gate.S +++ b/arch/ia64/kernel/gate.S @@ -13,6 +13,7 @@ #include #include #include +#include "paravirt_inst.h" /* * We can't easily refer to symbols inside the kernel. To avoid full runtime relocation, @@ -323,7 +324,7 @@ GLOBAL_ENTRY(__kernel_syscall_via_epc) epc // B causes split-issue } ;; - rsm psr.be | psr.i // M2 (5 cyc to srlz.d) + RSM_PSR_BE_I(r20, r22) // M2 (5 cyc to srlz.d) LOAD_FSYSCALL_TABLE(r14) // X ;; mov r16=IA64_KR(CURRENT) // M2 (12 cyc) @@ -331,7 +332,7 @@ GLOBAL_ENTRY(__kernel_syscall_via_epc) mov r19=NR_syscalls-1 // A ;; lfetch [r18] // M0|1 - mov r29=psr // M2 (12 cyc) + MOV_FROM_PSR(p0, r29, r8) // M2 (12 cyc) // If r17 is a NaT, p6 will be zero cmp.geu p6,p7=r19,r17 // A (sysnr > 0 && sysnr < 1024+NR_syscalls)? ;; @@ -347,7 +348,7 @@ GLOBAL_ENTRY(__kernel_syscall_via_epc) (p6) tbit.z.unc p8,p0=r18,0 // I0 (dual-issues with "mov b7=r18"!) nop.i 0 ;; -(p8) ssm psr.i + SSM_PSR_I(p8, p14, r25) (p6) mov b7=r18 // I0 (p8) br.dptk.many b7 // B @@ -368,9 +369,17 @@ GLOBAL_ENTRY(__kernel_syscall_via_epc) #else BRL_COND_FSYS_BUBBLE_DOWN(p6) #endif - ssm psr.i + SSM_PSR_I(p0, p14, r10) mov r10=-1 (p10) mov r8=EINVAL (p9) mov r8=ENOSYS FSYS_RETURN + +#ifdef CONFIG_PARAVIRT + /* + * padd to make the size of this symbol constant + * independent of paravirtualization. + */ + .align PAGE_SIZE / 8 +#endif END(__kernel_syscall_via_epc) From f8de2ec678fa09276cf7ad02838eb80e86c73097 Mon Sep 17 00:00:00 2001 From: Isaku Yamahata Date: Wed, 4 Mar 2009 21:05:46 +0900 Subject: [PATCH 178/478] ia64/pv_ops/xen/gate.S: xen gate page paravirtualization xen gate page paravirtualization Signed-off-by: Isaku Yamahata Signed-off-by: Tony Luck --- arch/ia64/include/asm/xen/inst.h | 4 ++++ arch/ia64/xen/Makefile | 1 + 2 files changed, 5 insertions(+) diff --git a/arch/ia64/include/asm/xen/inst.h b/arch/ia64/include/asm/xen/inst.h index 90537dc15efe..c53a47611208 100644 --- a/arch/ia64/include/asm/xen/inst.h +++ b/arch/ia64/include/asm/xen/inst.h @@ -386,6 +386,10 @@ #define RSM_PSR_DT \ XEN_HYPER_RSM_PSR_DT +#define RSM_PSR_BE_I(clob0, clob1) \ + RSM_PSR_I(p0, clob0, clob1); \ + rum psr.be + #define SSM_PSR_DT_AND_SRLZ_I \ XEN_HYPER_SSM_PSR_DT diff --git a/arch/ia64/xen/Makefile b/arch/ia64/xen/Makefile index 94f0d8e7d9de..e6f4a0a74228 100644 --- a/arch/ia64/xen/Makefile +++ b/arch/ia64/xen/Makefile @@ -13,6 +13,7 @@ include $(srctree)/arch/ia64/kernel/Makefile.gate # tell compiled for xen CPPFLAGS_gate.lds += -D__IA64_GATE_PARAVIRTUALIZED_XEN +AFLAGS_gate.o += -D__IA64_ASM_PARAVIRTUALIZED_XEN -D__IA64_GATE_PARAVIRTUALIZED_XEN # use same file of native. $(obj)/gate.o: $(src)/../kernel/gate.S FORCE From bf7ab02f620c1020c869fc71a2c855918b6a5375 Mon Sep 17 00:00:00 2001 From: Isaku Yamahata Date: Wed, 4 Mar 2009 21:06:51 +0900 Subject: [PATCH 179/478] ia64/pv_op/binarypatch: add helper functions to support binary patching for paravirt_ops. add helper functions to support binary patching for paravirt_ops. Signed-off-by: Isaku Yamahata Signed-off-by: Tony Luck --- arch/ia64/include/asm/paravirt_patch.h | 143 +++++++ arch/ia64/kernel/paravirt_patch.c | 514 +++++++++++++++++++++++++ arch/ia64/kernel/paravirtentry.S | 56 +++ arch/ia64/kernel/vmlinux.lds.S | 24 ++ 4 files changed, 737 insertions(+) create mode 100644 arch/ia64/include/asm/paravirt_patch.h create mode 100644 arch/ia64/kernel/paravirt_patch.c diff --git a/arch/ia64/include/asm/paravirt_patch.h b/arch/ia64/include/asm/paravirt_patch.h new file mode 100644 index 000000000000..128ff5db6e67 --- /dev/null +++ b/arch/ia64/include/asm/paravirt_patch.h @@ -0,0 +1,143 @@ +/****************************************************************************** + * Copyright (c) 2008 Isaku Yamahata + * VA Linux Systems Japan K.K. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + */ + +#ifndef __ASM_PARAVIRT_PATCH_H +#define __ASM_PARAVIRT_PATCH_H + +#ifdef __ASSEMBLY__ + + .section .paravirt_branches, "a" + .previous +#define PARAVIRT_PATCH_SITE_BR(type) \ + { \ + [1:] ; \ + br.cond.sptk.many 2f ; \ + nop.b 0 ; \ + nop.b 0;; ; \ + } ; \ + 2: \ + .xdata8 ".paravirt_branches", 1b, type + +#else + +#include +#include + +/* for binary patch */ +struct paravirt_patch_site_bundle { + void *sbundle; + void *ebundle; + unsigned long type; +}; + +/* label means the beginning of new bundle */ +#define paravirt_alt_bundle(instr, privop) \ + "\t998:\n" \ + "\t" instr "\n" \ + "\t999:\n" \ + "\t.pushsection .paravirt_bundles, \"a\"\n" \ + "\t.popsection\n" \ + "\t.xdata8 \".paravirt_bundles\", 998b, 999b, " \ + __stringify(privop) "\n" + + +struct paravirt_patch_bundle_elem { + const void *sbundle; + const void *ebundle; + unsigned long type; +}; + + +struct paravirt_patch_site_inst { + unsigned long stag; + unsigned long etag; + unsigned long type; +}; + +#define paravirt_alt_inst(instr, privop) \ + "\t[998:]\n" \ + "\t" instr "\n" \ + "\t[999:]\n" \ + "\t.pushsection .paravirt_insts, \"a\"\n" \ + "\t.popsection\n" \ + "\t.xdata8 \".paravirt_insts\", 998b, 999b, " \ + __stringify(privop) "\n" + +struct paravirt_patch_site_branch { + unsigned long tag; + unsigned long type; +}; + +struct paravirt_patch_branch_target { + const void *entry; + unsigned long type; +}; + +void +__paravirt_patch_apply_branch( + unsigned long tag, unsigned long type, + const struct paravirt_patch_branch_target *entries, + unsigned int nr_entries); + +void +paravirt_patch_reloc_br(unsigned long tag, const void *target); + +void +paravirt_patch_reloc_brl(unsigned long tag, const void *target); + + +#if defined(ASM_SUPPORTED) && defined(CONFIG_PARAVIRT) +unsigned long +ia64_native_patch_bundle(void *sbundle, void *ebundle, unsigned long type); + +unsigned long +__paravirt_patch_apply_bundle(void *sbundle, void *ebundle, unsigned long type, + const struct paravirt_patch_bundle_elem *elems, + unsigned long nelems, + const struct paravirt_patch_bundle_elem **found); + +void +paravirt_patch_apply_bundle(const struct paravirt_patch_site_bundle *start, + const struct paravirt_patch_site_bundle *end); + +void +paravirt_patch_apply_inst(const struct paravirt_patch_site_inst *start, + const struct paravirt_patch_site_inst *end); + +void paravirt_patch_apply(void); +#else +#define paravirt_patch_apply_bundle(start, end) do { } while (0) +#define paravirt_patch_apply_inst(start, end) do { } while (0) +#define paravirt_patch_apply() do { } while (0) +#endif + +#endif /* !__ASSEMBLEY__ */ + +#endif /* __ASM_PARAVIRT_PATCH_H */ + +/* + * Local variables: + * mode: C + * c-set-style: "linux" + * c-basic-offset: 8 + * tab-width: 8 + * indent-tabs-mode: t + * End: + */ diff --git a/arch/ia64/kernel/paravirt_patch.c b/arch/ia64/kernel/paravirt_patch.c new file mode 100644 index 000000000000..bfdfef1b1ffd --- /dev/null +++ b/arch/ia64/kernel/paravirt_patch.c @@ -0,0 +1,514 @@ +/****************************************************************************** + * linux/arch/ia64/xen/paravirt_patch.c + * + * Copyright (c) 2008 Isaku Yamahata + * VA Linux Systems Japan K.K. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + */ + +#include +#include +#include +#include +#include + +typedef union ia64_inst { + struct { + unsigned long long qp : 6; + unsigned long long : 31; + unsigned long long opcode : 4; + unsigned long long reserved : 23; + } generic; + unsigned long long l; +} ia64_inst_t; + +/* + * flush_icache_range() can't be used here. + * we are here before cpu_init() which initializes + * ia64_i_cache_stride_shift. flush_icache_range() uses it. + */ +void __init_or_module +paravirt_flush_i_cache_range(const void *instr, unsigned long size) +{ + extern void paravirt_fc_i(const void *addr); + unsigned long i; + + for (i = 0; i < size; i += sizeof(bundle_t)) + paravirt_fc_i(instr + i); +} + +bundle_t* __init_or_module +paravirt_get_bundle(unsigned long tag) +{ + return (bundle_t *)(tag & ~3UL); +} + +unsigned long __init_or_module +paravirt_get_slot(unsigned long tag) +{ + return tag & 3UL; +} + +unsigned long __init_or_module +paravirt_get_num_inst(unsigned long stag, unsigned long etag) +{ + bundle_t *sbundle = paravirt_get_bundle(stag); + unsigned long sslot = paravirt_get_slot(stag); + bundle_t *ebundle = paravirt_get_bundle(etag); + unsigned long eslot = paravirt_get_slot(etag); + + return (ebundle - sbundle) * 3 + eslot - sslot + 1; +} + +unsigned long __init_or_module +paravirt_get_next_tag(unsigned long tag) +{ + unsigned long slot = paravirt_get_slot(tag); + + switch (slot) { + case 0: + case 1: + return tag + 1; + case 2: { + bundle_t *bundle = paravirt_get_bundle(tag); + return (unsigned long)(bundle + 1); + } + default: + BUG(); + } + /* NOTREACHED */ +} + +ia64_inst_t __init_or_module +paravirt_read_slot0(const bundle_t *bundle) +{ + ia64_inst_t inst; + inst.l = bundle->quad0.slot0; + return inst; +} + +ia64_inst_t __init_or_module +paravirt_read_slot1(const bundle_t *bundle) +{ + ia64_inst_t inst; + inst.l = bundle->quad0.slot1_p0 | + ((unsigned long long)bundle->quad1.slot1_p1 << 18UL); + return inst; +} + +ia64_inst_t __init_or_module +paravirt_read_slot2(const bundle_t *bundle) +{ + ia64_inst_t inst; + inst.l = bundle->quad1.slot2; + return inst; +} + +ia64_inst_t __init_or_module +paravirt_read_inst(unsigned long tag) +{ + bundle_t *bundle = paravirt_get_bundle(tag); + unsigned long slot = paravirt_get_slot(tag); + + switch (slot) { + case 0: + return paravirt_read_slot0(bundle); + case 1: + return paravirt_read_slot1(bundle); + case 2: + return paravirt_read_slot2(bundle); + default: + BUG(); + } + /* NOTREACHED */ +} + +void __init_or_module +paravirt_write_slot0(bundle_t *bundle, ia64_inst_t inst) +{ + bundle->quad0.slot0 = inst.l; +} + +void __init_or_module +paravirt_write_slot1(bundle_t *bundle, ia64_inst_t inst) +{ + bundle->quad0.slot1_p0 = inst.l; + bundle->quad1.slot1_p1 = inst.l >> 18UL; +} + +void __init_or_module +paravirt_write_slot2(bundle_t *bundle, ia64_inst_t inst) +{ + bundle->quad1.slot2 = inst.l; +} + +void __init_or_module +paravirt_write_inst(unsigned long tag, ia64_inst_t inst) +{ + bundle_t *bundle = paravirt_get_bundle(tag); + unsigned long slot = paravirt_get_slot(tag); + + switch (slot) { + case 0: + paravirt_write_slot0(bundle, inst); + break; + case 1: + paravirt_write_slot1(bundle, inst); + break; + case 2: + paravirt_write_slot2(bundle, inst); + break; + default: + BUG(); + break; + } + paravirt_flush_i_cache_range(bundle, sizeof(*bundle)); +} + +/* for debug */ +void +paravirt_print_bundle(const bundle_t *bundle) +{ + const unsigned long *quad = (const unsigned long *)bundle; + ia64_inst_t slot0 = paravirt_read_slot0(bundle); + ia64_inst_t slot1 = paravirt_read_slot1(bundle); + ia64_inst_t slot2 = paravirt_read_slot2(bundle); + + printk(KERN_DEBUG + "bundle 0x%p 0x%016lx 0x%016lx\n", bundle, quad[0], quad[1]); + printk(KERN_DEBUG + "bundle template 0x%x\n", + bundle->quad0.template); + printk(KERN_DEBUG + "slot0 0x%lx slot1_p0 0x%lx slot1_p1 0x%lx slot2 0x%lx\n", + (unsigned long)bundle->quad0.slot0, + (unsigned long)bundle->quad0.slot1_p0, + (unsigned long)bundle->quad1.slot1_p1, + (unsigned long)bundle->quad1.slot2); + printk(KERN_DEBUG + "slot0 0x%016llx slot1 0x%016llx slot2 0x%016llx\n", + slot0.l, slot1.l, slot2.l); +} + +static int noreplace_paravirt __init_or_module = 0; + +static int __init setup_noreplace_paravirt(char *str) +{ + noreplace_paravirt = 1; + return 1; +} +__setup("noreplace-paravirt", setup_noreplace_paravirt); + +#ifdef ASM_SUPPORTED +static void __init_or_module +fill_nop_bundle(void *sbundle, void *ebundle) +{ + extern const char paravirt_nop_bundle[]; + extern const unsigned long paravirt_nop_bundle_size; + + void *bundle = sbundle; + + BUG_ON((((unsigned long)sbundle) % sizeof(bundle_t)) != 0); + BUG_ON((((unsigned long)ebundle) % sizeof(bundle_t)) != 0); + + while (bundle < ebundle) { + memcpy(bundle, paravirt_nop_bundle, paravirt_nop_bundle_size); + + bundle += paravirt_nop_bundle_size; + } +} + +/* helper function */ +unsigned long __init_or_module +__paravirt_patch_apply_bundle(void *sbundle, void *ebundle, unsigned long type, + const struct paravirt_patch_bundle_elem *elems, + unsigned long nelems, + const struct paravirt_patch_bundle_elem **found) +{ + unsigned long used = 0; + unsigned long i; + + BUG_ON((((unsigned long)sbundle) % sizeof(bundle_t)) != 0); + BUG_ON((((unsigned long)ebundle) % sizeof(bundle_t)) != 0); + + found = NULL; + for (i = 0; i < nelems; i++) { + const struct paravirt_patch_bundle_elem *p = &elems[i]; + if (p->type == type) { + unsigned long need = p->ebundle - p->sbundle; + unsigned long room = ebundle - sbundle; + + if (found != NULL) + *found = p; + + if (room < need) { + /* no room to replace. skip it */ + printk(KERN_DEBUG + "the space is too small to put " + "bundles. type %ld need %ld room %ld\n", + type, need, room); + break; + } + + used = need; + memcpy(sbundle, p->sbundle, used); + break; + } + } + + return used; +} + +void __init_or_module +paravirt_patch_apply_bundle(const struct paravirt_patch_site_bundle *start, + const struct paravirt_patch_site_bundle *end) +{ + const struct paravirt_patch_site_bundle *p; + + if (noreplace_paravirt) + return; + if (pv_init_ops.patch_bundle == NULL) + return; + + for (p = start; p < end; p++) { + unsigned long used; + + used = (*pv_init_ops.patch_bundle)(p->sbundle, p->ebundle, + p->type); + if (used == 0) + continue; + + fill_nop_bundle(p->sbundle + used, p->ebundle); + paravirt_flush_i_cache_range(p->sbundle, + p->ebundle - p->sbundle); + } + ia64_sync_i(); + ia64_srlz_i(); +} + +/* + * nop.i, nop.m, nop.f instruction are same format. + * but nop.b has differennt format. + * This doesn't support nop.b for now. + */ +static void __init_or_module +fill_nop_inst(unsigned long stag, unsigned long etag) +{ + extern const bundle_t paravirt_nop_mfi_inst_bundle[]; + unsigned long tag; + const ia64_inst_t nop_inst = + paravirt_read_slot0(paravirt_nop_mfi_inst_bundle); + + for (tag = stag; tag < etag; tag = paravirt_get_next_tag(tag)) + paravirt_write_inst(tag, nop_inst); +} + +void __init_or_module +paravirt_patch_apply_inst(const struct paravirt_patch_site_inst *start, + const struct paravirt_patch_site_inst *end) +{ + const struct paravirt_patch_site_inst *p; + + if (noreplace_paravirt) + return; + if (pv_init_ops.patch_inst == NULL) + return; + + for (p = start; p < end; p++) { + unsigned long tag; + bundle_t *sbundle; + bundle_t *ebundle; + + tag = (*pv_init_ops.patch_inst)(p->stag, p->etag, p->type); + if (tag == p->stag) + continue; + + fill_nop_inst(tag, p->etag); + sbundle = paravirt_get_bundle(p->stag); + ebundle = paravirt_get_bundle(p->etag) + 1; + paravirt_flush_i_cache_range(sbundle, (ebundle - sbundle) * + sizeof(bundle_t)); + } + ia64_sync_i(); + ia64_srlz_i(); +} +#endif /* ASM_SUPPOTED */ + +/* brl.cond.sptk.many X3 */ +typedef union inst_x3_op { + ia64_inst_t inst; + struct { + unsigned long qp: 6; + unsigned long btyp: 3; + unsigned long unused: 3; + unsigned long p: 1; + unsigned long imm20b: 20; + unsigned long wh: 2; + unsigned long d: 1; + unsigned long i: 1; + unsigned long opcode: 4; + }; + unsigned long l; +} inst_x3_op_t; + +typedef union inst_x3_imm { + ia64_inst_t inst; + struct { + unsigned long unused: 2; + unsigned long imm39: 39; + }; + unsigned long l; +} inst_x3_imm_t; + +void __init_or_module +paravirt_patch_reloc_brl(unsigned long tag, const void *target) +{ + unsigned long tag_op = paravirt_get_next_tag(tag); + unsigned long tag_imm = tag; + bundle_t *bundle = paravirt_get_bundle(tag); + + ia64_inst_t inst_op = paravirt_read_inst(tag_op); + ia64_inst_t inst_imm = paravirt_read_inst(tag_imm); + + inst_x3_op_t inst_x3_op = { .l = inst_op.l }; + inst_x3_imm_t inst_x3_imm = { .l = inst_imm.l }; + + unsigned long imm60 = + ((unsigned long)target - (unsigned long)bundle) >> 4; + + BUG_ON(paravirt_get_slot(tag) != 1); /* MLX */ + BUG_ON(((unsigned long)target & (sizeof(bundle_t) - 1)) != 0); + + /* imm60[59] 1bit */ + inst_x3_op.i = (imm60 >> 59) & 1; + /* imm60[19:0] 20bit */ + inst_x3_op.imm20b = imm60 & ((1UL << 20) - 1); + /* imm60[58:20] 39bit */ + inst_x3_imm.imm39 = (imm60 >> 20) & ((1UL << 39) - 1); + + inst_op.l = inst_x3_op.l; + inst_imm.l = inst_x3_imm.l; + + paravirt_write_inst(tag_op, inst_op); + paravirt_write_inst(tag_imm, inst_imm); +} + +/* br.cond.sptk.many B1 */ +typedef union inst_b1 { + ia64_inst_t inst; + struct { + unsigned long qp: 6; + unsigned long btype: 3; + unsigned long unused: 3; + unsigned long p: 1; + unsigned long imm20b: 20; + unsigned long wh: 2; + unsigned long d: 1; + unsigned long s: 1; + unsigned long opcode: 4; + }; + unsigned long l; +} inst_b1_t; + +void __init +paravirt_patch_reloc_br(unsigned long tag, const void *target) +{ + bundle_t *bundle = paravirt_get_bundle(tag); + ia64_inst_t inst = paravirt_read_inst(tag); + unsigned long target25 = (unsigned long)target - (unsigned long)bundle; + inst_b1_t inst_b1; + + BUG_ON(((unsigned long)target & (sizeof(bundle_t) - 1)) != 0); + + inst_b1.l = inst.l; + if (target25 & (1UL << 63)) + inst_b1.s = 1; + else + inst_b1.s = 0; + + inst_b1.imm20b = target25 >> 4; + inst.l = inst_b1.l; + + paravirt_write_inst(tag, inst); +} + +void __init +__paravirt_patch_apply_branch( + unsigned long tag, unsigned long type, + const struct paravirt_patch_branch_target *entries, + unsigned int nr_entries) +{ + unsigned int i; + for (i = 0; i < nr_entries; i++) { + if (entries[i].type == type) { + paravirt_patch_reloc_br(tag, entries[i].entry); + break; + } + } +} + +static void __init +paravirt_patch_apply_branch(const struct paravirt_patch_site_branch *start, + const struct paravirt_patch_site_branch *end) +{ + const struct paravirt_patch_site_branch *p; + + if (noreplace_paravirt) + return; + if (pv_init_ops.patch_branch == NULL) + return; + + for (p = start; p < end; p++) + (*pv_init_ops.patch_branch)(p->tag, p->type); + + ia64_sync_i(); + ia64_srlz_i(); +} + +void __init +paravirt_patch_apply(void) +{ + extern const char __start_paravirt_bundles[]; + extern const char __stop_paravirt_bundles[]; + extern const char __start_paravirt_insts[]; + extern const char __stop_paravirt_insts[]; + extern const char __start_paravirt_branches[]; + extern const char __stop_paravirt_branches[]; + + paravirt_patch_apply_bundle((const struct paravirt_patch_site_bundle *) + __start_paravirt_bundles, + (const struct paravirt_patch_site_bundle *) + __stop_paravirt_bundles); + paravirt_patch_apply_inst((const struct paravirt_patch_site_inst *) + __start_paravirt_insts, + (const struct paravirt_patch_site_inst *) + __stop_paravirt_insts); + paravirt_patch_apply_branch((const struct paravirt_patch_site_branch *) + __start_paravirt_branches, + (const struct paravirt_patch_site_branch *) + __stop_paravirt_branches); +} + +/* + * Local variables: + * mode: C + * c-set-style: "linux" + * c-basic-offset: 8 + * tab-width: 8 + * indent-tabs-mode: t + * End: + */ diff --git a/arch/ia64/kernel/paravirtentry.S b/arch/ia64/kernel/paravirtentry.S index 2f42fcb9776a..80c0d365cbc0 100644 --- a/arch/ia64/kernel/paravirtentry.S +++ b/arch/ia64/kernel/paravirtentry.S @@ -58,3 +58,59 @@ BRANCH_PROC(switch_to, r22, b7) BRANCH_PROC_UNWINFO(leave_syscall, r22, b7) BRANCH_PROC(work_processed_syscall, r2, b7) BRANCH_PROC_UNWINFO(leave_kernel, r22, b7) + + +#ifdef CONFIG_MODULES +#define __INIT_OR_MODULE .text +#define __INITDATA_OR_MODULE .data +#else +#define __INIT_OR_MODULE __INIT +#define __INITDATA_OR_MODULE __INITDATA +#endif /* CONFIG_MODULES */ + + __INIT_OR_MODULE + GLOBAL_ENTRY(paravirt_fc_i) + fc.i r32 + br.ret.sptk.many rp + END(paravirt_fc_i) + __FINIT + + __INIT_OR_MODULE + .align 32 + GLOBAL_ENTRY(paravirt_nop_b_inst_bundle) + { + nop.b 0 + nop.b 0 + nop.b 0 + } + END(paravirt_nop_b_inst_bundle) + __FINIT + + /* NOTE: nop.[mfi] has same format */ + __INIT_OR_MODULE + GLOBAL_ENTRY(paravirt_nop_mfi_inst_bundle) + { + nop.m 0 + nop.f 0 + nop.i 0 + } + END(paravirt_nop_mfi_inst_bundle) + __FINIT + + __INIT_OR_MODULE + GLOBAL_ENTRY(paravirt_nop_bundle) +paravirt_nop_bundle_start: + { + nop 0 + nop 0 + nop 0 + } +paravirt_nop_bundle_end: + END(paravirt_nop_bundle) + __FINIT + + __INITDATA_OR_MODULE + .align 8 + .global paravirt_nop_bundle_size +paravirt_nop_bundle_size: + data8 paravirt_nop_bundle_end - paravirt_nop_bundle_start diff --git a/arch/ia64/kernel/vmlinux.lds.S b/arch/ia64/kernel/vmlinux.lds.S index 92ae7e8f0142..794d168bc8a4 100644 --- a/arch/ia64/kernel/vmlinux.lds.S +++ b/arch/ia64/kernel/vmlinux.lds.S @@ -169,6 +169,30 @@ SECTIONS __end___mckinley_e9_bundles = .; } +#if defined(CONFIG_PARAVIRT) + . = ALIGN(16); + .paravirt_bundles : AT(ADDR(.paravirt_bundles) - LOAD_OFFSET) + { + __start_paravirt_bundles = .; + *(.paravirt_bundles) + __stop_paravirt_bundles = .; + } + . = ALIGN(16); + .paravirt_insts : AT(ADDR(.paravirt_insts) - LOAD_OFFSET) + { + __start_paravirt_insts = .; + *(.paravirt_insts) + __stop_paravirt_insts = .; + } + . = ALIGN(16); + .paravirt_branches : AT(ADDR(.paravirt_branches) - LOAD_OFFSET) + { + __start_paravirt_branches = .; + *(.paravirt_branches) + __stop_paravirt_branches = .; + } +#endif + #if defined(CONFIG_IA64_GENERIC) /* Machine Vector */ . = ALIGN(16); From 03f511dd02f1431ef652fb97a7f2fe7aef47e025 Mon Sep 17 00:00:00 2001 From: Isaku Yamahata Date: Wed, 4 Mar 2009 21:06:52 +0900 Subject: [PATCH 180/478] ia64/pv_ops: implement binary patching optimization for native. implement binary patching optimization for pv_cpu_ops. With this optimization, indirect call for pv_cpu_ops methods can be converted into inline execution or direct call. Signed-off-by: Isaku Yamahata Signed-off-by: Tony Luck --- arch/ia64/include/asm/intrinsics.h | 6 +- arch/ia64/include/asm/paravirt.h | 8 + arch/ia64/include/asm/paravirt_privop.h | 341 +++++++++++++++- arch/ia64/kernel/Makefile | 3 +- arch/ia64/kernel/paravirt.c | 520 +++++++++++++++++++++++- arch/ia64/kernel/paravirtentry.S | 43 +- arch/ia64/kernel/setup.c | 2 + 7 files changed, 898 insertions(+), 25 deletions(-) diff --git a/arch/ia64/include/asm/intrinsics.h b/arch/ia64/include/asm/intrinsics.h index a3e44a5ed497..fbe2ad9234d0 100644 --- a/arch/ia64/include/asm/intrinsics.h +++ b/arch/ia64/include/asm/intrinsics.h @@ -201,7 +201,11 @@ extern long ia64_cmpxchg_called_with_bad_pointer (void); #ifndef __ASSEMBLY__ #if defined(CONFIG_PARAVIRT) && defined(__KERNEL__) -#define IA64_INTRINSIC_API(name) pv_cpu_ops.name +#ifdef ASM_SUPPORTED +# define IA64_INTRINSIC_API(name) paravirt_ ## name +#else +# define IA64_INTRINSIC_API(name) pv_cpu_ops.name +#endif #define IA64_INTRINSIC_MACRO(name) paravirt_ ## name #else #define IA64_INTRINSIC_API(name) ia64_native_ ## name diff --git a/arch/ia64/include/asm/paravirt.h b/arch/ia64/include/asm/paravirt.h index fc433f6c3275..2eb0a981a09a 100644 --- a/arch/ia64/include/asm/paravirt.h +++ b/arch/ia64/include/asm/paravirt.h @@ -118,6 +118,14 @@ struct pv_init_ops { int (*arch_setup_nomca)(void); void (*post_smp_prepare_boot_cpu)(void); + +#ifdef ASM_SUPPORTED + unsigned long (*patch_bundle)(void *sbundle, void *ebundle, + unsigned long type); + unsigned long (*patch_inst)(unsigned long stag, unsigned long etag, + unsigned long type); +#endif + void (*patch_branch)(unsigned long tag, unsigned long type); }; extern struct pv_init_ops pv_init_ops; diff --git a/arch/ia64/include/asm/paravirt_privop.h b/arch/ia64/include/asm/paravirt_privop.h index 33c8e55f5775..76d6a6943e83 100644 --- a/arch/ia64/include/asm/paravirt_privop.h +++ b/arch/ia64/include/asm/paravirt_privop.h @@ -60,12 +60,18 @@ extern unsigned long ia64_native_getreg_func(int regnum); /* Instructions paravirtualized for performance */ /************************************************/ +#ifndef ASM_SUPPORTED +#define paravirt_ssm_i() pv_cpu_ops.ssm_i() +#define paravirt_rsm_i() pv_cpu_ops.rsm_i() +#define __paravirt_getreg() pv_cpu_ops.getreg() +#endif + /* mask for ia64_native_ssm/rsm() must be constant.("i" constraing). * static inline function doesn't satisfy it. */ #define paravirt_ssm(mask) \ do { \ if ((mask) == IA64_PSR_I) \ - pv_cpu_ops.ssm_i(); \ + paravirt_ssm_i(); \ else \ ia64_native_ssm(mask); \ } while (0) @@ -73,7 +79,7 @@ extern unsigned long ia64_native_getreg_func(int regnum); #define paravirt_rsm(mask) \ do { \ if ((mask) == IA64_PSR_I) \ - pv_cpu_ops.rsm_i(); \ + paravirt_rsm_i(); \ else \ ia64_native_rsm(mask); \ } while (0) @@ -86,7 +92,7 @@ extern unsigned long ia64_native_getreg_func(int regnum); if ((reg) == _IA64_REG_IP) \ res = ia64_native_getreg(_IA64_REG_IP); \ else \ - res = pv_cpu_ops.getreg(reg); \ + res = __paravirt_getreg(reg); \ res; \ }) @@ -121,4 +127,333 @@ void paravirt_cpu_asm_init(const struct pv_cpu_asm_switch *cpu_asm_switch); IA64_PARAVIRT_ASM_FUNC(work_processed_syscall) #define ia64_leave_kernel IA64_PARAVIRT_ASM_FUNC(leave_kernel) + +#if defined(CONFIG_PARAVIRT) +/****************************************************************************** + * binary patching infrastructure + */ +#define PARAVIRT_PATCH_TYPE_FC 1 +#define PARAVIRT_PATCH_TYPE_THASH 2 +#define PARAVIRT_PATCH_TYPE_GET_CPUID 3 +#define PARAVIRT_PATCH_TYPE_GET_PMD 4 +#define PARAVIRT_PATCH_TYPE_PTCGA 5 +#define PARAVIRT_PATCH_TYPE_GET_RR 6 +#define PARAVIRT_PATCH_TYPE_SET_RR 7 +#define PARAVIRT_PATCH_TYPE_SET_RR0_TO_RR4 8 +#define PARAVIRT_PATCH_TYPE_SSM_I 9 +#define PARAVIRT_PATCH_TYPE_RSM_I 10 +#define PARAVIRT_PATCH_TYPE_GET_PSR_I 11 +#define PARAVIRT_PATCH_TYPE_INTRIN_LOCAL_IRQ_RESTORE 12 + +/* PARAVIRT_PATY_TYPE_[GS]ETREG + _IA64_REG_xxx */ +#define PARAVIRT_PATCH_TYPE_GETREG 0x10000000 +#define PARAVIRT_PATCH_TYPE_SETREG 0x20000000 + +/* + * struct task_struct* (*ia64_switch_to)(void* next_task); + * void *ia64_leave_syscall; + * void *ia64_work_processed_syscall + * void *ia64_leave_kernel; + */ + +#define PARAVIRT_PATCH_TYPE_BR_START 0x30000000 +#define PARAVIRT_PATCH_TYPE_BR_SWITCH_TO \ + (PARAVIRT_PATCH_TYPE_BR_START + 0) +#define PARAVIRT_PATCH_TYPE_BR_LEAVE_SYSCALL \ + (PARAVIRT_PATCH_TYPE_BR_START + 1) +#define PARAVIRT_PATCH_TYPE_BR_WORK_PROCESSED_SYSCALL \ + (PARAVIRT_PATCH_TYPE_BR_START + 2) +#define PARAVIRT_PATCH_TYPE_BR_LEAVE_KERNEL \ + (PARAVIRT_PATCH_TYPE_BR_START + 3) + +#ifdef ASM_SUPPORTED +#include + +/* + * pv_cpu_ops calling stub. + * normal function call convension can't be written by gcc + * inline assembly. + * + * from the caller's point of view, + * the following registers will be clobbered. + * r2, r3 + * r8-r15 + * r16, r17 + * b6, b7 + * p6-p15 + * ar.ccv + * + * from the callee's point of view , + * the following registers can be used. + * r2, r3: scratch + * r8: scratch, input argument0 and return value + * r0-r15: scratch, input argument1-5 + * b6: return pointer + * b7: scratch + * p6-p15: scratch + * ar.ccv: scratch + * + * other registers must not be changed. especially + * b0: rp: preserved. gcc ignores b0 in clobbered register. + * r16: saved gp + */ +/* 5 bundles */ +#define __PARAVIRT_BR \ + ";;\n" \ + "{ .mlx\n" \ + "nop 0\n" \ + "movl r2 = %[op_addr]\n"/* get function pointer address */ \ + ";;\n" \ + "}\n" \ + "1:\n" \ + "{ .mii\n" \ + "ld8 r2 = [r2]\n" /* load function descriptor address */ \ + "mov r17 = ip\n" /* get ip to calc return address */ \ + "mov r16 = gp\n" /* save gp */ \ + ";;\n" \ + "}\n" \ + "{ .mii\n" \ + "ld8 r3 = [r2], 8\n" /* load entry address */ \ + "adds r17 = 1f - 1b, r17\n" /* calculate return address */ \ + ";;\n" \ + "mov b7 = r3\n" /* set entry address */ \ + "}\n" \ + "{ .mib\n" \ + "ld8 gp = [r2]\n" /* load gp value */ \ + "mov b6 = r17\n" /* set return address */ \ + "br.cond.sptk.few b7\n" /* intrinsics are very short isns */ \ + "}\n" \ + "1:\n" \ + "{ .mii\n" \ + "mov gp = r16\n" /* restore gp value */ \ + "nop 0\n" \ + "nop 0\n" \ + ";;\n" \ + "}\n" + +#define PARAVIRT_OP(op) \ + [op_addr] "i"(&pv_cpu_ops.op) + +#define PARAVIRT_TYPE(type) \ + PARAVIRT_PATCH_TYPE_ ## type + +#define PARAVIRT_REG_CLOBBERS0 \ + "r2", "r3", /*"r8",*/ "r9", "r10", "r11", "r14", \ + "r15", "r16", "r17" + +#define PARAVIRT_REG_CLOBBERS1 \ + "r2","r3", /*"r8",*/ "r9", "r10", "r11", "r14", \ + "r15", "r16", "r17" + +#define PARAVIRT_REG_CLOBBERS2 \ + "r2", "r3", /*"r8", "r9",*/ "r10", "r11", "r14", \ + "r15", "r16", "r17" + +#define PARAVIRT_REG_CLOBBERS5 \ + "r2", "r3", /*"r8", "r9", "r10", "r11", "r14",*/ \ + "r15", "r16", "r17" + +#define PARAVIRT_BR_CLOBBERS \ + "b6", "b7" + +#define PARAVIRT_PR_CLOBBERS \ + "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15" + +#define PARAVIRT_AR_CLOBBERS \ + "ar.ccv" + +#define PARAVIRT_CLOBBERS0 \ + PARAVIRT_REG_CLOBBERS0, \ + PARAVIRT_BR_CLOBBERS, \ + PARAVIRT_PR_CLOBBERS, \ + PARAVIRT_AR_CLOBBERS, \ + "memory" + +#define PARAVIRT_CLOBBERS1 \ + PARAVIRT_REG_CLOBBERS1, \ + PARAVIRT_BR_CLOBBERS, \ + PARAVIRT_PR_CLOBBERS, \ + PARAVIRT_AR_CLOBBERS, \ + "memory" + +#define PARAVIRT_CLOBBERS2 \ + PARAVIRT_REG_CLOBBERS2, \ + PARAVIRT_BR_CLOBBERS, \ + PARAVIRT_PR_CLOBBERS, \ + PARAVIRT_AR_CLOBBERS, \ + "memory" + +#define PARAVIRT_CLOBBERS5 \ + PARAVIRT_REG_CLOBBERS5, \ + PARAVIRT_BR_CLOBBERS, \ + PARAVIRT_PR_CLOBBERS, \ + PARAVIRT_AR_CLOBBERS, \ + "memory" + +#define PARAVIRT_BR0(op, type) \ + register unsigned long ia64_clobber asm ("r8"); \ + asm volatile (paravirt_alt_bundle(__PARAVIRT_BR, \ + PARAVIRT_TYPE(type)) \ + : "=r"(ia64_clobber) \ + : PARAVIRT_OP(op) \ + : PARAVIRT_CLOBBERS0) + +#define PARAVIRT_BR0_RET(op, type) \ + register unsigned long ia64_intri_res asm ("r8"); \ + asm volatile (paravirt_alt_bundle(__PARAVIRT_BR, \ + PARAVIRT_TYPE(type)) \ + : "=r"(ia64_intri_res) \ + : PARAVIRT_OP(op) \ + : PARAVIRT_CLOBBERS0) + +#define PARAVIRT_BR1(op, type, arg1) \ + register unsigned long __##arg1 asm ("r8") = arg1; \ + register unsigned long ia64_clobber asm ("r8"); \ + asm volatile (paravirt_alt_bundle(__PARAVIRT_BR, \ + PARAVIRT_TYPE(type)) \ + : "=r"(ia64_clobber) \ + : PARAVIRT_OP(op), "0"(__##arg1) \ + : PARAVIRT_CLOBBERS1) + +#define PARAVIRT_BR1_RET(op, type, arg1) \ + register unsigned long ia64_intri_res asm ("r8"); \ + register unsigned long __##arg1 asm ("r8") = arg1; \ + asm volatile (paravirt_alt_bundle(__PARAVIRT_BR, \ + PARAVIRT_TYPE(type)) \ + : "=r"(ia64_intri_res) \ + : PARAVIRT_OP(op), "0"(__##arg1) \ + : PARAVIRT_CLOBBERS1) + +#define PARAVIRT_BR2(op, type, arg1, arg2) \ + register unsigned long __##arg1 asm ("r8") = arg1; \ + register unsigned long __##arg2 asm ("r9") = arg2; \ + register unsigned long ia64_clobber1 asm ("r8"); \ + register unsigned long ia64_clobber2 asm ("r9"); \ + asm volatile (paravirt_alt_bundle(__PARAVIRT_BR, \ + PARAVIRT_TYPE(type)) \ + : "=r"(ia64_clobber1), "=r"(ia64_clobber2) \ + : PARAVIRT_OP(op), "0"(__##arg1), "1"(__##arg2) \ + : PARAVIRT_CLOBBERS2) + + +#define PARAVIRT_DEFINE_CPU_OP0(op, type) \ + static inline void \ + paravirt_ ## op (void) \ + { \ + PARAVIRT_BR0(op, type); \ + } + +#define PARAVIRT_DEFINE_CPU_OP0_RET(op, type) \ + static inline unsigned long \ + paravirt_ ## op (void) \ + { \ + PARAVIRT_BR0_RET(op, type); \ + return ia64_intri_res; \ + } + +#define PARAVIRT_DEFINE_CPU_OP1(op, type) \ + static inline void \ + paravirt_ ## op (unsigned long arg1) \ + { \ + PARAVIRT_BR1(op, type, arg1); \ + } + +#define PARAVIRT_DEFINE_CPU_OP1_RET(op, type) \ + static inline unsigned long \ + paravirt_ ## op (unsigned long arg1) \ + { \ + PARAVIRT_BR1_RET(op, type, arg1); \ + return ia64_intri_res; \ + } + +#define PARAVIRT_DEFINE_CPU_OP2(op, type) \ + static inline void \ + paravirt_ ## op (unsigned long arg1, \ + unsigned long arg2) \ + { \ + PARAVIRT_BR2(op, type, arg1, arg2); \ + } + + +PARAVIRT_DEFINE_CPU_OP1(fc, FC); +PARAVIRT_DEFINE_CPU_OP1_RET(thash, THASH) +PARAVIRT_DEFINE_CPU_OP1_RET(get_cpuid, GET_CPUID) +PARAVIRT_DEFINE_CPU_OP1_RET(get_pmd, GET_PMD) +PARAVIRT_DEFINE_CPU_OP2(ptcga, PTCGA) +PARAVIRT_DEFINE_CPU_OP1_RET(get_rr, GET_RR) +PARAVIRT_DEFINE_CPU_OP2(set_rr, SET_RR) +PARAVIRT_DEFINE_CPU_OP0(ssm_i, SSM_I) +PARAVIRT_DEFINE_CPU_OP0(rsm_i, RSM_I) +PARAVIRT_DEFINE_CPU_OP0_RET(get_psr_i, GET_PSR_I) +PARAVIRT_DEFINE_CPU_OP1(intrin_local_irq_restore, INTRIN_LOCAL_IRQ_RESTORE) + +static inline void +paravirt_set_rr0_to_rr4(unsigned long val0, unsigned long val1, + unsigned long val2, unsigned long val3, + unsigned long val4) +{ + register unsigned long __val0 asm ("r8") = val0; + register unsigned long __val1 asm ("r9") = val1; + register unsigned long __val2 asm ("r10") = val2; + register unsigned long __val3 asm ("r11") = val3; + register unsigned long __val4 asm ("r14") = val4; + + register unsigned long ia64_clobber0 asm ("r8"); + register unsigned long ia64_clobber1 asm ("r9"); + register unsigned long ia64_clobber2 asm ("r10"); + register unsigned long ia64_clobber3 asm ("r11"); + register unsigned long ia64_clobber4 asm ("r14"); + + asm volatile (paravirt_alt_bundle(__PARAVIRT_BR, + PARAVIRT_TYPE(SET_RR0_TO_RR4)) + : "=r"(ia64_clobber0), + "=r"(ia64_clobber1), + "=r"(ia64_clobber2), + "=r"(ia64_clobber3), + "=r"(ia64_clobber4) + : PARAVIRT_OP(set_rr0_to_rr4), + "0"(__val0), "1"(__val1), "2"(__val2), + "3"(__val3), "4"(__val4) + : PARAVIRT_CLOBBERS5); +} + +/* unsigned long paravirt_getreg(int reg) */ +#define __paravirt_getreg(reg) \ + ({ \ + register unsigned long ia64_intri_res asm ("r8"); \ + register unsigned long __reg asm ("r8") = (reg); \ + \ + BUILD_BUG_ON(!__builtin_constant_p(reg)); \ + asm volatile (paravirt_alt_bundle(__PARAVIRT_BR, \ + PARAVIRT_TYPE(GETREG) \ + + (reg)) \ + : "=r"(ia64_intri_res) \ + : PARAVIRT_OP(getreg), "0"(__reg) \ + : PARAVIRT_CLOBBERS1); \ + \ + ia64_intri_res; \ + }) + +/* void paravirt_setreg(int reg, unsigned long val) */ +#define paravirt_setreg(reg, val) \ + do { \ + register unsigned long __val asm ("r8") = val; \ + register unsigned long __reg asm ("r9") = reg; \ + register unsigned long ia64_clobber1 asm ("r8"); \ + register unsigned long ia64_clobber2 asm ("r9"); \ + \ + BUILD_BUG_ON(!__builtin_constant_p(reg)); \ + asm volatile (paravirt_alt_bundle(__PARAVIRT_BR, \ + PARAVIRT_TYPE(SETREG) \ + + (reg)) \ + : "=r"(ia64_clobber1), \ + "=r"(ia64_clobber2) \ + : PARAVIRT_OP(setreg), \ + "1"(__reg), "0"(__val) \ + : PARAVIRT_CLOBBERS2); \ + } while (0) + +#endif /* ASM_SUPPORTED */ +#endif /* CONFIG_PARAVIRT && ASM_SUPPOTED */ + #endif /* _ASM_IA64_PARAVIRT_PRIVOP_H */ diff --git a/arch/ia64/kernel/Makefile b/arch/ia64/kernel/Makefile index 8dc9df8a87a5..dbc19e4d5ef7 100644 --- a/arch/ia64/kernel/Makefile +++ b/arch/ia64/kernel/Makefile @@ -36,7 +36,8 @@ obj-$(CONFIG_PCI_MSI) += msi_ia64.o mca_recovery-y += mca_drv.o mca_drv_asm.o obj-$(CONFIG_IA64_MC_ERR_INJECT)+= err_inject.o -obj-$(CONFIG_PARAVIRT) += paravirt.o paravirtentry.o +obj-$(CONFIG_PARAVIRT) += paravirt.o paravirtentry.o \ + paravirt_patch.o obj-$(CONFIG_IA64_ESI) += esi.o ifneq ($(CONFIG_IA64_ESI),) diff --git a/arch/ia64/kernel/paravirt.c b/arch/ia64/kernel/paravirt.c index 6bc33a6db755..158d52414e96 100644 --- a/arch/ia64/kernel/paravirt.c +++ b/arch/ia64/kernel/paravirt.c @@ -46,13 +46,23 @@ struct pv_info pv_info = { * initialization hooks. */ -struct pv_init_ops pv_init_ops; +static void __init +ia64_native_patch_branch(unsigned long tag, unsigned long type); + +struct pv_init_ops pv_init_ops = +{ +#ifdef ASM_SUPPORTED + .patch_bundle = ia64_native_patch_bundle, +#endif + .patch_branch = ia64_native_patch_branch, +}; /*************************************************************************** * pv_cpu_ops * intrinsics hooks. */ +#ifndef ASM_SUPPORTED /* ia64_native_xxx are macros so that we have to make them real functions */ #define DEFINE_VOID_FUNC1(name) \ @@ -274,6 +284,261 @@ ia64_native_setreg_func(int regnum, unsigned long val) break; } } +#else + +#define __DEFINE_FUNC(name, code) \ + extern const char ia64_native_ ## name ## _direct_start[]; \ + extern const char ia64_native_ ## name ## _direct_end[]; \ + asm (".align 32\n" \ + ".proc ia64_native_" #name "_func\n" \ + "ia64_native_" #name "_func:\n" \ + "ia64_native_" #name "_direct_start:\n" \ + code \ + "ia64_native_" #name "_direct_end:\n" \ + "br.cond.sptk.many b6\n" \ + ".endp ia64_native_" #name "_func\n") + +#define DEFINE_VOID_FUNC0(name, code) \ + extern void \ + ia64_native_ ## name ## _func(void); \ + __DEFINE_FUNC(name, code) + +#define DEFINE_VOID_FUNC1(name, code) \ + extern void \ + ia64_native_ ## name ## _func(unsigned long arg); \ + __DEFINE_FUNC(name, code) + +#define DEFINE_VOID_FUNC2(name, code) \ + extern void \ + ia64_native_ ## name ## _func(unsigned long arg0, \ + unsigned long arg1); \ + __DEFINE_FUNC(name, code) + +#define DEFINE_FUNC0(name, code) \ + extern unsigned long \ + ia64_native_ ## name ## _func(void); \ + __DEFINE_FUNC(name, code) + +#define DEFINE_FUNC1(name, type, code) \ + extern unsigned long \ + ia64_native_ ## name ## _func(type arg); \ + __DEFINE_FUNC(name, code) + +DEFINE_VOID_FUNC1(fc, + "fc r8\n"); +DEFINE_VOID_FUNC1(intrin_local_irq_restore, + ";;\n" + " cmp.ne p6, p7 = r8, r0\n" + ";;\n" + "(p6) ssm psr.i\n" + "(p7) rsm psr.i\n" + ";;\n" + "(p6) srlz.d\n"); + +DEFINE_VOID_FUNC2(ptcga, + "ptc.ga r8, r9\n"); +DEFINE_VOID_FUNC2(set_rr, + "mov rr[r8] = r9\n"); + +/* ia64_native_getreg(_IA64_REG_PSR) & IA64_PSR_I */ +DEFINE_FUNC0(get_psr_i, + "mov r2 = " __stringify(1 << IA64_PSR_I_BIT) "\n" + "mov r8 = psr\n" + ";;\n" + "and r8 = r2, r8\n"); + +DEFINE_FUNC1(thash, unsigned long, + "thash r8 = r8\n"); +DEFINE_FUNC1(get_cpuid, int, + "mov r8 = cpuid[r8]\n"); +DEFINE_FUNC1(get_pmd, int, + "mov r8 = pmd[r8]\n"); +DEFINE_FUNC1(get_rr, unsigned long, + "mov r8 = rr[r8]\n"); + +DEFINE_VOID_FUNC0(ssm_i, + "ssm psr.i\n"); +DEFINE_VOID_FUNC0(rsm_i, + "rsm psr.i\n"); + +extern void +ia64_native_set_rr0_to_rr4_func(unsigned long val0, unsigned long val1, + unsigned long val2, unsigned long val3, + unsigned long val4); +__DEFINE_FUNC(set_rr0_to_rr4, + "mov rr[r0] = r8\n" + "movl r2 = 0x2000000000000000\n" + ";;\n" + "mov rr[r2] = r9\n" + "shl r3 = r2, 1\n" /* movl r3 = 0x4000000000000000 */ + ";;\n" + "add r2 = r2, r3\n" /* movl r2 = 0x6000000000000000 */ + "mov rr[r3] = r10\n" + ";;\n" + "mov rr[r2] = r11\n" + "shl r3 = r3, 1\n" /* movl r3 = 0x8000000000000000 */ + ";;\n" + "mov rr[r3] = r14\n"); + +extern unsigned long ia64_native_getreg_func(int regnum); +asm(".global ia64_native_getreg_func\n"); +#define __DEFINE_GET_REG(id, reg) \ + "mov r2 = " __stringify(_IA64_REG_ ## id) "\n" \ + ";;\n" \ + "cmp.eq p6, p0 = r2, r8\n" \ + ";;\n" \ + "(p6) mov r8 = " #reg "\n" \ + "(p6) br.cond.sptk.many b6\n" \ + ";;\n" +#define __DEFINE_GET_AR(id, reg) __DEFINE_GET_REG(AR_ ## id, ar.reg) +#define __DEFINE_GET_CR(id, reg) __DEFINE_GET_REG(CR_ ## id, cr.reg) + +__DEFINE_FUNC(getreg, + __DEFINE_GET_REG(GP, gp) + /*__DEFINE_GET_REG(IP, ip)*/ /* returned ip value shouldn't be constant */ + __DEFINE_GET_REG(PSR, psr) + __DEFINE_GET_REG(TP, tp) + __DEFINE_GET_REG(SP, sp) + + __DEFINE_GET_REG(AR_KR0, ar0) + __DEFINE_GET_REG(AR_KR1, ar1) + __DEFINE_GET_REG(AR_KR2, ar2) + __DEFINE_GET_REG(AR_KR3, ar3) + __DEFINE_GET_REG(AR_KR4, ar4) + __DEFINE_GET_REG(AR_KR5, ar5) + __DEFINE_GET_REG(AR_KR6, ar6) + __DEFINE_GET_REG(AR_KR7, ar7) + __DEFINE_GET_AR(RSC, rsc) + __DEFINE_GET_AR(BSP, bsp) + __DEFINE_GET_AR(BSPSTORE, bspstore) + __DEFINE_GET_AR(RNAT, rnat) + __DEFINE_GET_AR(FCR, fcr) + __DEFINE_GET_AR(EFLAG, eflag) + __DEFINE_GET_AR(CSD, csd) + __DEFINE_GET_AR(SSD, ssd) + __DEFINE_GET_REG(AR_CFLAG, ar27) + __DEFINE_GET_AR(FSR, fsr) + __DEFINE_GET_AR(FIR, fir) + __DEFINE_GET_AR(FDR, fdr) + __DEFINE_GET_AR(CCV, ccv) + __DEFINE_GET_AR(UNAT, unat) + __DEFINE_GET_AR(FPSR, fpsr) + __DEFINE_GET_AR(ITC, itc) + __DEFINE_GET_AR(PFS, pfs) + __DEFINE_GET_AR(LC, lc) + __DEFINE_GET_AR(EC, ec) + + __DEFINE_GET_CR(DCR, dcr) + __DEFINE_GET_CR(ITM, itm) + __DEFINE_GET_CR(IVA, iva) + __DEFINE_GET_CR(PTA, pta) + __DEFINE_GET_CR(IPSR, ipsr) + __DEFINE_GET_CR(ISR, isr) + __DEFINE_GET_CR(IIP, iip) + __DEFINE_GET_CR(IFA, ifa) + __DEFINE_GET_CR(ITIR, itir) + __DEFINE_GET_CR(IIPA, iipa) + __DEFINE_GET_CR(IFS, ifs) + __DEFINE_GET_CR(IIM, iim) + __DEFINE_GET_CR(IHA, iha) + __DEFINE_GET_CR(LID, lid) + __DEFINE_GET_CR(IVR, ivr) + __DEFINE_GET_CR(TPR, tpr) + __DEFINE_GET_CR(EOI, eoi) + __DEFINE_GET_CR(IRR0, irr0) + __DEFINE_GET_CR(IRR1, irr1) + __DEFINE_GET_CR(IRR2, irr2) + __DEFINE_GET_CR(IRR3, irr3) + __DEFINE_GET_CR(ITV, itv) + __DEFINE_GET_CR(PMV, pmv) + __DEFINE_GET_CR(CMCV, cmcv) + __DEFINE_GET_CR(LRR0, lrr0) + __DEFINE_GET_CR(LRR1, lrr1) + + "mov r8 = -1\n" /* unsupported case */ + ); + +extern void ia64_native_setreg_func(int regnum, unsigned long val); +asm(".global ia64_native_setreg_func\n"); +#define __DEFINE_SET_REG(id, reg) \ + "mov r2 = " __stringify(_IA64_REG_ ## id) "\n" \ + ";;\n" \ + "cmp.eq p6, p0 = r2, r9\n" \ + ";;\n" \ + "(p6) mov " #reg " = r8\n" \ + "(p6) br.cond.sptk.many b6\n" \ + ";;\n" +#define __DEFINE_SET_AR(id, reg) __DEFINE_SET_REG(AR_ ## id, ar.reg) +#define __DEFINE_SET_CR(id, reg) __DEFINE_SET_REG(CR_ ## id, cr.reg) +__DEFINE_FUNC(setreg, + "mov r2 = " __stringify(_IA64_REG_PSR_L) "\n" + ";;\n" + "cmp.eq p6, p0 = r2, r9\n" + ";;\n" + "(p6) mov psr.l = r8\n" +#ifdef HAVE_SERIALIZE_DIRECTIVE + ".serialize.data\n" +#endif + "(p6) br.cond.sptk.many b6\n" + __DEFINE_SET_REG(GP, gp) + __DEFINE_SET_REG(SP, sp) + + __DEFINE_SET_REG(AR_KR0, ar0) + __DEFINE_SET_REG(AR_KR1, ar1) + __DEFINE_SET_REG(AR_KR2, ar2) + __DEFINE_SET_REG(AR_KR3, ar3) + __DEFINE_SET_REG(AR_KR4, ar4) + __DEFINE_SET_REG(AR_KR5, ar5) + __DEFINE_SET_REG(AR_KR6, ar6) + __DEFINE_SET_REG(AR_KR7, ar7) + __DEFINE_SET_AR(RSC, rsc) + __DEFINE_SET_AR(BSP, bsp) + __DEFINE_SET_AR(BSPSTORE, bspstore) + __DEFINE_SET_AR(RNAT, rnat) + __DEFINE_SET_AR(FCR, fcr) + __DEFINE_SET_AR(EFLAG, eflag) + __DEFINE_SET_AR(CSD, csd) + __DEFINE_SET_AR(SSD, ssd) + __DEFINE_SET_REG(AR_CFLAG, ar27) + __DEFINE_SET_AR(FSR, fsr) + __DEFINE_SET_AR(FIR, fir) + __DEFINE_SET_AR(FDR, fdr) + __DEFINE_SET_AR(CCV, ccv) + __DEFINE_SET_AR(UNAT, unat) + __DEFINE_SET_AR(FPSR, fpsr) + __DEFINE_SET_AR(ITC, itc) + __DEFINE_SET_AR(PFS, pfs) + __DEFINE_SET_AR(LC, lc) + __DEFINE_SET_AR(EC, ec) + + __DEFINE_SET_CR(DCR, dcr) + __DEFINE_SET_CR(ITM, itm) + __DEFINE_SET_CR(IVA, iva) + __DEFINE_SET_CR(PTA, pta) + __DEFINE_SET_CR(IPSR, ipsr) + __DEFINE_SET_CR(ISR, isr) + __DEFINE_SET_CR(IIP, iip) + __DEFINE_SET_CR(IFA, ifa) + __DEFINE_SET_CR(ITIR, itir) + __DEFINE_SET_CR(IIPA, iipa) + __DEFINE_SET_CR(IFS, ifs) + __DEFINE_SET_CR(IIM, iim) + __DEFINE_SET_CR(IHA, iha) + __DEFINE_SET_CR(LID, lid) + __DEFINE_SET_CR(IVR, ivr) + __DEFINE_SET_CR(TPR, tpr) + __DEFINE_SET_CR(EOI, eoi) + __DEFINE_SET_CR(IRR0, irr0) + __DEFINE_SET_CR(IRR1, irr1) + __DEFINE_SET_CR(IRR2, irr2) + __DEFINE_SET_CR(IRR3, irr3) + __DEFINE_SET_CR(ITV, itv) + __DEFINE_SET_CR(PMV, pmv) + __DEFINE_SET_CR(CMCV, cmcv) + __DEFINE_SET_CR(LRR0, lrr0) + __DEFINE_SET_CR(LRR1, lrr1) + ); +#endif struct pv_cpu_ops pv_cpu_ops = { .fc = ia64_native_fc_func, @@ -368,3 +633,256 @@ struct pv_time_ops pv_time_ops = { .do_steal_accounting = ia64_native_do_steal_accounting, .sched_clock = ia64_native_sched_clock, }; + +/*************************************************************************** + * binary pacthing + * pv_init_ops.patch_bundle + */ + +#ifdef ASM_SUPPORTED +#define IA64_NATIVE_PATCH_DEFINE_GET_REG(name, reg) \ + __DEFINE_FUNC(get_ ## name, \ + ";;\n" \ + "mov r8 = " #reg "\n" \ + ";;\n") + +#define IA64_NATIVE_PATCH_DEFINE_SET_REG(name, reg) \ + __DEFINE_FUNC(set_ ## name, \ + ";;\n" \ + "mov " #reg " = r8\n" \ + ";;\n") + +#define IA64_NATIVE_PATCH_DEFINE_REG(name, reg) \ + IA64_NATIVE_PATCH_DEFINE_GET_REG(name, reg); \ + IA64_NATIVE_PATCH_DEFINE_SET_REG(name, reg) \ + +#define IA64_NATIVE_PATCH_DEFINE_AR(name, reg) \ + IA64_NATIVE_PATCH_DEFINE_REG(ar_ ## name, ar.reg) + +#define IA64_NATIVE_PATCH_DEFINE_CR(name, reg) \ + IA64_NATIVE_PATCH_DEFINE_REG(cr_ ## name, cr.reg) + + +IA64_NATIVE_PATCH_DEFINE_GET_REG(psr, psr); +IA64_NATIVE_PATCH_DEFINE_GET_REG(tp, tp); + +/* IA64_NATIVE_PATCH_DEFINE_SET_REG(psr_l, psr.l); */ +__DEFINE_FUNC(set_psr_l, + ";;\n" + "mov psr.l = r8\n" +#ifdef HAVE_SERIALIZE_DIRECTIVE + ".serialize.data\n" +#endif + ";;\n"); + +IA64_NATIVE_PATCH_DEFINE_REG(gp, gp); +IA64_NATIVE_PATCH_DEFINE_REG(sp, sp); + +IA64_NATIVE_PATCH_DEFINE_REG(kr0, ar0); +IA64_NATIVE_PATCH_DEFINE_REG(kr1, ar1); +IA64_NATIVE_PATCH_DEFINE_REG(kr2, ar2); +IA64_NATIVE_PATCH_DEFINE_REG(kr3, ar3); +IA64_NATIVE_PATCH_DEFINE_REG(kr4, ar4); +IA64_NATIVE_PATCH_DEFINE_REG(kr5, ar5); +IA64_NATIVE_PATCH_DEFINE_REG(kr6, ar6); +IA64_NATIVE_PATCH_DEFINE_REG(kr7, ar7); + +IA64_NATIVE_PATCH_DEFINE_AR(rsc, rsc); +IA64_NATIVE_PATCH_DEFINE_AR(bsp, bsp); +IA64_NATIVE_PATCH_DEFINE_AR(bspstore, bspstore); +IA64_NATIVE_PATCH_DEFINE_AR(rnat, rnat); +IA64_NATIVE_PATCH_DEFINE_AR(fcr, fcr); +IA64_NATIVE_PATCH_DEFINE_AR(eflag, eflag); +IA64_NATIVE_PATCH_DEFINE_AR(csd, csd); +IA64_NATIVE_PATCH_DEFINE_AR(ssd, ssd); +IA64_NATIVE_PATCH_DEFINE_REG(ar27, ar27); +IA64_NATIVE_PATCH_DEFINE_AR(fsr, fsr); +IA64_NATIVE_PATCH_DEFINE_AR(fir, fir); +IA64_NATIVE_PATCH_DEFINE_AR(fdr, fdr); +IA64_NATIVE_PATCH_DEFINE_AR(ccv, ccv); +IA64_NATIVE_PATCH_DEFINE_AR(unat, unat); +IA64_NATIVE_PATCH_DEFINE_AR(fpsr, fpsr); +IA64_NATIVE_PATCH_DEFINE_AR(itc, itc); +IA64_NATIVE_PATCH_DEFINE_AR(pfs, pfs); +IA64_NATIVE_PATCH_DEFINE_AR(lc, lc); +IA64_NATIVE_PATCH_DEFINE_AR(ec, ec); + +IA64_NATIVE_PATCH_DEFINE_CR(dcr, dcr); +IA64_NATIVE_PATCH_DEFINE_CR(itm, itm); +IA64_NATIVE_PATCH_DEFINE_CR(iva, iva); +IA64_NATIVE_PATCH_DEFINE_CR(pta, pta); +IA64_NATIVE_PATCH_DEFINE_CR(ipsr, ipsr); +IA64_NATIVE_PATCH_DEFINE_CR(isr, isr); +IA64_NATIVE_PATCH_DEFINE_CR(iip, iip); +IA64_NATIVE_PATCH_DEFINE_CR(ifa, ifa); +IA64_NATIVE_PATCH_DEFINE_CR(itir, itir); +IA64_NATIVE_PATCH_DEFINE_CR(iipa, iipa); +IA64_NATIVE_PATCH_DEFINE_CR(ifs, ifs); +IA64_NATIVE_PATCH_DEFINE_CR(iim, iim); +IA64_NATIVE_PATCH_DEFINE_CR(iha, iha); +IA64_NATIVE_PATCH_DEFINE_CR(lid, lid); +IA64_NATIVE_PATCH_DEFINE_CR(ivr, ivr); +IA64_NATIVE_PATCH_DEFINE_CR(tpr, tpr); +IA64_NATIVE_PATCH_DEFINE_CR(eoi, eoi); +IA64_NATIVE_PATCH_DEFINE_CR(irr0, irr0); +IA64_NATIVE_PATCH_DEFINE_CR(irr1, irr1); +IA64_NATIVE_PATCH_DEFINE_CR(irr2, irr2); +IA64_NATIVE_PATCH_DEFINE_CR(irr3, irr3); +IA64_NATIVE_PATCH_DEFINE_CR(itv, itv); +IA64_NATIVE_PATCH_DEFINE_CR(pmv, pmv); +IA64_NATIVE_PATCH_DEFINE_CR(cmcv, cmcv); +IA64_NATIVE_PATCH_DEFINE_CR(lrr0, lrr0); +IA64_NATIVE_PATCH_DEFINE_CR(lrr1, lrr1); + +static const struct paravirt_patch_bundle_elem ia64_native_patch_bundle_elems[] +__initdata_or_module = +{ +#define IA64_NATIVE_PATCH_BUNDLE_ELEM(name, type) \ + { \ + (void*)ia64_native_ ## name ## _direct_start, \ + (void*)ia64_native_ ## name ## _direct_end, \ + PARAVIRT_PATCH_TYPE_ ## type, \ + } + + IA64_NATIVE_PATCH_BUNDLE_ELEM(fc, FC), + IA64_NATIVE_PATCH_BUNDLE_ELEM(thash, THASH), + IA64_NATIVE_PATCH_BUNDLE_ELEM(get_cpuid, GET_CPUID), + IA64_NATIVE_PATCH_BUNDLE_ELEM(get_pmd, GET_PMD), + IA64_NATIVE_PATCH_BUNDLE_ELEM(ptcga, PTCGA), + IA64_NATIVE_PATCH_BUNDLE_ELEM(get_rr, GET_RR), + IA64_NATIVE_PATCH_BUNDLE_ELEM(set_rr, SET_RR), + IA64_NATIVE_PATCH_BUNDLE_ELEM(set_rr0_to_rr4, SET_RR0_TO_RR4), + IA64_NATIVE_PATCH_BUNDLE_ELEM(ssm_i, SSM_I), + IA64_NATIVE_PATCH_BUNDLE_ELEM(rsm_i, RSM_I), + IA64_NATIVE_PATCH_BUNDLE_ELEM(get_psr_i, GET_PSR_I), + IA64_NATIVE_PATCH_BUNDLE_ELEM(intrin_local_irq_restore, + INTRIN_LOCAL_IRQ_RESTORE), + +#define IA64_NATIVE_PATCH_BUNDLE_ELEM_GETREG(name, reg) \ + { \ + (void*)ia64_native_get_ ## name ## _direct_start, \ + (void*)ia64_native_get_ ## name ## _direct_end, \ + PARAVIRT_PATCH_TYPE_GETREG + _IA64_REG_ ## reg, \ + } + +#define IA64_NATIVE_PATCH_BUNDLE_ELEM_SETREG(name, reg) \ + { \ + (void*)ia64_native_set_ ## name ## _direct_start, \ + (void*)ia64_native_set_ ## name ## _direct_end, \ + PARAVIRT_PATCH_TYPE_SETREG + _IA64_REG_ ## reg, \ + } + +#define IA64_NATIVE_PATCH_BUNDLE_ELEM_REG(name, reg) \ + IA64_NATIVE_PATCH_BUNDLE_ELEM_GETREG(name, reg), \ + IA64_NATIVE_PATCH_BUNDLE_ELEM_SETREG(name, reg) \ + +#define IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(name, reg) \ + IA64_NATIVE_PATCH_BUNDLE_ELEM_REG(ar_ ## name, AR_ ## reg) + +#define IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(name, reg) \ + IA64_NATIVE_PATCH_BUNDLE_ELEM_REG(cr_ ## name, CR_ ## reg) + + IA64_NATIVE_PATCH_BUNDLE_ELEM_GETREG(psr, PSR), + IA64_NATIVE_PATCH_BUNDLE_ELEM_GETREG(tp, TP), + + IA64_NATIVE_PATCH_BUNDLE_ELEM_SETREG(psr_l, PSR_L), + + IA64_NATIVE_PATCH_BUNDLE_ELEM_REG(gp, GP), + IA64_NATIVE_PATCH_BUNDLE_ELEM_REG(sp, SP), + + IA64_NATIVE_PATCH_BUNDLE_ELEM_REG(kr0, AR_KR0), + IA64_NATIVE_PATCH_BUNDLE_ELEM_REG(kr1, AR_KR1), + IA64_NATIVE_PATCH_BUNDLE_ELEM_REG(kr2, AR_KR2), + IA64_NATIVE_PATCH_BUNDLE_ELEM_REG(kr3, AR_KR3), + IA64_NATIVE_PATCH_BUNDLE_ELEM_REG(kr4, AR_KR4), + IA64_NATIVE_PATCH_BUNDLE_ELEM_REG(kr5, AR_KR5), + IA64_NATIVE_PATCH_BUNDLE_ELEM_REG(kr6, AR_KR6), + IA64_NATIVE_PATCH_BUNDLE_ELEM_REG(kr7, AR_KR7), + + IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(rsc, RSC), + IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(bsp, BSP), + IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(bspstore, BSPSTORE), + IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(rnat, RNAT), + IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(fcr, FCR), + IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(eflag, EFLAG), + IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(csd, CSD), + IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(ssd, SSD), + IA64_NATIVE_PATCH_BUNDLE_ELEM_REG(ar27, AR_CFLAG), + IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(fsr, FSR), + IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(fir, FIR), + IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(fdr, FDR), + IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(ccv, CCV), + IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(unat, UNAT), + IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(fpsr, FPSR), + IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(itc, ITC), + IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(pfs, PFS), + IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(lc, LC), + IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(ec, EC), + + IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(dcr, DCR), + IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(itm, ITM), + IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(iva, IVA), + IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(pta, PTA), + IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(ipsr, IPSR), + IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(isr, ISR), + IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(iip, IIP), + IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(ifa, IFA), + IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(itir, ITIR), + IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(iipa, IIPA), + IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(ifs, IFS), + IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(iim, IIM), + IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(iha, IHA), + IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(lid, LID), + IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(ivr, IVR), + IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(tpr, TPR), + IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(eoi, EOI), + IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(irr0, IRR0), + IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(irr1, IRR1), + IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(irr2, IRR2), + IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(irr3, IRR3), + IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(itv, ITV), + IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(pmv, PMV), + IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(cmcv, CMCV), + IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(lrr0, LRR0), + IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(lrr1, LRR1), +}; + +unsigned long __init_or_module +ia64_native_patch_bundle(void *sbundle, void *ebundle, unsigned long type) +{ + const unsigned long nelems = sizeof(ia64_native_patch_bundle_elems) / + sizeof(ia64_native_patch_bundle_elems[0]); + + return __paravirt_patch_apply_bundle(sbundle, ebundle, type, + ia64_native_patch_bundle_elems, + nelems, NULL); +} +#endif /* ASM_SUPPOTED */ + +extern const char ia64_native_switch_to[]; +extern const char ia64_native_leave_syscall[]; +extern const char ia64_native_work_processed_syscall[]; +extern const char ia64_native_leave_kernel[]; + +const struct paravirt_patch_branch_target ia64_native_branch_target[] +__initconst = { +#define PARAVIRT_BR_TARGET(name, type) \ + { \ + ia64_native_ ## name, \ + PARAVIRT_PATCH_TYPE_BR_ ## type, \ + } + PARAVIRT_BR_TARGET(switch_to, SWITCH_TO), + PARAVIRT_BR_TARGET(leave_syscall, LEAVE_SYSCALL), + PARAVIRT_BR_TARGET(work_processed_syscall, WORK_PROCESSED_SYSCALL), + PARAVIRT_BR_TARGET(leave_kernel, LEAVE_KERNEL), +}; + +static void __init +ia64_native_patch_branch(unsigned long tag, unsigned long type) +{ + const unsigned long nelem = + sizeof(ia64_native_branch_target) / + sizeof(ia64_native_branch_target[0]); + __paravirt_patch_apply_branch(tag, type, + ia64_native_branch_target, nelem); +} diff --git a/arch/ia64/kernel/paravirtentry.S b/arch/ia64/kernel/paravirtentry.S index 80c0d365cbc0..6158560d7f17 100644 --- a/arch/ia64/kernel/paravirtentry.S +++ b/arch/ia64/kernel/paravirtentry.S @@ -20,8 +20,11 @@ * */ +#include #include #include +#include +#include #include "entry.h" #define DATA8(sym, init_value) \ @@ -32,32 +35,34 @@ data8 init_value ; \ .popsection -#define BRANCH(targ, reg, breg) \ - movl reg=targ ; \ - ;; \ - ld8 reg=[reg] ; \ - ;; \ - mov breg=reg ; \ +#define BRANCH(targ, reg, breg, type) \ + PARAVIRT_PATCH_SITE_BR(PARAVIRT_PATCH_TYPE_BR_ ## type) ; \ + ;; \ + movl reg=targ ; \ + ;; \ + ld8 reg=[reg] ; \ + ;; \ + mov breg=reg ; \ br.cond.sptk.many breg -#define BRANCH_PROC(sym, reg, breg) \ - DATA8(paravirt_ ## sym ## _targ, ia64_native_ ## sym) ; \ - GLOBAL_ENTRY(paravirt_ ## sym) ; \ - BRANCH(paravirt_ ## sym ## _targ, reg, breg) ; \ +#define BRANCH_PROC(sym, reg, breg, type) \ + DATA8(paravirt_ ## sym ## _targ, ia64_native_ ## sym) ; \ + GLOBAL_ENTRY(paravirt_ ## sym) ; \ + BRANCH(paravirt_ ## sym ## _targ, reg, breg, type) ; \ END(paravirt_ ## sym) -#define BRANCH_PROC_UNWINFO(sym, reg, breg) \ - DATA8(paravirt_ ## sym ## _targ, ia64_native_ ## sym) ; \ - GLOBAL_ENTRY(paravirt_ ## sym) ; \ - PT_REGS_UNWIND_INFO(0) ; \ - BRANCH(paravirt_ ## sym ## _targ, reg, breg) ; \ +#define BRANCH_PROC_UNWINFO(sym, reg, breg, type) \ + DATA8(paravirt_ ## sym ## _targ, ia64_native_ ## sym) ; \ + GLOBAL_ENTRY(paravirt_ ## sym) ; \ + PT_REGS_UNWIND_INFO(0) ; \ + BRANCH(paravirt_ ## sym ## _targ, reg, breg, type) ; \ END(paravirt_ ## sym) -BRANCH_PROC(switch_to, r22, b7) -BRANCH_PROC_UNWINFO(leave_syscall, r22, b7) -BRANCH_PROC(work_processed_syscall, r2, b7) -BRANCH_PROC_UNWINFO(leave_kernel, r22, b7) +BRANCH_PROC(switch_to, r22, b7, SWITCH_TO) +BRANCH_PROC_UNWINFO(leave_syscall, r22, b7, LEAVE_SYSCALL) +BRANCH_PROC(work_processed_syscall, r2, b7, WORK_PROCESSED_SYSCALL) +BRANCH_PROC_UNWINFO(leave_kernel, r22, b7, LEAVE_KERNEL) #ifdef CONFIG_MODULES diff --git a/arch/ia64/kernel/setup.c b/arch/ia64/kernel/setup.c index 865af27c7737..4ed3e1c117e7 100644 --- a/arch/ia64/kernel/setup.c +++ b/arch/ia64/kernel/setup.c @@ -52,6 +52,7 @@ #include #include #include +#include #include #include #include @@ -537,6 +538,7 @@ setup_arch (char **cmdline_p) paravirt_arch_setup_early(); ia64_patch_vtop((u64) __start___vtop_patchlist, (u64) __end___vtop_patchlist); + paravirt_patch_apply(); *cmdline_p = __va(ia64_boot_param->command_line); strlcpy(boot_command_line, *cmdline_p, COMMAND_LINE_SIZE); From ee158fcd095c8233c9b578fbbe8a5897979a52a9 Mon Sep 17 00:00:00 2001 From: Isaku Yamahata Date: Wed, 4 Mar 2009 21:06:53 +0900 Subject: [PATCH 181/478] ia64/pv_ops/bp/module: support binary patching for kernel module. support binary patching for kernel module. Signed-off-by: Isaku Yamahata Signed-off-by: Tony Luck --- arch/ia64/include/asm/module.h | 6 ++++++ arch/ia64/kernel/module.c | 32 ++++++++++++++++++++++++++++++++ 2 files changed, 38 insertions(+) diff --git a/arch/ia64/include/asm/module.h b/arch/ia64/include/asm/module.h index d2da61e4c49b..908eaef42a08 100644 --- a/arch/ia64/include/asm/module.h +++ b/arch/ia64/include/asm/module.h @@ -16,6 +16,12 @@ struct mod_arch_specific { struct elf64_shdr *got; /* global offset table */ struct elf64_shdr *opd; /* official procedure descriptors */ struct elf64_shdr *unwind; /* unwind-table section */ +#ifdef CONFIG_PARAVIRT + struct elf64_shdr *paravirt_bundles; + /* paravirt_alt_bundle_patch table */ + struct elf64_shdr *paravirt_insts; + /* paravirt_alt_inst_patch table */ +#endif unsigned long gp; /* global-pointer for module */ void *core_unw_table; /* core unwind-table cookie returned by unwinder */ diff --git a/arch/ia64/kernel/module.c b/arch/ia64/kernel/module.c index aaa7d901521f..34fe4259a144 100644 --- a/arch/ia64/kernel/module.c +++ b/arch/ia64/kernel/module.c @@ -446,6 +446,14 @@ module_frob_arch_sections (Elf_Ehdr *ehdr, Elf_Shdr *sechdrs, char *secstrings, mod->arch.opd = s; else if (strcmp(".IA_64.unwind", secstrings + s->sh_name) == 0) mod->arch.unwind = s; +#ifdef CONFIG_PARAVIRT + else if (strcmp(".paravirt_bundles", + secstrings + s->sh_name) == 0) + mod->arch.paravirt_bundles = s; + else if (strcmp(".paravirt_insts", + secstrings + s->sh_name) == 0) + mod->arch.paravirt_insts = s; +#endif if (!mod->arch.core_plt || !mod->arch.init_plt || !mod->arch.got || !mod->arch.opd) { printk(KERN_ERR "%s: sections missing\n", mod->name); @@ -921,6 +929,30 @@ module_finalize (const Elf_Ehdr *hdr, const Elf_Shdr *sechdrs, struct module *mo DEBUGP("%s: init: entry=%p\n", __func__, mod->init); if (mod->arch.unwind) register_unwind_table(mod); +#ifdef CONFIG_PARAVIRT + if (mod->arch.paravirt_bundles) { + struct paravirt_patch_site_bundle *start = + (struct paravirt_patch_site_bundle *) + mod->arch.paravirt_bundles->sh_addr; + struct paravirt_patch_site_bundle *end = + (struct paravirt_patch_site_bundle *) + (mod->arch.paravirt_bundles->sh_addr + + mod->arch.paravirt_bundles->sh_size); + + paravirt_patch_apply_bundle(start, end); + } + if (mod->arch.paravirt_insts) { + struct paravirt_patch_site_inst *start = + (struct paravirt_patch_site_inst *) + mod->arch.paravirt_insts->sh_addr; + struct paravirt_patch_site_inst *end = + (struct paravirt_patch_site_inst *) + (mod->arch.paravirt_insts->sh_addr + + mod->arch.paravirt_insts->sh_size); + + paravirt_patch_apply_inst(start, end); + } +#endif return 0; } From dae17da60d1797c9049d21d06de0db1873eee153 Mon Sep 17 00:00:00 2001 From: Isaku Yamahata Date: Wed, 4 Mar 2009 21:06:54 +0900 Subject: [PATCH 182/478] ia64/pv_ops/binary patch: define paravirt_dv_serialize_data() and suppress false positive warning. define paravirt_dv_serialize_data() and insert it to suppress false positive warnings. Signed-off-by: Isaku Yamahata Signed-off-by: Tony Luck --- arch/ia64/include/asm/paravirt_privop.h | 6 ++++++ arch/ia64/kernel/efi.c | 1 + arch/ia64/kvm/vtlb.c | 2 ++ 3 files changed, 9 insertions(+) diff --git a/arch/ia64/include/asm/paravirt_privop.h b/arch/ia64/include/asm/paravirt_privop.h index 76d6a6943e83..4e40e62c4ab8 100644 --- a/arch/ia64/include/asm/paravirt_privop.h +++ b/arch/ia64/include/asm/paravirt_privop.h @@ -118,6 +118,12 @@ void paravirt_cpu_asm_init(const struct pv_cpu_asm_switch *cpu_asm_switch); #endif /* CONFIG_PARAVIRT */ +#if defined(CONFIG_PARAVIRT) && defined(ASM_SUPPORTED) +#define paravirt_dv_serialize_data() ia64_dv_serialize_data() +#else +#define paravirt_dv_serialize_data() /* nothing */ +#endif + /* these routines utilize privilege-sensitive or performance-sensitive * privileged instructions so the code must be replaced with * paravirtualized versions */ diff --git a/arch/ia64/kernel/efi.c b/arch/ia64/kernel/efi.c index efaff15d8cf1..7ef80e8161ce 100644 --- a/arch/ia64/kernel/efi.c +++ b/arch/ia64/kernel/efi.c @@ -456,6 +456,7 @@ efi_map_pal_code (void) GRANULEROUNDDOWN((unsigned long) pal_vaddr), pte_val(pfn_pte(__pa(pal_vaddr) >> PAGE_SHIFT, PAGE_KERNEL)), IA64_GRANULE_SHIFT); + paravirt_dv_serialize_data(); ia64_set_psr(psr); /* restore psr */ } diff --git a/arch/ia64/kvm/vtlb.c b/arch/ia64/kvm/vtlb.c index 6b6307a3bd55..1de4dbda37e7 100644 --- a/arch/ia64/kvm/vtlb.c +++ b/arch/ia64/kvm/vtlb.c @@ -210,6 +210,7 @@ void thash_vhpt_insert(struct kvm_vcpu *v, u64 pte, u64 itir, u64 va, int type) phy_pte &= ~PAGE_FLAGS_RV_MASK; psr = ia64_clear_ic(); ia64_itc(type, va, phy_pte, itir_ps(itir)); + paravirt_dv_serialize_data(); ia64_set_psr(psr); } @@ -464,6 +465,7 @@ int thash_purge_and_insert(struct kvm_vcpu *v, u64 pte, u64 itir, phy_pte &= ~PAGE_FLAGS_RV_MASK; psr = ia64_clear_ic(); ia64_itc(type, ifa, phy_pte, ps); + paravirt_dv_serialize_data(); ia64_set_psr(psr); } if (!(pte&VTLB_PTE_IO)) From 0a7d32440294faea84c9aae4cb99239fe6ddb8ed Mon Sep 17 00:00:00 2001 From: Isaku Yamahata Date: Wed, 4 Mar 2009 21:06:55 +0900 Subject: [PATCH 183/478] ia64/pv_ops/bp/xen: implemented binary patchable pv_cpu_ops. implemented xen binary patch for pv_cpu_ops. Signed-off-by: Isaku Yamahata Signed-off-by: Tony Luck --- arch/ia64/include/asm/xen/privop.h | 4 + arch/ia64/xen/hypercall.S | 2 + arch/ia64/xen/xen_pv_ops.c | 665 +++++++++++++++++++++++++++++ 3 files changed, 671 insertions(+) diff --git a/arch/ia64/include/asm/xen/privop.h b/arch/ia64/include/asm/xen/privop.h index 2261dda578ff..e5fbaeeb161a 100644 --- a/arch/ia64/include/asm/xen/privop.h +++ b/arch/ia64/include/asm/xen/privop.h @@ -82,8 +82,10 @@ extern unsigned long xen_thash(unsigned long addr); extern unsigned long xen_get_cpuid(int index); extern unsigned long xen_get_pmd(int index); +#ifndef ASM_SUPPORTED extern unsigned long xen_get_eflag(void); /* see xen_ia64_getreg */ extern void xen_set_eflag(unsigned long); /* see xen_ia64_setreg */ +#endif /************************************************/ /* Instructions paravirtualized for performance */ @@ -108,6 +110,7 @@ extern void xen_set_eflag(unsigned long); /* see xen_ia64_setreg */ #define xen_get_virtual_pend() \ (*(((uint8_t *)XEN_MAPPEDREGS->interrupt_mask_addr) - 1)) +#ifndef ASM_SUPPORTED /* Although all privileged operations can be left to trap and will * be properly handled by Xen, some are frequent enough that we use * hyperprivops for performance. */ @@ -125,6 +128,7 @@ extern void xen_set_rr0_to_rr4(unsigned long val0, unsigned long val1, unsigned long val4); extern void xen_set_kr(unsigned long index, unsigned long val); extern void xen_ptcga(unsigned long addr, unsigned long size); +#endif /* !ASM_SUPPORTED */ #endif /* !__ASSEMBLY__ */ diff --git a/arch/ia64/xen/hypercall.S b/arch/ia64/xen/hypercall.S index 45e02bb64a92..e32dae444dd6 100644 --- a/arch/ia64/xen/hypercall.S +++ b/arch/ia64/xen/hypercall.S @@ -9,6 +9,7 @@ #include #include +#ifdef __INTEL_COMPILER /* * Hypercalls without parameter. */ @@ -72,6 +73,7 @@ GLOBAL_ENTRY(xen_set_rr0_to_rr4) br.ret.sptk.many rp ;; END(xen_set_rr0_to_rr4) +#endif GLOBAL_ENTRY(xen_send_ipi) mov r14=r32 diff --git a/arch/ia64/xen/xen_pv_ops.c b/arch/ia64/xen/xen_pv_ops.c index bdf1acbce81c..6c44225e7b84 100644 --- a/arch/ia64/xen/xen_pv_ops.c +++ b/arch/ia64/xen/xen_pv_ops.c @@ -154,6 +154,13 @@ xen_post_smp_prepare_boot_cpu(void) xen_setup_vcpu_info_placement(); } +#ifdef ASM_SUPPORTED +static unsigned long __init_or_module +xen_patch_bundle(void *sbundle, void *ebundle, unsigned long type); +#endif +static void __init +xen_patch_branch(unsigned long tag, unsigned long type); + static const struct pv_init_ops xen_init_ops __initconst = { .banner = xen_banner, @@ -164,6 +171,10 @@ static const struct pv_init_ops xen_init_ops __initconst = { .arch_setup_nomca = xen_arch_setup_nomca, .post_smp_prepare_boot_cpu = xen_post_smp_prepare_boot_cpu, +#ifdef ASM_SUPPORTED + .patch_bundle = xen_patch_bundle, +#endif + .patch_branch = xen_patch_branch, }; /*************************************************************************** @@ -214,6 +225,7 @@ static struct pv_patchdata xen_patchdata __initdata = { * intrinsics hooks. */ +#ifndef ASM_SUPPORTED static void xen_set_itm_with_offset(unsigned long val) { @@ -381,6 +393,410 @@ xen_intrin_local_irq_restore(unsigned long mask) else xen_rsm_i(); } +#else +#define __DEFINE_FUNC(name, code) \ + extern const char xen_ ## name ## _direct_start[]; \ + extern const char xen_ ## name ## _direct_end[]; \ + asm (".align 32\n" \ + ".proc xen_" #name "\n" \ + "xen_" #name ":\n" \ + "xen_" #name "_direct_start:\n" \ + code \ + "xen_" #name "_direct_end:\n" \ + "br.cond.sptk.many b6\n" \ + ".endp xen_" #name "\n") + +#define DEFINE_VOID_FUNC0(name, code) \ + extern void \ + xen_ ## name (void); \ + __DEFINE_FUNC(name, code) + +#define DEFINE_VOID_FUNC1(name, code) \ + extern void \ + xen_ ## name (unsigned long arg); \ + __DEFINE_FUNC(name, code) + +#define DEFINE_VOID_FUNC2(name, code) \ + extern void \ + xen_ ## name (unsigned long arg0, \ + unsigned long arg1); \ + __DEFINE_FUNC(name, code) + +#define DEFINE_FUNC0(name, code) \ + extern unsigned long \ + xen_ ## name (void); \ + __DEFINE_FUNC(name, code) + +#define DEFINE_FUNC1(name, type, code) \ + extern unsigned long \ + xen_ ## name (type arg); \ + __DEFINE_FUNC(name, code) + +#define XEN_PSR_I_ADDR_ADDR (XSI_BASE + XSI_PSR_I_ADDR_OFS) + +/* + * static void xen_set_itm_with_offset(unsigned long val) + * xen_set_itm(val - XEN_MAPPEDREGS->itc_offset); + */ +/* 2 bundles */ +DEFINE_VOID_FUNC1(set_itm_with_offset, + "mov r2 = " __stringify(XSI_BASE) " + " + __stringify(XSI_ITC_OFFSET_OFS) "\n" + ";;\n" + "ld8 r3 = [r2]\n" + ";;\n" + "sub r8 = r8, r3\n" + "break " __stringify(HYPERPRIVOP_SET_ITM) "\n"); + +/* + * static unsigned long xen_get_itm_with_offset(void) + * return ia64_native_getreg(_IA64_REG_CR_ITM) + XEN_MAPPEDREGS->itc_offset; + */ +/* 2 bundles */ +DEFINE_FUNC0(get_itm_with_offset, + "mov r2 = " __stringify(XSI_BASE) " + " + __stringify(XSI_ITC_OFFSET_OFS) "\n" + ";;\n" + "ld8 r3 = [r2]\n" + "mov r8 = cr.itm\n" + ";;\n" + "add r8 = r8, r2\n"); + +/* + * static void xen_set_itc(unsigned long val) + * unsigned long mitc; + * + * WARN_ON(!irqs_disabled()); + * mitc = ia64_native_getreg(_IA64_REG_AR_ITC); + * XEN_MAPPEDREGS->itc_offset = val - mitc; + * XEN_MAPPEDREGS->itc_last = val; + */ +/* 2 bundles */ +DEFINE_VOID_FUNC1(set_itc, + "mov r2 = " __stringify(XSI_BASE) " + " + __stringify(XSI_ITC_LAST_OFS) "\n" + "mov r3 = ar.itc\n" + ";;\n" + "sub r3 = r8, r3\n" + "st8 [r2] = r8, " + __stringify(XSI_ITC_LAST_OFS) " - " + __stringify(XSI_ITC_OFFSET_OFS) "\n" + ";;\n" + "st8 [r2] = r3\n"); + +/* + * static unsigned long xen_get_itc(void) + * unsigned long res; + * unsigned long itc_offset; + * unsigned long itc_last; + * unsigned long ret_itc_last; + * + * itc_offset = XEN_MAPPEDREGS->itc_offset; + * do { + * itc_last = XEN_MAPPEDREGS->itc_last; + * res = ia64_native_getreg(_IA64_REG_AR_ITC); + * res += itc_offset; + * if (itc_last >= res) + * res = itc_last + 1; + * ret_itc_last = cmpxchg(&XEN_MAPPEDREGS->itc_last, + * itc_last, res); + * } while (unlikely(ret_itc_last != itc_last)); + * return res; + */ +/* 5 bundles */ +DEFINE_FUNC0(get_itc, + "mov r2 = " __stringify(XSI_BASE) " + " + __stringify(XSI_ITC_OFFSET_OFS) "\n" + ";;\n" + "ld8 r9 = [r2], " __stringify(XSI_ITC_LAST_OFS) " - " + __stringify(XSI_ITC_OFFSET_OFS) "\n" + /* r9 = itc_offset */ + /* r2 = XSI_ITC_OFFSET */ + "888:\n" + "mov r8 = ar.itc\n" /* res = ar.itc */ + ";;\n" + "ld8 r3 = [r2]\n" /* r3 = itc_last */ + "add r8 = r8, r9\n" /* res = ar.itc + itc_offset */ + ";;\n" + "cmp.gtu p6, p0 = r3, r8\n" + ";;\n" + "(p6) add r8 = 1, r3\n" /* if (itc_last > res) itc_last + 1 */ + ";;\n" + "mov ar.ccv = r8\n" + ";;\n" + "cmpxchg8.acq r10 = [r2], r8, ar.ccv\n" + ";;\n" + "cmp.ne p6, p0 = r10, r3\n" + "(p6) hint @pause\n" + "(p6) br.cond.spnt 888b\n"); + +DEFINE_VOID_FUNC1(fc, + "break " __stringify(HYPERPRIVOP_FC) "\n"); + +/* + * psr_i_addr_addr = XEN_PSR_I_ADDR_ADDR + * masked_addr = *psr_i_addr_addr + * pending_intr_addr = masked_addr - 1 + * if (val & IA64_PSR_I) { + * masked = *masked_addr + * *masked_addr = 0:xen_set_virtual_psr_i(1) + * compiler barrier + * if (masked) { + * uint8_t pending = *pending_intr_addr; + * if (pending) + * XEN_HYPER_SSM_I + * } + * } else { + * *masked_addr = 1:xen_set_virtual_psr_i(0) + * } + */ +/* 6 bundles */ +DEFINE_VOID_FUNC1(intrin_local_irq_restore, + /* r8 = input value: 0 or IA64_PSR_I + * p6 = (flags & IA64_PSR_I) + * = if clause + * p7 = !(flags & IA64_PSR_I) + * = else clause + */ + "cmp.ne p6, p7 = r8, r0\n" + "mov r9 = " __stringify(XEN_PSR_I_ADDR_ADDR) "\n" + ";;\n" + /* r9 = XEN_PSR_I_ADDR */ + "ld8 r9 = [r9]\n" + ";;\n" + + /* r10 = masked previous value */ + "(p6) ld1.acq r10 = [r9]\n" + ";;\n" + + /* p8 = !masked interrupt masked previously? */ + "(p6) cmp.ne.unc p8, p0 = r10, r0\n" + + /* p7 = else clause */ + "(p7) mov r11 = 1\n" + ";;\n" + /* masked = 1 */ + "(p7) st1.rel [r9] = r11\n" + + /* p6 = if clause */ + /* masked = 0 + * r9 = masked_addr - 1 + * = pending_intr_addr + */ + "(p8) st1.rel [r9] = r0, -1\n" + ";;\n" + /* r8 = pending_intr */ + "(p8) ld1.acq r11 = [r9]\n" + ";;\n" + /* p9 = interrupt pending? */ + "(p8) cmp.ne.unc p9, p10 = r11, r0\n" + ";;\n" + "(p10) mf\n" + /* issue hypercall to trigger interrupt */ + "(p9) break " __stringify(HYPERPRIVOP_SSM_I) "\n"); + +DEFINE_VOID_FUNC2(ptcga, + "break " __stringify(HYPERPRIVOP_PTC_GA) "\n"); +DEFINE_VOID_FUNC2(set_rr, + "break " __stringify(HYPERPRIVOP_SET_RR) "\n"); + +/* + * tmp = XEN_MAPPEDREGS->interrupt_mask_addr = XEN_PSR_I_ADDR_ADDR; + * tmp = *tmp + * tmp = *tmp; + * psr_i = tmp? 0: IA64_PSR_I; + */ +/* 4 bundles */ +DEFINE_FUNC0(get_psr_i, + "mov r9 = " __stringify(XEN_PSR_I_ADDR_ADDR) "\n" + ";;\n" + "ld8 r9 = [r9]\n" /* r9 = XEN_PSR_I_ADDR */ + "mov r8 = 0\n" /* psr_i = 0 */ + ";;\n" + "ld1.acq r9 = [r9]\n" /* r9 = XEN_PSR_I */ + ";;\n" + "cmp.eq.unc p6, p0 = r9, r0\n" /* p6 = (XEN_PSR_I != 0) */ + ";;\n" + "(p6) mov r8 = " __stringify(1 << IA64_PSR_I_BIT) "\n"); + +DEFINE_FUNC1(thash, unsigned long, + "break " __stringify(HYPERPRIVOP_THASH) "\n"); +DEFINE_FUNC1(get_cpuid, int, + "break " __stringify(HYPERPRIVOP_GET_CPUID) "\n"); +DEFINE_FUNC1(get_pmd, int, + "break " __stringify(HYPERPRIVOP_GET_PMD) "\n"); +DEFINE_FUNC1(get_rr, unsigned long, + "break " __stringify(HYPERPRIVOP_GET_RR) "\n"); + +/* + * void xen_privop_ssm_i(void) + * + * int masked = !xen_get_virtual_psr_i(); + * // masked = *(*XEN_MAPPEDREGS->interrupt_mask_addr) + * xen_set_virtual_psr_i(1) + * // *(*XEN_MAPPEDREGS->interrupt_mask_addr) = 0 + * // compiler barrier + * if (masked) { + * uint8_t* pend_int_addr = + * (uint8_t*)(*XEN_MAPPEDREGS->interrupt_mask_addr) - 1; + * uint8_t pending = *pend_int_addr; + * if (pending) + * XEN_HYPER_SSM_I + * } + */ +/* 4 bundles */ +DEFINE_VOID_FUNC0(ssm_i, + "mov r8 = " __stringify(XEN_PSR_I_ADDR_ADDR) "\n" + ";;\n" + "ld8 r8 = [r8]\n" /* r8 = XEN_PSR_I_ADDR */ + ";;\n" + "ld1.acq r9 = [r8]\n" /* r9 = XEN_PSR_I */ + ";;\n" + "st1.rel [r8] = r0, -1\n" /* psr_i = 0. enable interrupt + * r8 = XEN_PSR_I_ADDR - 1 + * = pend_int_addr + */ + "cmp.eq.unc p0, p6 = r9, r0\n"/* p6 = !XEN_PSR_I + * previously interrupt + * masked? + */ + ";;\n" + "(p6) ld1.acq r8 = [r8]\n" /* r8 = xen_pend_int */ + ";;\n" + "(p6) cmp.eq.unc p6, p7 = r8, r0\n" /*interrupt pending?*/ + ";;\n" + /* issue hypercall to get interrupt */ + "(p7) break " __stringify(HYPERPRIVOP_SSM_I) "\n" + ";;\n"); + +/* + * psr_i_addr_addr = XEN_MAPPEDREGS->interrupt_mask_addr + * = XEN_PSR_I_ADDR_ADDR; + * psr_i_addr = *psr_i_addr_addr; + * *psr_i_addr = 1; + */ +/* 2 bundles */ +DEFINE_VOID_FUNC0(rsm_i, + "mov r8 = " __stringify(XEN_PSR_I_ADDR_ADDR) "\n" + /* r8 = XEN_PSR_I_ADDR */ + "mov r9 = 1\n" + ";;\n" + "ld8 r8 = [r8]\n" /* r8 = XEN_PSR_I */ + ";;\n" + "st1.rel [r8] = r9\n"); /* XEN_PSR_I = 1 */ + +extern void +xen_set_rr0_to_rr4(unsigned long val0, unsigned long val1, + unsigned long val2, unsigned long val3, + unsigned long val4); +__DEFINE_FUNC(set_rr0_to_rr4, + "break " __stringify(HYPERPRIVOP_SET_RR0_TO_RR4) "\n"); + + +extern unsigned long xen_getreg(int regnum); +#define __DEFINE_GET_REG(id, privop) \ + "mov r2 = " __stringify(_IA64_REG_ ## id) "\n" \ + ";;\n" \ + "cmp.eq p6, p0 = r2, r8\n" \ + ";;\n" \ + "(p6) break " __stringify(HYPERPRIVOP_GET_ ## privop) "\n" \ + "(p6) br.cond.sptk.many b6\n" \ + ";;\n" + +__DEFINE_FUNC(getreg, + __DEFINE_GET_REG(PSR, PSR) +#ifdef CONFIG_IA32_SUPPORT + __DEFINE_GET_REG(AR_EFLAG, EFLAG) +#endif + + /* get_itc */ + "mov r2 = " __stringify(_IA64_REG_AR_ITC) "\n" + ";;\n" + "cmp.eq p6, p0 = r2, r8\n" + ";;\n" + "(p6) br.cond.spnt xen_get_itc\n" + ";;\n" + + /* get itm */ + "mov r2 = " __stringify(_IA64_REG_CR_ITM) "\n" + ";;\n" + "cmp.eq p6, p0 = r2, r8\n" + ";;\n" + "(p6) br.cond.spnt xen_get_itm_with_offset\n" + ";;\n" + + __DEFINE_GET_REG(CR_IVR, IVR) + __DEFINE_GET_REG(CR_TPR, TPR) + + /* fall back */ + "movl r2 = ia64_native_getreg_func\n" + ";;\n" + "mov b7 = r2\n" + ";;\n" + "br.cond.sptk.many b7\n"); + +extern void xen_setreg(int regnum, unsigned long val); +#define __DEFINE_SET_REG(id, privop) \ + "mov r2 = " __stringify(_IA64_REG_ ## id) "\n" \ + ";;\n" \ + "cmp.eq p6, p0 = r2, r9\n" \ + ";;\n" \ + "(p6) break " __stringify(HYPERPRIVOP_ ## privop) "\n" \ + "(p6) br.cond.sptk.many b6\n" \ + ";;\n" + +__DEFINE_FUNC(setreg, + /* kr0 .. kr 7*/ + /* + * if (_IA64_REG_AR_KR0 <= regnum && + * regnum <= _IA64_REG_AR_KR7) { + * register __index asm ("r8") = regnum - _IA64_REG_AR_KR0 + * register __val asm ("r9") = val + * "break HYPERPRIVOP_SET_KR" + * } + */ + "mov r17 = r9\n" + "mov r2 = " __stringify(_IA64_REG_AR_KR0) "\n" + ";;\n" + "cmp.ge p6, p0 = r9, r2\n" + "sub r17 = r17, r2\n" + ";;\n" + "(p6) cmp.ge.unc p7, p0 = " + __stringify(_IA64_REG_AR_KR7) " - " __stringify(_IA64_REG_AR_KR0) + ", r17\n" + ";;\n" + "(p7) mov r9 = r8\n" + ";;\n" + "(p7) mov r8 = r17\n" + "(p7) break " __stringify(HYPERPRIVOP_SET_KR) "\n" + + /* set itm */ + "mov r2 = " __stringify(_IA64_REG_CR_ITM) "\n" + ";;\n" + "cmp.eq p6, p0 = r2, r8\n" + ";;\n" + "(p6) br.cond.spnt xen_set_itm_with_offset\n" + + /* set itc */ + "mov r2 = " __stringify(_IA64_REG_AR_ITC) "\n" + ";;\n" + "cmp.eq p6, p0 = r2, r8\n" + ";;\n" + "(p6) br.cond.spnt xen_set_itc\n" + +#ifdef CONFIG_IA32_SUPPORT + __DEFINE_SET_REG(AR_EFLAG, SET_EFLAG) +#endif + __DEFINE_SET_REG(CR_TPR, SET_TPR) + __DEFINE_SET_REG(CR_EOI, EOI) + + /* fall back */ + "movl r2 = ia64_native_setreg_func\n" + ";;\n" + "mov b7 = r2\n" + ";;\n" + "br.cond.sptk.many b7\n"); +#endif static const struct pv_cpu_ops xen_cpu_ops __initconst = { .fc = xen_fc, @@ -486,3 +902,252 @@ xen_setup_pv_ops(void) paravirt_cpu_asm_init(&xen_cpu_asm_switch); } + +#ifdef ASM_SUPPORTED +/*************************************************************************** + * binary pacthing + * pv_init_ops.patch_bundle + */ + +#define DEFINE_FUNC_GETREG(name, privop) \ + DEFINE_FUNC0(get_ ## name, \ + "break "__stringify(HYPERPRIVOP_GET_ ## privop) "\n") + +DEFINE_FUNC_GETREG(psr, PSR); +DEFINE_FUNC_GETREG(eflag, EFLAG); +DEFINE_FUNC_GETREG(ivr, IVR); +DEFINE_FUNC_GETREG(tpr, TPR); + +#define DEFINE_FUNC_SET_KR(n) \ + DEFINE_VOID_FUNC0(set_kr ## n, \ + ";;\n" \ + "mov r9 = r8\n" \ + "mov r8 = " #n "\n" \ + "break " __stringify(HYPERPRIVOP_SET_KR) "\n") + +DEFINE_FUNC_SET_KR(0); +DEFINE_FUNC_SET_KR(1); +DEFINE_FUNC_SET_KR(2); +DEFINE_FUNC_SET_KR(3); +DEFINE_FUNC_SET_KR(4); +DEFINE_FUNC_SET_KR(5); +DEFINE_FUNC_SET_KR(6); +DEFINE_FUNC_SET_KR(7); + +#define __DEFINE_FUNC_SETREG(name, privop) \ + DEFINE_VOID_FUNC0(name, \ + "break "__stringify(HYPERPRIVOP_ ## privop) "\n") + +#define DEFINE_FUNC_SETREG(name, privop) \ + __DEFINE_FUNC_SETREG(set_ ## name, SET_ ## privop) + +DEFINE_FUNC_SETREG(eflag, EFLAG); +DEFINE_FUNC_SETREG(tpr, TPR); +__DEFINE_FUNC_SETREG(eoi, EOI); + +extern const char xen_check_events[]; +extern const char __xen_intrin_local_irq_restore_direct_start[]; +extern const char __xen_intrin_local_irq_restore_direct_end[]; +extern const unsigned long __xen_intrin_local_irq_restore_direct_reloc; + +asm ( + ".align 32\n" + ".proc xen_check_events\n" + "xen_check_events:\n" + /* masked = 0 + * r9 = masked_addr - 1 + * = pending_intr_addr + */ + "st1.rel [r9] = r0, -1\n" + ";;\n" + /* r8 = pending_intr */ + "ld1.acq r11 = [r9]\n" + ";;\n" + /* p9 = interrupt pending? */ + "cmp.ne p9, p10 = r11, r0\n" + ";;\n" + "(p10) mf\n" + /* issue hypercall to trigger interrupt */ + "(p9) break " __stringify(HYPERPRIVOP_SSM_I) "\n" + "br.cond.sptk.many b6\n" + ".endp xen_check_events\n" + "\n" + ".align 32\n" + ".proc __xen_intrin_local_irq_restore_direct\n" + "__xen_intrin_local_irq_restore_direct:\n" + "__xen_intrin_local_irq_restore_direct_start:\n" + "1:\n" + "{\n" + "cmp.ne p6, p7 = r8, r0\n" + "mov r17 = ip\n" /* get ip to calc return address */ + "mov r9 = "__stringify(XEN_PSR_I_ADDR_ADDR) "\n" + ";;\n" + "}\n" + "{\n" + /* r9 = XEN_PSR_I_ADDR */ + "ld8 r9 = [r9]\n" + ";;\n" + /* r10 = masked previous value */ + "(p6) ld1.acq r10 = [r9]\n" + "adds r17 = 1f - 1b, r17\n" /* calculate return address */ + ";;\n" + "}\n" + "{\n" + /* p8 = !masked interrupt masked previously? */ + "(p6) cmp.ne.unc p8, p0 = r10, r0\n" + "\n" + /* p7 = else clause */ + "(p7) mov r11 = 1\n" + ";;\n" + "(p8) mov b6 = r17\n" /* set return address */ + "}\n" + "{\n" + /* masked = 1 */ + "(p7) st1.rel [r9] = r11\n" + "\n" + "[99:]\n" + "(p8) brl.cond.dptk.few xen_check_events\n" + "}\n" + /* pv calling stub is 5 bundles. fill nop to adjust return address */ + "{\n" + "nop 0\n" + "nop 0\n" + "nop 0\n" + "}\n" + "1:\n" + "__xen_intrin_local_irq_restore_direct_end:\n" + ".endp __xen_intrin_local_irq_restore_direct\n" + "\n" + ".align 8\n" + "__xen_intrin_local_irq_restore_direct_reloc:\n" + "data8 99b\n" +); + +static struct paravirt_patch_bundle_elem xen_patch_bundle_elems[] +__initdata_or_module = +{ +#define XEN_PATCH_BUNDLE_ELEM(name, type) \ + { \ + (void*)xen_ ## name ## _direct_start, \ + (void*)xen_ ## name ## _direct_end, \ + PARAVIRT_PATCH_TYPE_ ## type, \ + } + + XEN_PATCH_BUNDLE_ELEM(fc, FC), + XEN_PATCH_BUNDLE_ELEM(thash, THASH), + XEN_PATCH_BUNDLE_ELEM(get_cpuid, GET_CPUID), + XEN_PATCH_BUNDLE_ELEM(get_pmd, GET_PMD), + XEN_PATCH_BUNDLE_ELEM(ptcga, PTCGA), + XEN_PATCH_BUNDLE_ELEM(get_rr, GET_RR), + XEN_PATCH_BUNDLE_ELEM(set_rr, SET_RR), + XEN_PATCH_BUNDLE_ELEM(set_rr0_to_rr4, SET_RR0_TO_RR4), + XEN_PATCH_BUNDLE_ELEM(ssm_i, SSM_I), + XEN_PATCH_BUNDLE_ELEM(rsm_i, RSM_I), + XEN_PATCH_BUNDLE_ELEM(get_psr_i, GET_PSR_I), + { + (void*)__xen_intrin_local_irq_restore_direct_start, + (void*)__xen_intrin_local_irq_restore_direct_end, + PARAVIRT_PATCH_TYPE_INTRIN_LOCAL_IRQ_RESTORE, + }, + +#define XEN_PATCH_BUNDLE_ELEM_GETREG(name, reg) \ + { \ + xen_get_ ## name ## _direct_start, \ + xen_get_ ## name ## _direct_end, \ + PARAVIRT_PATCH_TYPE_GETREG + _IA64_REG_ ## reg, \ + } + + XEN_PATCH_BUNDLE_ELEM_GETREG(psr, PSR), + XEN_PATCH_BUNDLE_ELEM_GETREG(eflag, AR_EFLAG), + + XEN_PATCH_BUNDLE_ELEM_GETREG(ivr, CR_IVR), + XEN_PATCH_BUNDLE_ELEM_GETREG(tpr, CR_TPR), + + XEN_PATCH_BUNDLE_ELEM_GETREG(itc, AR_ITC), + XEN_PATCH_BUNDLE_ELEM_GETREG(itm_with_offset, CR_ITM), + + +#define __XEN_PATCH_BUNDLE_ELEM_SETREG(name, reg) \ + { \ + xen_ ## name ## _direct_start, \ + xen_ ## name ## _direct_end, \ + PARAVIRT_PATCH_TYPE_SETREG + _IA64_REG_ ## reg, \ + } + +#define XEN_PATCH_BUNDLE_ELEM_SETREG(name, reg) \ + __XEN_PATCH_BUNDLE_ELEM_SETREG(set_ ## name, reg) + + XEN_PATCH_BUNDLE_ELEM_SETREG(kr0, AR_KR0), + XEN_PATCH_BUNDLE_ELEM_SETREG(kr1, AR_KR1), + XEN_PATCH_BUNDLE_ELEM_SETREG(kr2, AR_KR2), + XEN_PATCH_BUNDLE_ELEM_SETREG(kr3, AR_KR3), + XEN_PATCH_BUNDLE_ELEM_SETREG(kr4, AR_KR4), + XEN_PATCH_BUNDLE_ELEM_SETREG(kr5, AR_KR5), + XEN_PATCH_BUNDLE_ELEM_SETREG(kr6, AR_KR6), + XEN_PATCH_BUNDLE_ELEM_SETREG(kr7, AR_KR7), + + XEN_PATCH_BUNDLE_ELEM_SETREG(eflag, AR_EFLAG), + XEN_PATCH_BUNDLE_ELEM_SETREG(tpr, CR_TPR), + __XEN_PATCH_BUNDLE_ELEM_SETREG(eoi, CR_EOI), + + XEN_PATCH_BUNDLE_ELEM_SETREG(itc, AR_ITC), + XEN_PATCH_BUNDLE_ELEM_SETREG(itm_with_offset, CR_ITM), +}; + +static unsigned long __init_or_module +xen_patch_bundle(void *sbundle, void *ebundle, unsigned long type) +{ + const unsigned long nelems = sizeof(xen_patch_bundle_elems) / + sizeof(xen_patch_bundle_elems[0]); + unsigned long used; + const struct paravirt_patch_bundle_elem *found; + + used = __paravirt_patch_apply_bundle(sbundle, ebundle, type, + xen_patch_bundle_elems, nelems, + &found); + + if (found == NULL) + /* fallback */ + return ia64_native_patch_bundle(sbundle, ebundle, type); + if (used == 0) + return used; + + /* relocation */ + switch (type) { + case PARAVIRT_PATCH_TYPE_INTRIN_LOCAL_IRQ_RESTORE: { + unsigned long reloc = + __xen_intrin_local_irq_restore_direct_reloc; + unsigned long reloc_offset = reloc - (unsigned long) + __xen_intrin_local_irq_restore_direct_start; + unsigned long tag = (unsigned long)sbundle + reloc_offset; + paravirt_patch_reloc_brl(tag, xen_check_events); + break; + } + default: + /* nothing */ + break; + } + return used; +} +#endif /* ASM_SUPPOTED */ + +const struct paravirt_patch_branch_target xen_branch_target[] +__initconst = { +#define PARAVIRT_BR_TARGET(name, type) \ + { \ + &xen_ ## name, \ + PARAVIRT_PATCH_TYPE_BR_ ## type, \ + } + PARAVIRT_BR_TARGET(switch_to, SWITCH_TO), + PARAVIRT_BR_TARGET(leave_syscall, LEAVE_SYSCALL), + PARAVIRT_BR_TARGET(work_processed_syscall, WORK_PROCESSED_SYSCALL), + PARAVIRT_BR_TARGET(leave_kernel, LEAVE_KERNEL), +}; + +static void __init +xen_patch_branch(unsigned long tag, unsigned long type) +{ + const unsigned long nelem = + sizeof(xen_branch_target) / sizeof(xen_branch_target[0]); + __paravirt_patch_apply_branch(tag, type, xen_branch_target, nelem); +} From 853346e4354c948b50a6fb0002f8af2cf5fbf2ae Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Sat, 21 Mar 2009 22:05:11 +0800 Subject: [PATCH 184/478] PCI: fix conflict between SR-IOV and config space sizing New pci_cfg_space_size() needs invalid pdev->class, put it in the right place in the pci_setup_device(). Signed-off-by: Yu Zhao Signed-off-by: Jesse Barnes --- drivers/pci/probe.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index 56c71e585f3d..e2f3dd098cfa 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -713,7 +713,6 @@ int pci_setup_device(struct pci_dev *dev) dev->dev.bus = &pci_bus_type; dev->hdr_type = hdr_type & 0x7f; dev->multifunction = !!(hdr_type & 0x80); - dev->cfg_size = pci_cfg_space_size(dev); dev->error_state = pci_channel_io_normal; set_pcie_port_type(dev); @@ -738,6 +737,9 @@ int pci_setup_device(struct pci_dev *dev) dev_dbg(&dev->dev, "found [%04x:%04x] class %06x header type %02x\n", dev->vendor, dev->device, class, dev->hdr_type); + /* need to have dev->class ready */ + dev->cfg_size = pci_cfg_space_size(dev); + /* "Unknown power state" */ dev->current_state = PCI_UNKNOWN; @@ -959,9 +961,6 @@ static struct pci_dev *pci_scan_device(struct pci_bus *bus, int devfn) return NULL; } - /* need to have dev->class ready */ - dev->cfg_size = pci_cfg_space_size(dev); - return dev; } From 7ae0567fd3f4f51d55c4c638ecc6836347992de2 Mon Sep 17 00:00:00 2001 From: Kenji Kaneshige Date: Thu, 26 Mar 2009 16:49:52 +0900 Subject: [PATCH 185/478] PCI: fix kernel oops on bridge removal Fix the following kernel oops problem that happens when removing PCI bridge with pciehp loaded. It should also occur with other hotplug driver that is implemented as a bridge's driver. [ 459.997257] pciehp 0000:2f:04.0:pcie24: unloading service driver pciehp [ 459.997495] general protection fault: 0000 [#1] SMP [ 459.997737] last sysfs file: /sys/devices/pci0000:00/0000:00:04.0/0000:2e:00.0/0000:2f:04.0/remove [ 459.997964] CPU 4 [ 459.998129] Modules linked in: pciehp ipv6 autofs4 hidp rfcomm l2cap bluetooth sunrpc cpufreq_ondemand acpi_cpufreq dm_mirror dm_region_hash dm_log dm_multipath scsi_dh dm_mod sbs sbshc battery ac parport_pc lp parport mptspi mptscsih mptbase scsi_transport_spi e1000e sg sr_mod cdrom button serio_raw i2c_i801 i2c_core shpchp pcspkr ata_piix libata megaraid_sas sd_mod scsi_mod crc_t10dif ext3 jbd uhci_hcd ohci_hcd ehci_hcd [last unloaded: microcode] [ 459.998129] Pid: 56, comm: events/4 Not tainted 2.6.29-rc8-kk #1 PRIMERGY [ 459.998129] RIP: 0010:[] [] pci_slot_release+0x37/0x100 [ 459.998129] RSP: 0018:ffff88083b3bf9e0 EFLAGS: 00010246 [ 459.998129] RAX: ffff88083adc5158 RBX: ffff880836c1bc80 RCX: 6b6b6b6b6b6b6b6b [ 459.998129] RDX: 0000000000000000 RSI: ffffffff803a77f0 RDI: ffff880836c1bc48 [ 459.998129] RBP: ffff88083b3bfa00 R08: 0000000000000002 R09: 0000000000000000 [ 459.998129] R10: 0000000000000000 R11: 0000000000000000 R12: ffff880836c1bc48 [ 459.998129] R13: ffff880836c1bc20 R14: ffff880836c1bc48 R15: ffff880836d1ec38 [ 459.998129] FS: 0000000000000000(0000) GS:ffff88083ccc3770(0000) knlGS:0000000000000000 [ 459.998129] CS: 0010 DS: 0018 ES: 0018 CR0: 000000008005003b [ 459.998129] CR2: 00007f1562f1d558 CR3: 0000000838090000 CR4: 00000000000006e0 [ 459.998129] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 459.998129] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 [ 459.998129] Process events/4 (pid: 56, threadinfo ffff88083b3be000, task ffff88083b3b3e40) [ 459.998129] Stack: [ 459.998129] ffff880836c1bc80 ffff880836c1bc48 ffffffff80793320 ffff88083b0d0960 [ 459.998129] ffff88083b3bfa30 ffffffff803a788a ffff880836c1bc80 ffffffff803a77f0 [ 459.998129] ffff880836c1bc20 ffff880836d1ec38 ffff88083b3bfa50 ffffffff803a8ce7 [ 459.998129] Call Trace: [ 459.998129] [] kobject_release+0x9a/0x290 [ 459.998129] [] ? kobject_release+0x0/0x290 [ 459.998129] [] kref_put+0x37/0x80 [ 459.998129] [] kobject_put+0x27/0x60 [ 459.998129] [] ? pci_destroy_slot+0x3c/0xc0 [ 459.998129] [] pci_destroy_slot+0x45/0xc0 [ 459.998129] [] pci_hp_deregister+0x13d/0x210 [ 459.998129] [] cleanup_slots+0x2d/0x80 [pciehp] [ 459.998129] [] pciehp_remove+0x15/0x30 [pciehp] [ 459.998129] [] pcie_port_remove_service+0x69/0x90 [ 459.998129] [] __device_release_driver+0x59/0x90 [ 459.998129] [] device_release_driver+0x2b/0x40 [ 459.998129] [] bus_remove_device+0xa6/0x120 [ 459.998129] [] device_del+0x12b/0x190 [ 459.998129] [] ? remove_iter+0x0/0x40 [ 459.998129] [] device_unregister+0x26/0x70 [ 459.998129] [] remove_iter+0x2f/0x40 [ 459.998129] [] device_for_each_child+0x33/0x60 [ 459.998129] [] ? sysfs_schedule_callback_work+0x0/0x50 [ 459.998129] [] pcie_port_device_remove+0x30/0x80 [ 459.998129] [] pcie_portdrv_remove+0x11/0x20 [ 459.998129] [] pci_device_remove+0x32/0x70 [ 459.998129] [] __device_release_driver+0x59/0x90 [ 459.998129] [] device_release_driver+0x2b/0x40 [ 459.998129] [] bus_remove_device+0xa6/0x120 [ 459.998129] [] device_del+0x12b/0x190 [ 459.998129] [] device_unregister+0x26/0x70 [ 459.998129] [] pci_stop_dev+0x49/0x60 [ 459.998129] [] pci_remove_bus_device+0x40/0xc0 [ 459.998129] [] remove_callback+0x29/0x40 [ 459.998129] [] sysfs_schedule_callback_work+0x1f/0x50 [ 459.998129] [] run_workqueue+0x15a/0x230 [ 459.998129] [] ? run_workqueue+0x108/0x230 [ 459.998129] [] worker_thread+0x9f/0x100 [ 459.998129] [] ? autoremove_wake_function+0x0/0x40 [ 459.998129] [] ? worker_thread+0x0/0x100 [ 459.998129] [] kthread+0x4d/0x80 [ 459.998129] [] child_rip+0xa/0x20 [ 459.998129] [] ? restore_args+0x0/0x30 [ 459.998129] [] ? kthread+0x0/0x80 [ 459.998129] [] ? child_rip+0x0/0x20 [ 459.998129] Code: 56 49 89 fe 41 55 4c 8d 6f d8 41 54 53 74 09 f6 05 b8 05 c7 00 08 75 72 49 8b 45 00 48 8b 48 28 eb 05 66 90 48 89 f1 49 8b 45 00 <48> 8b 31 48 83 c0 28 0f 18 0e 48 39 c1 74 1c 8b 41 38 41 0f b6 [ 459.998129] RIP [] pci_slot_release+0x37/0x100 [ 459.998129] RSP [ 460.018595] ---[ end trace 5a08d2095374aedc ]--- The pci_remove_bus_device() removes all buses and devices under the bridge, and then removes the bridge. So the remove() callback of the hotplug drivers implemented as a bridge's driver is executed after the struct pci_bus of the bridge's secondary bus is removed. The remove() callback of those driver unregisters the slot using pci_destroy_slot(), and slot's release callback refers to the the struct pci_bus that was already freed. This is the cause of the kernel oops. This patch solves the problem by stopping bus drivers before removing the bridge and its child bus and devices. Acked-by: Alex Chiang Signed-off-by: Kenji Kaneshige Signed-off-by: Jesse Barnes --- drivers/pci/remove.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/pci/remove.c b/drivers/pci/remove.c index caf8e1eae45e..86503c14ce7e 100644 --- a/drivers/pci/remove.c +++ b/drivers/pci/remove.c @@ -95,6 +95,7 @@ EXPORT_SYMBOL(pci_remove_bus); */ void pci_remove_bus_device(struct pci_dev *dev) { + pci_stop_bus_device(dev); if (dev->subordinate) { struct pci_bus *b = dev->subordinate; From 7bb2cb3e90dc49be1cd14956c155451499c857a7 Mon Sep 17 00:00:00 2001 From: Stephen Rothwell Date: Thu, 26 Mar 2009 18:34:33 +1100 Subject: [PATCH 186/478] PCI: update fakephp for bus_id removal Get rid of a new use of bus_id that snuck in. Signed-off-by: Stephen Rothwell Signed-off-by: Jesse Barnes --- drivers/pci/hotplug/fakephp.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/pci/hotplug/fakephp.c b/drivers/pci/hotplug/fakephp.c index 2dc7828df480..6151389fd903 100644 --- a/drivers/pci/hotplug/fakephp.c +++ b/drivers/pci/hotplug/fakephp.c @@ -18,6 +18,7 @@ #include #include #include +#include #include "../pci.h" struct legacy_slot { @@ -88,7 +89,7 @@ static int legacy_add_slot(struct pci_dev *pdev) if (kobject_init_and_add(&slot->kobj, &legacy_ktype, &pci_slots_kset->kobj, "%s", - pdev->dev.bus_id)) { + dev_name(&pdev->dev))) { dev_warn(&pdev->dev, "Failed to created legacy fake slot\n"); return -EINVAL; } From 898585172fa729513d8636257b44bd1cfd279096 Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Mon, 16 Feb 2009 02:55:47 +0800 Subject: [PATCH 187/478] PCI: save and restore PCIe 2.0 registers PCIe 2.0 defines several new registers (Device Control 2, Link Control 2, and Slot Control 2). Save and retore them in pci_save_pcie_state() and pci_restore_pcie_state(). Signed-off-by: Yu Zhao Signed-off-by: Jesse Barnes --- drivers/pci/pci.c | 11 ++++++++++- include/linux/pci_regs.h | 2 ++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index 676bbcbc272b..59569b8cf1d5 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -647,6 +647,8 @@ pci_power_t pci_choose_state(struct pci_dev *dev, pm_message_t state) EXPORT_SYMBOL(pci_choose_state); +#define PCI_EXP_SAVE_REGS 7 + static int pci_save_pcie_state(struct pci_dev *dev) { int pos, i = 0; @@ -668,6 +670,9 @@ static int pci_save_pcie_state(struct pci_dev *dev) pci_read_config_word(dev, pos + PCI_EXP_LNKCTL, &cap[i++]); pci_read_config_word(dev, pos + PCI_EXP_SLTCTL, &cap[i++]); pci_read_config_word(dev, pos + PCI_EXP_RTCTL, &cap[i++]); + pci_read_config_word(dev, pos + PCI_EXP_DEVCTL2, &cap[i++]); + pci_read_config_word(dev, pos + PCI_EXP_LNKCTL2, &cap[i++]); + pci_read_config_word(dev, pos + PCI_EXP_SLTCTL2, &cap[i++]); return 0; } @@ -688,6 +693,9 @@ static void pci_restore_pcie_state(struct pci_dev *dev) pci_write_config_word(dev, pos + PCI_EXP_LNKCTL, cap[i++]); pci_write_config_word(dev, pos + PCI_EXP_SLTCTL, cap[i++]); pci_write_config_word(dev, pos + PCI_EXP_RTCTL, cap[i++]); + pci_write_config_word(dev, pos + PCI_EXP_DEVCTL2, cap[i++]); + pci_write_config_word(dev, pos + PCI_EXP_LNKCTL2, cap[i++]); + pci_write_config_word(dev, pos + PCI_EXP_SLTCTL2, cap[i++]); } @@ -1372,7 +1380,8 @@ void pci_allocate_cap_save_buffers(struct pci_dev *dev) { int error; - error = pci_add_cap_save_buffer(dev, PCI_CAP_ID_EXP, 4 * sizeof(u16)); + error = pci_add_cap_save_buffer(dev, PCI_CAP_ID_EXP, + PCI_EXP_SAVE_REGS * sizeof(u16)); if (error) dev_err(&dev->dev, "unable to preallocate PCI Express save buffer\n"); diff --git a/include/linux/pci_regs.h b/include/linux/pci_regs.h index d4e663877f45..e4d08c1b2e0b 100644 --- a/include/linux/pci_regs.h +++ b/include/linux/pci_regs.h @@ -488,6 +488,8 @@ #define PCI_EXP_DEVCAP2_ARI 0x20 /* Alternative Routing-ID */ #define PCI_EXP_DEVCTL2 40 /* Device Control 2 */ #define PCI_EXP_DEVCTL2_ARI 0x20 /* Alternative Routing-ID */ +#define PCI_EXP_LNKCTL2 48 /* Link Control 2 */ +#define PCI_EXP_SLTCTL2 56 /* Slot Control 2 */ /* Extended Capabilities (PCI-X 2.0 and Express) */ #define PCI_EXT_CAP_ID(header) (header & 0x0000ffff) From e42d1fe804408c57506a5326252b4db29958a7fb Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 25 Mar 2009 13:26:13 -0700 Subject: [PATCH 188/478] x86/PCI: make pci=lastbus=255 work when acpi is on Impact: scan more peer root buses even acpi is used Move pci_bios_fixup_peer_bridges out of pci_legacy_init and into pci_subsys_init. This allows pci_bios_fixup_peer_bridges to be called even pci_apci_init is driving PCI initialization. Signed-off-by: Yinghai Lu Signed-off-by: Jesse Barnes --- arch/x86/pci/legacy.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/x86/pci/legacy.c b/arch/x86/pci/legacy.c index f1065b129e9c..4061bb0f267d 100644 --- a/arch/x86/pci/legacy.c +++ b/arch/x86/pci/legacy.c @@ -50,8 +50,6 @@ static int __init pci_legacy_init(void) if (pci_root_bus) pci_bus_add_devices(pci_root_bus); - pcibios_fixup_peer_bridges(); - return 0; } @@ -67,6 +65,7 @@ int __init pci_subsys_init(void) pci_visws_init(); #endif pci_legacy_init(); + pcibios_fixup_peer_bridges(); pcibios_irq_init(); pcibios_init(); From de7453065d5d5243686467998f1fcc58d20e0a7c Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 20 Mar 2009 19:29:41 -0700 Subject: [PATCH 189/478] PCI: don't enable too much HT MSI mapping Impact: fix bug Prakash reported that his c51-mcp51 system ondie sound card doesn't work MSI but if he hack out the HT-MSI on mcp51, the MSI will work well with sound card. This patch reworks nv_msi_ht_cap_quirk() and will only avoid enabling ht_msi on devices following that root device. Reported-by: Prakash Punnoor Signed-off-by: Yinghai Lu Signed-off-by: Jesse Barnes --- drivers/pci/quirks.c | 118 ++++++++++++++++++++++++++++++------------- 1 file changed, 82 insertions(+), 36 deletions(-) diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c index 7ddcfc65e790..faf02dd00015 100644 --- a/drivers/pci/quirks.c +++ b/drivers/pci/quirks.c @@ -2147,6 +2147,65 @@ DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_NVENET_15, nvenet_msi_disable); +static int __devinit ht_check_msi_mapping(struct pci_dev *dev) +{ + int pos, ttl = 48; + int found = 0; + + /* check if there is HT MSI cap or enabled on this device */ + pos = pci_find_ht_capability(dev, HT_CAPTYPE_MSI_MAPPING); + while (pos && ttl--) { + u8 flags; + + if (found < 1) + found = 1; + if (pci_read_config_byte(dev, pos + HT_MSI_FLAGS, + &flags) == 0) { + if (flags & HT_MSI_FLAGS_ENABLE) { + if (found < 2) { + found = 2; + break; + } + } + } + pos = pci_find_next_ht_capability(dev, pos, + HT_CAPTYPE_MSI_MAPPING); + } + + return found; +} + +static int __devinit host_bridge_with_leaf(struct pci_dev *host_bridge) +{ + struct pci_dev *dev; + int pos; + int i, dev_no; + int found = 0; + + dev_no = host_bridge->devfn >> 3; + for (i = dev_no + 1; i < 0x20; i++) { + dev = pci_get_slot(host_bridge->bus, PCI_DEVFN(i, 0)); + if (!dev) + continue; + + /* found next host bridge ?*/ + pos = pci_find_ht_capability(dev, HT_CAPTYPE_SLAVE); + if (pos != 0) { + pci_dev_put(dev); + break; + } + + if (ht_check_msi_mapping(dev)) { + found = 1; + pci_dev_put(dev); + break; + } + pci_dev_put(dev); + } + + return found; +} + static void __devinit nv_ht_enable_msi_mapping(struct pci_dev *dev) { struct pci_dev *host_bridge; @@ -2171,6 +2230,10 @@ static void __devinit nv_ht_enable_msi_mapping(struct pci_dev *dev) if (!found) return; + /* don't enable host_bridge with leaf directly here */ + if (host_bridge == dev && host_bridge_with_leaf(host_bridge)) + goto out; + /* root did that ! */ if (msi_ht_cap_enabled(host_bridge)) goto out; @@ -2201,44 +2264,12 @@ static void __devinit ht_disable_msi_mapping(struct pci_dev *dev) } } -static int __devinit ht_check_msi_mapping(struct pci_dev *dev) -{ - int pos, ttl = 48; - int found = 0; - - /* check if there is HT MSI cap or enabled on this device */ - pos = pci_find_ht_capability(dev, HT_CAPTYPE_MSI_MAPPING); - while (pos && ttl--) { - u8 flags; - - if (found < 1) - found = 1; - if (pci_read_config_byte(dev, pos + HT_MSI_FLAGS, - &flags) == 0) { - if (flags & HT_MSI_FLAGS_ENABLE) { - if (found < 2) { - found = 2; - break; - } - } - } - pos = pci_find_next_ht_capability(dev, pos, - HT_CAPTYPE_MSI_MAPPING); - } - - return found; -} - -static void __devinit nv_msi_ht_cap_quirk(struct pci_dev *dev) +static void __devinit __nv_msi_ht_cap_quirk(struct pci_dev *dev, int all) { struct pci_dev *host_bridge; int pos; int found; - /* Enabling HT MSI mapping on this device breaks MCP51 */ - if (dev->device == 0x270) - return; - /* check if there is HT MSI cap or enabled on this device */ found = ht_check_msi_mapping(dev); @@ -2262,7 +2293,10 @@ static void __devinit nv_msi_ht_cap_quirk(struct pci_dev *dev) /* Host bridge is to HT */ if (found == 1) { /* it is not enabled, try to enable it */ - nv_ht_enable_msi_mapping(dev); + if (all) + ht_enable_msi_mapping(dev); + else + nv_ht_enable_msi_mapping(dev); } return; } @@ -2274,8 +2308,20 @@ static void __devinit nv_msi_ht_cap_quirk(struct pci_dev *dev) /* Host bridge is not to HT, disable HT MSI mapping on this device */ ht_disable_msi_mapping(dev); } -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_NVIDIA, PCI_ANY_ID, nv_msi_ht_cap_quirk); -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AL, PCI_ANY_ID, nv_msi_ht_cap_quirk); + +static void __devinit nv_msi_ht_cap_quirk_all(struct pci_dev *dev) +{ + return __nv_msi_ht_cap_quirk(dev, 1); +} + +static void __devinit nv_msi_ht_cap_quirk_leaf(struct pci_dev *dev) +{ + return __nv_msi_ht_cap_quirk(dev, 0); +} + +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_NVIDIA, PCI_ANY_ID, nv_msi_ht_cap_quirk_leaf); + +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AL, PCI_ANY_ID, nv_msi_ht_cap_quirk_all); static void __devinit quirk_msi_intx_disable_bug(struct pci_dev *dev) { From b46a0b08b8bdf6467cd2b49f520e100c72885302 Mon Sep 17 00:00:00 2001 From: Isaku Yamahata Date: Fri, 27 Mar 2009 15:10:33 +0900 Subject: [PATCH 190/478] ia64/xen: fix the link error. This patch fixes the following link error with xen_domu_defconfig. Depending on compiler version, it doesn't link as follows. So remove const and use __initdata for xen_iosapic_ops. > arch/ia64/xen/xen_pv_ops.c:878: error: xen_iosapic_ops causes a section type conflict Signed-off-by: Isaku Yamahata Signed-off-by: Tony Luck --- arch/ia64/xen/xen_pv_ops.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/ia64/xen/xen_pv_ops.c b/arch/ia64/xen/xen_pv_ops.c index 6c44225e7b84..bf3c74cb150f 100644 --- a/arch/ia64/xen/xen_pv_ops.c +++ b/arch/ia64/xen/xen_pv_ops.c @@ -875,7 +875,7 @@ xen_iosapic_write(char __iomem *iosapic, unsigned int reg, u32 val) HYPERVISOR_physdev_op(PHYSDEVOP_apic_write, &apic_op); } -static const struct pv_iosapic_ops xen_iosapic_ops __initconst = { +static struct pv_iosapic_ops xen_iosapic_ops __initdata = { .pcat_compat_init = xen_pcat_compat_init, .__get_irq_chip = xen_iosapic_get_irq_chip, From 7120569c76028a6883697b7643564f0c419cfe07 Mon Sep 17 00:00:00 2001 From: Isaku Yamahata Date: Fri, 27 Mar 2009 15:11:57 +0900 Subject: [PATCH 191/478] ia64: remove some warnings. This patch removes the following warnings and related ones. Plus some cosmetics. arch/ia64/kernel/patch.c:112: warning: passing argument 1 of 'paravirt_fc' makes integer from pointer without a cast arch/ia64/kernel/patch.c:135: warning: passing argument 1 of 'paravirt_fc' makes integer from pointer without a cast arch/ia64/kernel/patch.c:166: warning: passing argument 1 of 'paravirt_fc' makes integer from pointer without a cast arch/ia64/kernel/patch.c:202: warning: passing argument 1 of 'paravirt_fc' makes integer from pointer without a cast arch/ia64/kernel/patch.c:220: warning: passing argument 1 of 'paravirt_fc' makes integer from pointer without a cast Signed-off-by: Isaku Yamahata Signed-off-by: Tony Luck --- arch/ia64/include/asm/paravirt_privop.h | 22 +++++++++++++++++++--- arch/ia64/include/asm/xen/privop.h | 2 +- arch/ia64/kernel/paravirt.c | 22 +++++++++++++++++----- arch/ia64/kernel/patch.c | 2 +- arch/ia64/kvm/kvm-ia64.c | 2 +- arch/ia64/kvm/vcpu.c | 2 +- arch/ia64/xen/xen_pv_ops.c | 9 +++++++-- 7 files changed, 47 insertions(+), 14 deletions(-) diff --git a/arch/ia64/include/asm/paravirt_privop.h b/arch/ia64/include/asm/paravirt_privop.h index 4e40e62c4ab8..3d2951130b5f 100644 --- a/arch/ia64/include/asm/paravirt_privop.h +++ b/arch/ia64/include/asm/paravirt_privop.h @@ -33,7 +33,7 @@ */ struct pv_cpu_ops { - void (*fc)(unsigned long addr); + void (*fc)(void *addr); unsigned long (*thash)(unsigned long addr); unsigned long (*get_cpuid)(int index); unsigned long (*get_pmd)(int index); @@ -248,7 +248,7 @@ void paravirt_cpu_asm_init(const struct pv_cpu_asm_switch *cpu_asm_switch); "r15", "r16", "r17" #define PARAVIRT_REG_CLOBBERS1 \ - "r2","r3", /*"r8",*/ "r9", "r10", "r11", "r14", \ + "r2","r3", /*"r8",*/ "r9", "r10", "r11", "r14", \ "r15", "r16", "r17" #define PARAVIRT_REG_CLOBBERS2 \ @@ -330,6 +330,15 @@ void paravirt_cpu_asm_init(const struct pv_cpu_asm_switch *cpu_asm_switch); : PARAVIRT_OP(op), "0"(__##arg1) \ : PARAVIRT_CLOBBERS1) +#define PARAVIRT_BR1_VOID(op, type, arg1) \ + register void *__##arg1 asm ("r8") = arg1; \ + register unsigned long ia64_clobber asm ("r8"); \ + asm volatile (paravirt_alt_bundle(__PARAVIRT_BR, \ + PARAVIRT_TYPE(type)) \ + : "=r"(ia64_clobber) \ + : PARAVIRT_OP(op), "0"(__##arg1) \ + : PARAVIRT_CLOBBERS1) + #define PARAVIRT_BR2(op, type, arg1, arg2) \ register unsigned long __##arg1 asm ("r8") = arg1; \ register unsigned long __##arg2 asm ("r9") = arg2; \ @@ -357,6 +366,13 @@ void paravirt_cpu_asm_init(const struct pv_cpu_asm_switch *cpu_asm_switch); return ia64_intri_res; \ } +#define PARAVIRT_DEFINE_CPU_OP1_VOID(op, type) \ + static inline void \ + paravirt_ ## op (void *arg1) \ + { \ + PARAVIRT_BR1_VOID(op, type, arg1); \ + } + #define PARAVIRT_DEFINE_CPU_OP1(op, type) \ static inline void \ paravirt_ ## op (unsigned long arg1) \ @@ -381,7 +397,7 @@ void paravirt_cpu_asm_init(const struct pv_cpu_asm_switch *cpu_asm_switch); } -PARAVIRT_DEFINE_CPU_OP1(fc, FC); +PARAVIRT_DEFINE_CPU_OP1_VOID(fc, FC); PARAVIRT_DEFINE_CPU_OP1_RET(thash, THASH) PARAVIRT_DEFINE_CPU_OP1_RET(get_cpuid, GET_CPUID) PARAVIRT_DEFINE_CPU_OP1_RET(get_pmd, GET_PMD) diff --git a/arch/ia64/include/asm/xen/privop.h b/arch/ia64/include/asm/xen/privop.h index e5fbaeeb161a..fb4ec5e0b066 100644 --- a/arch/ia64/include/asm/xen/privop.h +++ b/arch/ia64/include/asm/xen/privop.h @@ -69,7 +69,7 @@ * may have different semantics depending on whether they are executed * at PL0 vs PL!=0. When paravirtualized, these instructions mustn't * be allowed to execute directly, lest incorrect semantics result. */ -extern void xen_fc(unsigned long addr); +extern void xen_fc(void *addr); extern unsigned long xen_thash(unsigned long addr); /* Note that "ttag" and "cover" are also privilege-sensitive; "ttag" diff --git a/arch/ia64/kernel/paravirt.c b/arch/ia64/kernel/paravirt.c index 158d52414e96..a21d7bb9c69c 100644 --- a/arch/ia64/kernel/paravirt.c +++ b/arch/ia64/kernel/paravirt.c @@ -70,7 +70,14 @@ struct pv_init_ops pv_init_ops = ia64_native_ ## name ## _func(unsigned long arg) \ { \ ia64_native_ ## name(arg); \ - } \ + } + +#define DEFINE_VOID_FUNC1_VOID(name) \ + static void \ + ia64_native_ ## name ## _func(void *arg) \ + { \ + ia64_native_ ## name(arg); \ + } #define DEFINE_VOID_FUNC2(name) \ static void \ @@ -78,7 +85,7 @@ struct pv_init_ops pv_init_ops = unsigned long arg1) \ { \ ia64_native_ ## name(arg0, arg1); \ - } \ + } #define DEFINE_FUNC0(name) \ static unsigned long \ @@ -94,7 +101,7 @@ struct pv_init_ops pv_init_ops = return ia64_native_ ## name(arg); \ } \ -DEFINE_VOID_FUNC1(fc); +DEFINE_VOID_FUNC1_VOID(fc); DEFINE_VOID_FUNC1(intrin_local_irq_restore); DEFINE_VOID_FUNC2(ptcga); @@ -308,6 +315,11 @@ ia64_native_setreg_func(int regnum, unsigned long val) ia64_native_ ## name ## _func(unsigned long arg); \ __DEFINE_FUNC(name, code) +#define DEFINE_VOID_FUNC1_VOID(name, code) \ + extern void \ + ia64_native_ ## name ## _func(void *arg); \ + __DEFINE_FUNC(name, code) + #define DEFINE_VOID_FUNC2(name, code) \ extern void \ ia64_native_ ## name ## _func(unsigned long arg0, \ @@ -324,8 +336,8 @@ ia64_native_setreg_func(int regnum, unsigned long val) ia64_native_ ## name ## _func(type arg); \ __DEFINE_FUNC(name, code) -DEFINE_VOID_FUNC1(fc, - "fc r8\n"); +DEFINE_VOID_FUNC1_VOID(fc, + "fc r8\n"); DEFINE_VOID_FUNC1(intrin_local_irq_restore, ";;\n" " cmp.ne p6, p7 = r8, r0\n" diff --git a/arch/ia64/kernel/patch.c b/arch/ia64/kernel/patch.c index 64c6f95daa34..68a1311db806 100644 --- a/arch/ia64/kernel/patch.c +++ b/arch/ia64/kernel/patch.c @@ -249,7 +249,7 @@ void ia64_patch_phys_stack_reg(unsigned long val) while (offp < end) { ip = (u64) offp + *offp; ia64_patch(ip, mask, imm); - ia64_fc(ip); + ia64_fc((void *)ip); ++offp; } ia64_sync_i(); diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c index 28f982045f29..0344c6664485 100644 --- a/arch/ia64/kvm/kvm-ia64.c +++ b/arch/ia64/kvm/kvm-ia64.c @@ -70,7 +70,7 @@ static void kvm_flush_icache(unsigned long start, unsigned long len) int l; for (l = 0; l < (len + 32); l += 32) - ia64_fc(start + l); + ia64_fc((void *)(start + l)); ia64_sync_i(); ia64_srlz_i(); diff --git a/arch/ia64/kvm/vcpu.c b/arch/ia64/kvm/vcpu.c index ecd526b55323..6839a52f7a41 100644 --- a/arch/ia64/kvm/vcpu.c +++ b/arch/ia64/kvm/vcpu.c @@ -390,7 +390,7 @@ void set_rse_reg(struct kvm_pt_regs *regs, unsigned long r1, else *rnat_addr = (*rnat_addr) & (~nat_mask); - ia64_setreg(_IA64_REG_AR_BSPSTORE, bspstore); + ia64_setreg(_IA64_REG_AR_BSPSTORE, (unsigned long)bspstore); ia64_setreg(_IA64_REG_AR_RNAT, rnat); } local_irq_restore(psr); diff --git a/arch/ia64/xen/xen_pv_ops.c b/arch/ia64/xen/xen_pv_ops.c index bf3c74cb150f..5e2270a999fa 100644 --- a/arch/ia64/xen/xen_pv_ops.c +++ b/arch/ia64/xen/xen_pv_ops.c @@ -416,6 +416,11 @@ xen_intrin_local_irq_restore(unsigned long mask) xen_ ## name (unsigned long arg); \ __DEFINE_FUNC(name, code) +#define DEFINE_VOID_FUNC1_VOID(name, code) \ + extern void \ + xen_ ## name (void *arg); \ + __DEFINE_FUNC(name, code) + #define DEFINE_VOID_FUNC2(name, code) \ extern void \ xen_ ## name (unsigned long arg0, \ @@ -530,8 +535,8 @@ DEFINE_FUNC0(get_itc, "(p6) hint @pause\n" "(p6) br.cond.spnt 888b\n"); -DEFINE_VOID_FUNC1(fc, - "break " __stringify(HYPERPRIVOP_FC) "\n"); +DEFINE_VOID_FUNC1_VOID(fc, + "break " __stringify(HYPERPRIVOP_FC) "\n"); /* * psr_i_addr_addr = XEN_PSR_I_ADDR_ADDR From fe2c8191faa29d7a09f4962198f6dfab973ceec4 Mon Sep 17 00:00:00 2001 From: Thiemo Nagel Date: Tue, 31 Mar 2009 08:36:10 -0400 Subject: [PATCH 192/478] ext4: add checks of block references for non-extent inodes Check block references in the inode and indorect blocks for non-extent inodes to make sure they are valid, and flag an error if they are invalid. Signed-off-by: Thiemo Nagel Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 59 +++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 52 insertions(+), 7 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index c7d9250fbc3f..b3fd65d46506 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -371,6 +371,34 @@ static int ext4_block_to_path(struct inode *inode, return n; } +static int __ext4_check_blockref(const char *function, struct inode *inode, + unsigned int *p, unsigned int max) { + + unsigned int maxblocks = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es); + unsigned int *bref = p; + while (bref < p+max) { + if (unlikely(*bref >= maxblocks)) { + ext4_error(inode->i_sb, function, + "block reference %u >= max (%u) " + "in inode #%lu, offset=%d", + *bref, maxblocks, + inode->i_ino, (int)(bref-p)); + return -EIO; + } + bref++; + } + return 0; +} + + +#define ext4_check_indirect_blockref(inode, bh) \ + __ext4_check_blockref(__func__, inode, (__le32 *)(bh)->b_data, \ + EXT4_ADDR_PER_BLOCK((inode)->i_sb)) + +#define ext4_check_inode_blockref(inode) \ + __ext4_check_blockref(__func__, inode, EXT4_I(inode)->i_data, \ + EXT4_NDIR_BLOCKS) + /** * ext4_get_branch - read the chain of indirect blocks leading to data * @inode: inode in question @@ -415,9 +443,22 @@ static Indirect *ext4_get_branch(struct inode *inode, int depth, if (!p->key) goto no_block; while (--depth) { - bh = sb_bread(sb, le32_to_cpu(p->key)); - if (!bh) + bh = sb_getblk(sb, le32_to_cpu(p->key)); + if (unlikely(!bh)) goto failure; + + if (!bh_uptodate_or_lock(bh)) { + if (bh_submit_read(bh) < 0) { + put_bh(bh); + goto failure; + } + /* validate block references */ + if (ext4_check_indirect_blockref(inode, bh)) { + put_bh(bh); + goto failure; + } + } + add_chain(++p, bh, (__le32 *)bh->b_data + *++offsets); /* Reader: end */ if (!p->key) @@ -4371,11 +4412,15 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) if (ei->i_flags & EXT4_EXTENTS_FL) { /* Validate extent which is part of inode */ ret = ext4_ext_check_inode(inode); - if (ret) { - brelse(bh); - goto bad_inode; - } - + } else if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || + (S_ISLNK(inode->i_mode) && + !ext4_inode_is_fast_symlink(inode))) { + /* Validate block references which are part of inode */ + ret = ext4_check_inode_blockref(inode); + } + if (ret) { + brelse(bh); + goto bad_inode; } if (S_ISREG(inode->i_mode)) { From cc0fb9ad7dbc5a149f4957a0dd6d65881d3d385b Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Fri, 27 Mar 2009 17:16:58 -0400 Subject: [PATCH 193/478] ext4: Rename pa_linear to pa_type Impact: code cleanup This patch rename pa_linear to pa_type and add MB_INODE_PA and MB_GROUP_PA to indicate inode and group prealloc space. Signed-off-by: Aneesh Kumar K.V Reviewed-by: Eric Sandeen Signed-off-by: "Theodore Ts'o" --- fs/ext4/mballoc.c | 21 ++++++++++++--------- fs/ext4/mballoc.h | 7 +++++-- 2 files changed, 17 insertions(+), 11 deletions(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index e72c72a0b807..7f6fc41a2dde 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -3555,8 +3555,11 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *ac, spin_unlock(&pa->pa_lock); grp_blk = pa->pa_pstart; - /* If linear, pa_pstart may be in the next group when pa is used up */ - if (pa->pa_linear) + /* + * If doing group-based preallocation, pa_pstart may be in the + * next group when pa is used up + */ + if (pa->pa_type == MB_GROUP_PA) grp_blk--; ext4_get_group_no_and_offset(sb, grp_blk, &grp, NULL); @@ -3651,7 +3654,7 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac) INIT_LIST_HEAD(&pa->pa_inode_list); INIT_LIST_HEAD(&pa->pa_group_list); pa->pa_deleted = 0; - pa->pa_linear = 0; + pa->pa_type = MB_INODE_PA; mb_debug("new inode pa %p: %llu/%u for %u\n", pa, pa->pa_pstart, pa->pa_len, pa->pa_lstart); @@ -3714,7 +3717,7 @@ ext4_mb_new_group_pa(struct ext4_allocation_context *ac) INIT_LIST_HEAD(&pa->pa_inode_list); INIT_LIST_HEAD(&pa->pa_group_list); pa->pa_deleted = 0; - pa->pa_linear = 1; + pa->pa_type = MB_GROUP_PA; mb_debug("new group pa %p: %llu/%u for %u\n", pa, pa->pa_pstart, pa->pa_len, pa->pa_lstart); @@ -3968,7 +3971,7 @@ repeat: list_del_rcu(&pa->pa_inode_list); spin_unlock(pa->pa_obj_lock); - if (pa->pa_linear) + if (pa->pa_type == MB_GROUP_PA) ext4_mb_release_group_pa(&e4b, pa, ac); else ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa, ac); @@ -4068,7 +4071,7 @@ repeat: spin_unlock(&ei->i_prealloc_lock); list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) { - BUG_ON(pa->pa_linear != 0); + BUG_ON(pa->pa_type != MB_INODE_PA); ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL); err = ext4_mb_load_buddy(sb, group, &e4b); @@ -4320,7 +4323,7 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb, continue; } /* only lg prealloc space */ - BUG_ON(!pa->pa_linear); + BUG_ON(pa->pa_type != MB_GROUP_PA); /* seems this one can be freed ... */ pa->pa_deleted = 1; @@ -4426,7 +4429,7 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac) { struct ext4_prealloc_space *pa = ac->ac_pa; if (pa) { - if (pa->pa_linear) { + if (pa->pa_type == MB_GROUP_PA) { /* see comment in ext4_mb_use_group_pa() */ spin_lock(&pa->pa_lock); pa->pa_pstart += ac->ac_b_ex.fe_len; @@ -4446,7 +4449,7 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac) * doesn't grow big. We need to release * alloc_semp before calling ext4_mb_add_n_trim() */ - if (pa->pa_linear && likely(pa->pa_free)) { + if ((pa->pa_type == MB_GROUP_PA) && likely(pa->pa_free)) { spin_lock(pa->pa_obj_lock); list_del_rcu(&pa->pa_inode_list); spin_unlock(pa->pa_obj_lock); diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h index 7acf5ef24537..dd9e6cd5f6cf 100644 --- a/fs/ext4/mballoc.h +++ b/fs/ext4/mballoc.h @@ -132,12 +132,15 @@ struct ext4_prealloc_space { ext4_lblk_t pa_lstart; /* log. block */ unsigned short pa_len; /* len of preallocated chunk */ unsigned short pa_free; /* how many blocks are free */ - unsigned short pa_linear; /* consumed in one direction - * strictly, for grp prealloc */ + unsigned short pa_type; /* pa type. inode or group */ spinlock_t *pa_obj_lock; struct inode *pa_inode; /* hack, for history only */ }; +enum { + MB_INODE_PA = 0, + MB_GROUP_PA = 1 +}; struct ext4_free_extent { ext4_lblk_t fe_logical; From 86db97c87f744364d5889ca8a4134ca2048b8f83 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 27 Mar 2009 17:20:40 -0400 Subject: [PATCH 194/478] jbd2: Update locking coments Update information about locking in JBD2 revoke code. Inconsistency in comments found by Lin Tan . CC: Lin Tan . Signed-off-by: Jan Kara Signed-off-by: "Theodore Ts'o" --- fs/jbd2/revoke.c | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c index 257ff2625765..bbe6d592d8b3 100644 --- a/fs/jbd2/revoke.c +++ b/fs/jbd2/revoke.c @@ -55,6 +55,25 @@ * need do nothing. * RevokeValid set, Revoked set: * buffer has been revoked. + * + * Locking rules: + * We keep two hash tables of revoke records. One hashtable belongs to the + * running transaction (is pointed to by journal->j_revoke), the other one + * belongs to the committing transaction. Accesses to the second hash table + * happen only from the kjournald and no other thread touches this table. Also + * journal_switch_revoke_table() which switches which hashtable belongs to the + * running and which to the committing transaction is called only from + * kjournald. Therefore we need no locks when accessing the hashtable belonging + * to the committing transaction. + * + * All users operating on the hash table belonging to the running transaction + * have a handle to the transaction. Therefore they are safe from kjournald + * switching hash tables under them. For operations on the lists of entries in + * the hash table j_revoke_lock is used. + * + * Finally, also replay code uses the hash tables but at this moment noone else + * can touch them (filesystem isn't mounted yet) and hence no locking is + * needed. */ #ifndef __KERNEL__ @@ -401,8 +420,6 @@ int jbd2_journal_revoke(handle_t *handle, unsigned long long blocknr, * the second time we would still have a pending revoke to cancel. So, * do not trust the Revoked bit on buffers unless RevokeValid is also * set. - * - * The caller must have the journal locked. */ int jbd2_journal_cancel_revoke(handle_t *handle, struct journal_head *jh) { @@ -480,10 +497,7 @@ void jbd2_journal_switch_revoke_table(journal_t *journal) /* * Write revoke records to the journal for all entries in the current * revoke hash, deleting the entries as we go. - * - * Called with the journal lock held. */ - void jbd2_journal_write_revoke_records(journal_t *journal, transaction_t *transaction) { From a7b19448ddbdc34b2b8fedc048ba154ca798667b Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 27 Mar 2009 19:42:54 -0400 Subject: [PATCH 195/478] ext4: fix typo which causes a memory leak on error path This was found by smatch (http://repo.or.cz/w/smatch.git/) Signed-off-by: Dan Carpenter Signed-off-by: "Theodore Ts'o" Cc: stable@kernel.org --- fs/ext4/mballoc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 7f6fc41a2dde..5f3e3a3a38d6 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2699,7 +2699,7 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery) i = (sb->s_blocksize_bits + 2) * sizeof(unsigned int); sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL); if (sbi->s_mb_maxs == NULL) { - kfree(sbi->s_mb_maxs); + kfree(sbi->s_mb_offsets); return -ENOMEM; } From e7c9e3e99adf6c49c5d593a51375916acc039d1e Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 27 Mar 2009 19:43:21 -0400 Subject: [PATCH 196/478] ext4: fix locking typo in mballoc which could cause soft lockup hangs Smatch (http://repo.or.cz/w/smatch.git/) complains about the locking in ext4_mb_add_n_trim() from fs/ext4/mballoc.c 4438 list_for_each_entry_rcu(tmp_pa, &lg->lg_prealloc_list[order], 4439 pa_inode_list) { 4440 spin_lock(&tmp_pa->pa_lock); 4441 if (tmp_pa->pa_deleted) { 4442 spin_unlock(&pa->pa_lock); 4443 continue; 4444 } Brown paper bag time... Reported-by: Dan Carpenter Reviewed-by: Eric Sandeen Reviewed-by: Aneesh Kumar K.V Signed-off-by: "Theodore Ts'o" Cc: stable@kernel.org --- fs/ext4/mballoc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 5f3e3a3a38d6..f871677a7984 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -4392,7 +4392,7 @@ static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac) pa_inode_list) { spin_lock(&tmp_pa->pa_lock); if (tmp_pa->pa_deleted) { - spin_unlock(&pa->pa_lock); + spin_unlock(&tmp_pa->pa_lock); continue; } if (!added && pa->pa_free < tmp_pa->pa_free) { From 06705bff9114531a997a7d0c2520bea0f2927410 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sat, 28 Mar 2009 10:59:57 -0400 Subject: [PATCH 197/478] ext4: Regularize mount options Add support for using the mount options "barrier" and "nobarrier", and "auto_da_alloc" and "noauto_da_alloc", which is more consistent than "barrier=<0|1>" or "auto_da_alloc=<0|1>". Most other ext3/ext4 mount options use the foo/nofoo naming convention. We allow the old forms of these mount options for backwards compatibility. Signed-off-by: "Theodore Ts'o" --- Documentation/filesystems/ext4.txt | 25 +++++++++++++++++++++++-- fs/ext4/super.c | 30 ++++++++++++++++++++++-------- 2 files changed, 45 insertions(+), 10 deletions(-) diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4.txt index 5c484aec2bab..97882df04865 100644 --- a/Documentation/filesystems/ext4.txt +++ b/Documentation/filesystems/ext4.txt @@ -183,8 +183,8 @@ commit=nrsec (*) Ext4 can be told to sync all its data and metadata performance. barrier=<0|1(*)> This enables/disables the use of write barriers in - the jbd code. barrier=0 disables, barrier=1 enables. - This also requires an IO stack which can support +barrier(*) the jbd code. barrier=0 disables, barrier=1 enables. +nobarrier This also requires an IO stack which can support barriers, and if jbd gets an error on a barrier write, it will disable again with a warning. Write barriers enforce proper on-disk ordering @@ -192,6 +192,9 @@ barrier=<0|1(*)> This enables/disables the use of write barriers in safe to use, at some performance penalty. If your disks are battery-backed in one way or another, disabling barriers may safely improve performance. + The mount options "barrier" and "nobarrier" can + also be used to enable or disable barriers, for + consistency with other ext4 mount options. inode_readahead=n This tuning parameter controls the maximum number of inode table blocks that ext4's inode @@ -313,6 +316,24 @@ journal_ioprio=prio The I/O priority (from 0 to 7, where 0 is the a slightly higher priority than the default I/O priority. +auto_da_alloc(*) Many broken applications don't use fsync() when +noauto_da_alloc replacing existing files via patterns such as + fd = open("foo.new")/write(fd,..)/close(fd)/ + rename("foo.new", "foo"), or worse yet, + fd = open("foo", O_TRUNC)/write(fd,..)/close(fd). + If auto_da_alloc is enabled, ext4 will detect + the replace-via-rename and replace-via-truncate + patterns and force that any delayed allocation + blocks are allocated such that at the next + journal commit, in the default data=ordered + mode, the data blocks of the new file are forced + to disk before the rename() operation is + commited. This provides roughly the same level + of guarantees as ext3, and avoids the + "zero-length" problem that can happen when a + system crashes before the delayed allocation + blocks are forced to disk. + Data Mode ========= There are 3 different data modes: diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 45d07cf9df34..9987bba99db3 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -867,7 +867,7 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs) seq_puts(seq, ",data_err=abort"); if (test_opt(sb, NO_AUTO_DA_ALLOC)) - seq_puts(seq, ",auto_da_alloc=0"); + seq_puts(seq, ",noauto_da_alloc"); ext4_show_quota_options(seq, sb); return 0; @@ -1018,7 +1018,7 @@ enum { Opt_resgid, Opt_resuid, Opt_sb, Opt_err_cont, Opt_err_panic, Opt_err_ro, Opt_nouid32, Opt_debug, Opt_oldalloc, Opt_orlov, Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl, - Opt_auto_da_alloc, Opt_noload, Opt_nobh, Opt_bh, + Opt_auto_da_alloc, Opt_noauto_da_alloc, Opt_noload, Opt_nobh, Opt_bh, Opt_commit, Opt_min_batch_time, Opt_max_batch_time, Opt_journal_update, Opt_journal_dev, Opt_journal_checksum, Opt_journal_async_commit, @@ -1026,8 +1026,8 @@ enum { Opt_data_err_abort, Opt_data_err_ignore, Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota, - Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota, - Opt_grpquota, Opt_i_version, + Opt_ignore, Opt_barrier, Opt_nobarrier, Opt_err, Opt_resize, + Opt_usrquota, Opt_grpquota, Opt_i_version, Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_inode_readahead_blks, Opt_journal_ioprio }; @@ -1080,6 +1080,8 @@ static const match_table_t tokens = { {Opt_quota, "quota"}, {Opt_usrquota, "usrquota"}, {Opt_barrier, "barrier=%u"}, + {Opt_barrier, "barrier"}, + {Opt_nobarrier, "nobarrier"}, {Opt_i_version, "i_version"}, {Opt_stripe, "stripe=%u"}, {Opt_resize, "resize"}, @@ -1088,6 +1090,8 @@ static const match_table_t tokens = { {Opt_inode_readahead_blks, "inode_readahead_blks=%u"}, {Opt_journal_ioprio, "journal_ioprio=%u"}, {Opt_auto_da_alloc, "auto_da_alloc=%u"}, + {Opt_auto_da_alloc, "auto_da_alloc"}, + {Opt_noauto_da_alloc, "noauto_da_alloc"}, {Opt_err, NULL}, }; @@ -1422,9 +1426,14 @@ set_qf_format: case Opt_abort: set_opt(sbi->s_mount_opt, ABORT); break; + case Opt_nobarrier: + clear_opt(sbi->s_mount_opt, BARRIER); + break; case Opt_barrier: - if (match_int(&args[0], &option)) - return 0; + if (match_int(&args[0], &option)) { + set_opt(sbi->s_mount_opt, BARRIER); + break; + } if (option) set_opt(sbi->s_mount_opt, BARRIER); else @@ -1485,9 +1494,14 @@ set_qf_format: *journal_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, option); break; + case Opt_noauto_da_alloc: + set_opt(sbi->s_mount_opt,NO_AUTO_DA_ALLOC); + break; case Opt_auto_da_alloc: - if (match_int(&args[0], &option)) - return 0; + if (match_int(&args[0], &option)) { + clear_opt(sbi->s_mount_opt, NO_AUTO_DA_ALLOC); + break; + } if (option) clear_opt(sbi->s_mount_opt, NO_AUTO_DA_ALLOC); else From 01522df346f846906eaf6ca57148641476209909 Mon Sep 17 00:00:00 2001 From: "Michael K. Johnson" Date: Fri, 27 Mar 2009 13:14:41 -0400 Subject: [PATCH 198/478] x86, setup: mark %esi as clobbered in E820 BIOS call Jordan Hargrave diagnosed a BIOS clobbering %esi in the E820 call. That particular BIOS has been fixed, but there is a possibility that this is responsible for other occasional reports of early boot failure, and it does not hurt to add %esi to the clobbers. -stable candidate patch. Cc: Justin Forbes Signed-off-by: Michael K Johnson Signed-off-by: H. Peter Anvin Cc: stable@kernel.org --- arch/x86/boot/memory.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/arch/x86/boot/memory.c b/arch/x86/boot/memory.c index 8c3c25f35578..a99dbbe77a0c 100644 --- a/arch/x86/boot/memory.c +++ b/arch/x86/boot/memory.c @@ -27,13 +27,14 @@ static int detect_memory_e820(void) do { size = sizeof(struct e820entry); - /* Important: %edx is clobbered by some BIOSes, - so it must be either used for the error output + /* Important: %edx and %esi are clobbered by some BIOSes, + so they must be either used for the error output or explicitly marked clobbered. */ asm("int $0x15; setc %0" : "=d" (err), "+b" (next), "=a" (id), "+c" (size), "=m" (*desc) - : "D" (desc), "d" (SMAP), "a" (0xe820)); + : "D" (desc), "d" (SMAP), "a" (0xe820) + : "esi"); /* BIOSes which terminate the chain with CF = 1 as opposed to %ebx = 0 don't always report the SMAP signature on From 776bd5c7a207de546918f805090bfc823d2660c8 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 18 Mar 2009 20:45:28 -0400 Subject: [PATCH 199/478] SUNRPC: Don't flag empty RPCB_GETADDR reply as bogus In 2007, commit e65fe3976f594603ed7b1b4a99d3e9b867f573ea added additional sanity checking to rpcb_decode_getaddr() to make sure we were getting a reply that was long enough to be an actual universal address. If the uaddr string isn't long enough, the XDR decoder returns EIO. However, an empty string is a valid RPCB_GETADDR response if the requested service isn't registered. Moreover, "::.n.m" is also a valid RPCB_GETADDR response for IPv6 addresses that is shorter than rpcb_decode_getaddr()'s lower limit of 11. So this sanity check introduced a regression for rpcbind requests against IPv6 remotes. So revert the lower bound check added by commit e65fe3976f594603ed7b1b4a99d3e9b867f573ea, and add an explicit check for an empty uaddr string, similar to libtirpc's rpcb_getaddr(3). Pointed-out-by: Jeff Layton Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- net/sunrpc/rpcb_clnt.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/net/sunrpc/rpcb_clnt.c b/net/sunrpc/rpcb_clnt.c index 03ae007641e4..2caa7edeeaba 100644 --- a/net/sunrpc/rpcb_clnt.c +++ b/net/sunrpc/rpcb_clnt.c @@ -703,11 +703,16 @@ static int rpcb_decode_getaddr(struct rpc_rqst *req, __be32 *p, *portp = 0; addr_len = ntohl(*p++); + if (addr_len == 0) { + dprintk("RPC: rpcb_decode_getaddr: " + "service is not registered\n"); + return 0; + } + /* - * Simple sanity check. The smallest possible universal - * address is an IPv4 address string containing 11 bytes. + * Simple sanity check. */ - if (addr_len < 11 || addr_len > RPCBIND_MAXUADDRLEN) + if (addr_len > RPCBIND_MAXUADDRLEN) goto out_err; /* From efb3288b423d7e3533a68dccecaa05a56a281a4e Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 18 Mar 2009 20:45:43 -0400 Subject: [PATCH 200/478] SUNRPC: Clean up static inline functions in svc_xprt.h Clean up: Enable the use of const arguments in higher level svc_ APIs by adding const to the arguments of the helper functions in svc_xprt.h Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- include/linux/sunrpc/svc_xprt.h | 46 +++++++++++++++++++-------------- 1 file changed, 26 insertions(+), 20 deletions(-) diff --git a/include/linux/sunrpc/svc_xprt.h b/include/linux/sunrpc/svc_xprt.h index 0127daca4354..959b931b6053 100644 --- a/include/linux/sunrpc/svc_xprt.h +++ b/include/linux/sunrpc/svc_xprt.h @@ -88,29 +88,32 @@ static inline void svc_xprt_get(struct svc_xprt *xprt) kref_get(&xprt->xpt_ref); } static inline void svc_xprt_set_local(struct svc_xprt *xprt, - struct sockaddr *sa, int salen) + const struct sockaddr *sa, + const size_t salen) { memcpy(&xprt->xpt_local, sa, salen); xprt->xpt_locallen = salen; } static inline void svc_xprt_set_remote(struct svc_xprt *xprt, - struct sockaddr *sa, int salen) + const struct sockaddr *sa, + const size_t salen) { memcpy(&xprt->xpt_remote, sa, salen); xprt->xpt_remotelen = salen; } -static inline unsigned short svc_addr_port(struct sockaddr *sa) +static inline unsigned short svc_addr_port(const struct sockaddr *sa) { - unsigned short ret = 0; + const struct sockaddr_in *sin = (const struct sockaddr_in *)sa; + const struct sockaddr_in6 *sin6 = (const struct sockaddr_in6 *)sa; + switch (sa->sa_family) { case AF_INET: - ret = ntohs(((struct sockaddr_in *)sa)->sin_port); - break; + return ntohs(sin->sin_port); case AF_INET6: - ret = ntohs(((struct sockaddr_in6 *)sa)->sin6_port); - break; + return ntohs(sin6->sin6_port); } - return ret; + + return 0; } static inline size_t svc_addr_len(struct sockaddr *sa) @@ -124,36 +127,39 @@ static inline size_t svc_addr_len(struct sockaddr *sa) return -EAFNOSUPPORT; } -static inline unsigned short svc_xprt_local_port(struct svc_xprt *xprt) +static inline unsigned short svc_xprt_local_port(const struct svc_xprt *xprt) { - return svc_addr_port((struct sockaddr *)&xprt->xpt_local); + return svc_addr_port((const struct sockaddr *)&xprt->xpt_local); } -static inline unsigned short svc_xprt_remote_port(struct svc_xprt *xprt) +static inline unsigned short svc_xprt_remote_port(const struct svc_xprt *xprt) { - return svc_addr_port((struct sockaddr *)&xprt->xpt_remote); + return svc_addr_port((const struct sockaddr *)&xprt->xpt_remote); } -static inline char *__svc_print_addr(struct sockaddr *addr, - char *buf, size_t len) +static inline char *__svc_print_addr(const struct sockaddr *addr, + char *buf, const size_t len) { + const struct sockaddr_in *sin = (const struct sockaddr_in *)addr; + const struct sockaddr_in6 *sin6 = (const struct sockaddr_in6 *)addr; + switch (addr->sa_family) { case AF_INET: - snprintf(buf, len, "%pI4, port=%u", - &((struct sockaddr_in *)addr)->sin_addr, - ntohs(((struct sockaddr_in *) addr)->sin_port)); + snprintf(buf, len, "%pI4, port=%u", &sin->sin_addr, + ntohs(sin->sin_port)); break; case AF_INET6: snprintf(buf, len, "%pI6, port=%u", - &((struct sockaddr_in6 *)addr)->sin6_addr, - ntohs(((struct sockaddr_in6 *) addr)->sin6_port)); + &sin6->sin6_addr, + ntohs(sin6->sin6_port)); break; default: snprintf(buf, len, "unknown address type: %d", addr->sa_family); break; } + return buf; } #endif /* SUNRPC_SVC_XPRT_H */ From adbbe929569e6eec8ff9feca23f1f2b40b42853d Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 18 Mar 2009 20:45:51 -0400 Subject: [PATCH 201/478] NFSD: If port value written to /proc/fs/nfsd/portlist is invalid, return EINVAL Make sure port value read from user space by write_ports is valid before passing it to svc_find_xprt(). If it wasn't, the writer would get ENOENT instead of EINVAL. Noticed-by: J. Bruce Fields Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/nfsd/nfsctl.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 3d93b2064ce5..5a936c14f6ff 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -938,6 +938,8 @@ static ssize_t __write_ports(struct file *file, char *buf, size_t size) char transport[16]; int port; if (sscanf(buf, "%15s %4d", transport, &port) == 2) { + if (port < 1 || port > 65535) + return -EINVAL; err = nfsd_create_serv(); if (!err) { err = svc_create_xprt(nfsd_serv, @@ -960,7 +962,7 @@ static ssize_t __write_ports(struct file *file, char *buf, size_t size) char transport[16]; int port; if (sscanf(&buf[1], "%15s %4d", transport, &port) == 2) { - if (port == 0) + if (port < 1 || port > 65535) return -EINVAL; if (nfsd_serv) { xprt = svc_find_xprt(nfsd_serv, transport, From 156e62094a74cf43f02f56ef96b6cda567501357 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 18 Mar 2009 20:45:58 -0400 Subject: [PATCH 202/478] SUNRPC: Clean up svc_find_xprt() calling sequence Clean up: add documentating comment and use appropriate data types for svc_find_xprt()'s arguments. This also eliminates a mixed sign comparison: @port was an int, while the return value of svc_xprt_local_port() is an unsigned short. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- include/linux/sunrpc/svc_xprt.h | 3 ++- net/sunrpc/svc_xprt.c | 16 +++++++++++----- 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/include/linux/sunrpc/svc_xprt.h b/include/linux/sunrpc/svc_xprt.h index 959b931b6053..55b68582c5d9 100644 --- a/include/linux/sunrpc/svc_xprt.h +++ b/include/linux/sunrpc/svc_xprt.h @@ -80,7 +80,8 @@ void svc_close_xprt(struct svc_xprt *xprt); void svc_delete_xprt(struct svc_xprt *xprt); int svc_port_is_privileged(struct sockaddr *sin); int svc_print_xprts(char *buf, int maxlen); -struct svc_xprt *svc_find_xprt(struct svc_serv *, char *, int, int); +struct svc_xprt *svc_find_xprt(struct svc_serv *serv, const char *xcl_name, + const sa_family_t af, const unsigned short port); int svc_xprt_names(struct svc_serv *serv, char *buf, int buflen); static inline void svc_xprt_get(struct svc_xprt *xprt) diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index e588df5d6b34..c947c93dbc24 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -1033,7 +1033,13 @@ static struct svc_deferred_req *svc_deferred_dequeue(struct svc_xprt *xprt) return dr; } -/* +/** + * svc_find_xprt - find an RPC transport instance + * @serv: pointer to svc_serv to search + * @xcl_name: C string containing transport's class name + * @af: Address family of transport's local address + * @port: transport's IP port number + * * Return the transport instance pointer for the endpoint accepting * connections/peer traffic from the specified transport class, * address family and port. @@ -1042,14 +1048,14 @@ static struct svc_deferred_req *svc_deferred_dequeue(struct svc_xprt *xprt) * wild-card, and will result in matching the first transport in the * service's list that has a matching class name. */ -struct svc_xprt *svc_find_xprt(struct svc_serv *serv, char *xcl_name, - int af, int port) +struct svc_xprt *svc_find_xprt(struct svc_serv *serv, const char *xcl_name, + const sa_family_t af, const unsigned short port) { struct svc_xprt *xprt; struct svc_xprt *found = NULL; /* Sanity check the args */ - if (!serv || !xcl_name) + if (serv == NULL || xcl_name == NULL) return found; spin_lock_bh(&serv->sv_lock); @@ -1058,7 +1064,7 @@ struct svc_xprt *svc_find_xprt(struct svc_serv *serv, char *xcl_name, continue; if (af != AF_UNSPEC && af != xprt->xpt_local.ss_family) continue; - if (port && port != svc_xprt_local_port(xprt)) + if (port != 0 && port != svc_xprt_local_port(xprt)) continue; found = xprt; svc_xprt_get(xprt); From 4b62e58cccff9c5e7ffc7023f7ec24c75fbd549b Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 18 Mar 2009 20:46:06 -0400 Subject: [PATCH 203/478] SUNRPC: Pass a family argument to svc_register() The sv_family field is going away. Instead of using sv_family, have the svc_register() function take a protocol family argument. Since this argument represents a protocol family, and not an address family, this argument takes an int, as this is what is passed to sock_create_kern(). Also make sure svc_register's helpers are checking for PF_FOO instead of AF_FOO. The value of [AP]F_FOO are equivalent; this is simply a symbolic change to reflect the semantics of the value stored in that variable. sock_create_kern() should return EPFNOSUPPORT if the passed-in protocol family isn't supported, but it uses EAFNOSUPPORT for this case. We will stick with that tradition here, as svc_register() is called by the RPC server in the same path as sock_create_kern(). Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- include/linux/sunrpc/svc.h | 4 ++-- net/sunrpc/svc.c | 21 +++++++++++---------- net/sunrpc/svcsock.c | 2 +- 3 files changed, 14 insertions(+), 13 deletions(-) diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 3435d24bfe55..1f18fc728cba 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -396,8 +396,8 @@ struct svc_serv * svc_create_pooled(struct svc_program *, unsigned int, int svc_set_num_threads(struct svc_serv *, struct svc_pool *, int); void svc_destroy(struct svc_serv *); int svc_process(struct svc_rqst *); -int svc_register(const struct svc_serv *, const unsigned short, - const unsigned short); +int svc_register(const struct svc_serv *, const int, + const unsigned short, const unsigned short); void svc_wake_up(struct svc_serv *); void svc_reserve(struct svc_rqst *rqstp, int space); diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index c51fed4d1af1..41bc36ea2224 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -800,17 +800,17 @@ static int __svc_rpcb_register6(const u32 program, const u32 version, * if any error occurs. */ static int __svc_register(const u32 program, const u32 version, - const sa_family_t family, + const int family, const unsigned short protocol, const unsigned short port) { int error; switch (family) { - case AF_INET: + case PF_INET: return __svc_rpcb_register4(program, version, protocol, port); - case AF_INET6: + case PF_INET6: error = __svc_rpcb_register6(program, version, protocol, port); if (error < 0) @@ -840,11 +840,11 @@ static int __svc_register(const u32 program, const u32 version, * if any error occurs. */ static int __svc_register(const u32 program, const u32 version, - sa_family_t family, + const int family, const unsigned short protocol, const unsigned short port) { - if (family != AF_INET) + if (family != PF_INET) return -EAFNOSUPPORT; return rpcb_register(program, version, protocol, port); @@ -855,13 +855,14 @@ static int __svc_register(const u32 program, const u32 version, /** * svc_register - register an RPC service with the local portmapper * @serv: svc_serv struct for the service to register + * @family: protocol family of service's listener socket * @proto: transport protocol number to advertise * @port: port to advertise * - * Service is registered for any address in serv's address family + * Service is registered for any address in the passed-in protocol family */ -int svc_register(const struct svc_serv *serv, const unsigned short proto, - const unsigned short port) +int svc_register(const struct svc_serv *serv, const int family, + const unsigned short proto, const unsigned short port) { struct svc_program *progp; unsigned int i; @@ -879,7 +880,7 @@ int svc_register(const struct svc_serv *serv, const unsigned short proto, i, proto == IPPROTO_UDP? "udp" : "tcp", port, - serv->sv_family, + family, progp->pg_vers[i]->vs_hidden? " (but not telling portmap)" : ""); @@ -887,7 +888,7 @@ int svc_register(const struct svc_serv *serv, const unsigned short proto, continue; error = __svc_register(progp->pg_prog, i, - serv->sv_family, proto, port); + family, proto, port); if (error < 0) break; } diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 5763e6460fea..d00583c1cd04 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -1122,7 +1122,7 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv, /* Register socket with portmapper */ if (*errp >= 0 && pmap_register) - *errp = svc_register(serv, inet->sk_protocol, + *errp = svc_register(serv, serv->sv_family, inet->sk_protocol, ntohs(inet_sk(inet)->sport)); if (*errp < 0) { From baf01caf09e87579c2d157e5ee29975db8551522 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 18 Mar 2009 20:46:13 -0400 Subject: [PATCH 204/478] SUNRPC: svc_setup_socket() gets protocol family from socket Since the sv_family field is going away, modify svc_setup_socket() to extract the protocol family from the passed-in socket instead of from the passed-in svc_serv struct. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- net/sunrpc/svcsock.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index d00583c1cd04..d00bc3307745 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -1122,7 +1122,7 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv, /* Register socket with portmapper */ if (*errp >= 0 && pmap_register) - *errp = svc_register(serv, serv->sv_family, inet->sk_protocol, + *errp = svc_register(serv, inet->sk_family, inet->sk_protocol, ntohs(inet_sk(inet)->sport)); if (*errp < 0) { @@ -1145,13 +1145,13 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv, /* * We start one listener per sv_serv. We want AF_INET - * requests to be automatically shunted to our AF_INET6 + * requests to be automatically shunted to our PF_INET6 * listener using a mapped IPv4 address. Make sure * no-one starts an equivalent IPv4 listener, which * would steal our incoming connections. */ val = 0; - if (serv->sv_family == AF_INET6) + if (inet->sk_family == PF_INET6) kernel_setsockopt(sock, SOL_IPV6, IPV6_V6ONLY, (char *)&val, sizeof(val)); From 9652ada3fb5914a67d8422114e8a76388330fa79 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 18 Mar 2009 20:46:21 -0400 Subject: [PATCH 205/478] SUNRPC: Change svc_create_xprt() to take a @family argument The sv_family field is going away. Pass a protocol family argument to svc_create_xprt() instead of extracting the family from the passed-in svc_serv struct. Again, as this is a listener socket and not an address, we make this new argument an "int" protocol family, instead of an "sa_family_t." Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/lockd/svc.c | 3 ++- fs/nfs/callback.c | 4 ++-- fs/nfsd/nfsctl.c | 2 +- fs/nfsd/nfssvc.c | 4 ++-- include/linux/sunrpc/svc_xprt.h | 3 ++- net/sunrpc/svc_xprt.c | 15 +++++++++------ 6 files changed, 18 insertions(+), 13 deletions(-) diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index 64f1c31b5853..390c5593655c 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -211,7 +211,8 @@ static int create_lockd_listener(struct svc_serv *serv, char *name, xprt = svc_find_xprt(serv, name, 0, 0); if (xprt == NULL) - return svc_create_xprt(serv, name, port, SVC_SOCK_DEFAULTS); + return svc_create_xprt(serv, name, nlmsvc_family, + port, SVC_SOCK_DEFAULTS); svc_xprt_put(xprt); return 0; diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index 3e634f2a1083..fb35cab63c8a 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -122,8 +122,8 @@ int nfs_callback_up(void) if (!serv) goto out_err; - ret = svc_create_xprt(serv, "tcp", nfs_callback_set_tcpport, - SVC_SOCK_ANONYMOUS); + ret = svc_create_xprt(serv, "tcp", nfs_callback_family, + nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS); if (ret <= 0) goto out_err; nfs_callback_tcpport = ret; diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 5a936c14f6ff..a4ed8644d69c 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -943,7 +943,7 @@ static ssize_t __write_ports(struct file *file, char *buf, size_t size) err = nfsd_create_serv(); if (!err) { err = svc_create_xprt(nfsd_serv, - transport, port, + transport, PF_INET, port, SVC_SOCK_ANONYMOUS); if (err == -ENOENT) /* Give a reasonable perror msg for diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 07e4f5d7baa8..ab7f249055b5 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -244,7 +244,7 @@ static int nfsd_init_socks(int port) if (!list_empty(&nfsd_serv->sv_permsocks)) return 0; - error = svc_create_xprt(nfsd_serv, "udp", port, + error = svc_create_xprt(nfsd_serv, "udp", PF_INET, port, SVC_SOCK_DEFAULTS); if (error < 0) return error; @@ -253,7 +253,7 @@ static int nfsd_init_socks(int port) if (error < 0) return error; - error = svc_create_xprt(nfsd_serv, "tcp", port, + error = svc_create_xprt(nfsd_serv, "tcp", PF_INET, port, SVC_SOCK_DEFAULTS); if (error < 0) return error; diff --git a/include/linux/sunrpc/svc_xprt.h b/include/linux/sunrpc/svc_xprt.h index 55b68582c5d9..0d9cb6ef28b0 100644 --- a/include/linux/sunrpc/svc_xprt.h +++ b/include/linux/sunrpc/svc_xprt.h @@ -71,7 +71,8 @@ int svc_reg_xprt_class(struct svc_xprt_class *); void svc_unreg_xprt_class(struct svc_xprt_class *); void svc_xprt_init(struct svc_xprt_class *, struct svc_xprt *, struct svc_serv *); -int svc_create_xprt(struct svc_serv *, char *, unsigned short, int); +int svc_create_xprt(struct svc_serv *, const char *, const int, + const unsigned short, int); void svc_xprt_enqueue(struct svc_xprt *xprt); void svc_xprt_received(struct svc_xprt *); void svc_xprt_put(struct svc_xprt *xprt); diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index c947c93dbc24..2819ee093f36 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -161,7 +161,9 @@ EXPORT_SYMBOL_GPL(svc_xprt_init); static struct svc_xprt *__svc_xpo_create(struct svc_xprt_class *xcl, struct svc_serv *serv, - unsigned short port, int flags) + const int family, + const unsigned short port, + int flags) { struct sockaddr_in sin = { .sin_family = AF_INET, @@ -176,12 +178,12 @@ static struct svc_xprt *__svc_xpo_create(struct svc_xprt_class *xcl, struct sockaddr *sap; size_t len; - switch (serv->sv_family) { - case AF_INET: + switch (family) { + case PF_INET: sap = (struct sockaddr *)&sin; len = sizeof(sin); break; - case AF_INET6: + case PF_INET6: sap = (struct sockaddr *)&sin6; len = sizeof(sin6); break; @@ -192,7 +194,8 @@ static struct svc_xprt *__svc_xpo_create(struct svc_xprt_class *xcl, return xcl->xcl_ops->xpo_create(serv, sap, len, flags); } -int svc_create_xprt(struct svc_serv *serv, char *xprt_name, unsigned short port, +int svc_create_xprt(struct svc_serv *serv, const char *xprt_name, + const int family, const unsigned short port, int flags) { struct svc_xprt_class *xcl; @@ -209,7 +212,7 @@ int svc_create_xprt(struct svc_serv *serv, char *xprt_name, unsigned short port, goto err; spin_unlock(&svc_xprt_class_lock); - newxprt = __svc_xpo_create(xcl, serv, port, flags); + newxprt = __svc_xpo_create(xcl, serv, family, port, flags); if (IS_ERR(newxprt)) { module_put(xcl->xcl_owner); return PTR_ERR(newxprt); From 49a9072f29a1039f142ec98b44a72d7173651c02 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 18 Mar 2009 20:46:29 -0400 Subject: [PATCH 206/478] SUNRPC: Remove @family argument from svc_create() and svc_create_pooled() Since an RPC service listener's protocol family is specified now via svc_create_xprt(), it no longer needs to be passed to svc_create() or svc_create_pooled(). Remove that argument from the synopsis of those functions, and remove the sv_family field from the svc_serv struct. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/lockd/svc.c | 2 +- fs/nfs/callback.c | 3 +-- fs/nfsd/nfssvc.c | 1 - include/linux/sunrpc/svc.h | 5 ++--- net/sunrpc/svc.c | 11 +++++------ 5 files changed, 9 insertions(+), 13 deletions(-) diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index 390c5593655c..d30920038cb6 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -275,7 +275,7 @@ int lockd_up(void) "lockd_up: no pid, %d users??\n", nlmsvc_users); error = -ENOMEM; - serv = svc_create(&nlmsvc_program, LOCKD_BUFSIZE, nlmsvc_family, NULL); + serv = svc_create(&nlmsvc_program, LOCKD_BUFSIZE, NULL); if (!serv) { printk(KERN_WARNING "lockd_up: create service failed\n"); goto out; diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index fb35cab63c8a..ddf4b4ae6967 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -116,8 +116,7 @@ int nfs_callback_up(void) mutex_lock(&nfs_callback_mutex); if (nfs_callback_info.users++ || nfs_callback_info.task != NULL) goto out; - serv = svc_create(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE, - nfs_callback_family, NULL); + serv = svc_create(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE, NULL); ret = -ENOMEM; if (!serv) goto out_err; diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index ab7f249055b5..bc3567bab8c4 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -229,7 +229,6 @@ int nfsd_create_serv(void) atomic_set(&nfsd_busy, 0); nfsd_serv = svc_create_pooled(&nfsd_program, nfsd_max_blksize, - AF_INET, nfsd_last_thread, nfsd, THIS_MODULE); if (nfsd_serv == NULL) err = -ENOMEM; diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 1f18fc728cba..d3a4c0231933 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -69,7 +69,6 @@ struct svc_serv { struct list_head sv_tempsocks; /* all temporary sockets */ int sv_tmpcnt; /* count of temporary sockets */ struct timer_list sv_temptimer; /* timer for aging temporary sockets */ - sa_family_t sv_family; /* listener's address family */ char * sv_name; /* service name */ @@ -385,13 +384,13 @@ struct svc_procedure { /* * Function prototypes. */ -struct svc_serv *svc_create(struct svc_program *, unsigned int, sa_family_t, +struct svc_serv *svc_create(struct svc_program *, unsigned int, void (*shutdown)(struct svc_serv *)); struct svc_rqst *svc_prepare_thread(struct svc_serv *serv, struct svc_pool *pool); void svc_exit_thread(struct svc_rqst *); struct svc_serv * svc_create_pooled(struct svc_program *, unsigned int, - sa_family_t, void (*shutdown)(struct svc_serv *), + void (*shutdown)(struct svc_serv *), svc_thread_fn, struct module *); int svc_set_num_threads(struct svc_serv *, struct svc_pool *, int); void svc_destroy(struct svc_serv *); diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index 41bc36ea2224..d72ff44826d8 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -359,7 +359,7 @@ svc_pool_for_cpu(struct svc_serv *serv, int cpu) */ static struct svc_serv * __svc_create(struct svc_program *prog, unsigned int bufsize, int npools, - sa_family_t family, void (*shutdown)(struct svc_serv *serv)) + void (*shutdown)(struct svc_serv *serv)) { struct svc_serv *serv; unsigned int vers; @@ -368,7 +368,6 @@ __svc_create(struct svc_program *prog, unsigned int bufsize, int npools, if (!(serv = kzalloc(sizeof(*serv), GFP_KERNEL))) return NULL; - serv->sv_family = family; serv->sv_name = prog->pg_name; serv->sv_program = prog; serv->sv_nrthreads = 1; @@ -427,21 +426,21 @@ __svc_create(struct svc_program *prog, unsigned int bufsize, int npools, struct svc_serv * svc_create(struct svc_program *prog, unsigned int bufsize, - sa_family_t family, void (*shutdown)(struct svc_serv *serv)) + void (*shutdown)(struct svc_serv *serv)) { - return __svc_create(prog, bufsize, /*npools*/1, family, shutdown); + return __svc_create(prog, bufsize, /*npools*/1, shutdown); } EXPORT_SYMBOL_GPL(svc_create); struct svc_serv * svc_create_pooled(struct svc_program *prog, unsigned int bufsize, - sa_family_t family, void (*shutdown)(struct svc_serv *serv), + void (*shutdown)(struct svc_serv *serv), svc_thread_fn func, struct module *mod) { struct svc_serv *serv; unsigned int npools = svc_pool_map_get(); - serv = __svc_create(prog, bufsize, npools, family, shutdown); + serv = __svc_create(prog, bufsize, npools, shutdown); if (serv != NULL) { serv->sv_function = func; From 26298caacac3e4754194b13aef377706d5de6cf6 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 18 Mar 2009 20:46:36 -0400 Subject: [PATCH 207/478] NFS: Revert creation of IPv6 listeners for lockd and NFSv4 callbacks We're about to convert over to using separate PF_INET and PF_INET6 listeners, instead of a single PF_INET6 listener that also receives AF_INET requests and maps them to AF_INET6. Clear the way by removing the logic in lockd and the NFSv4 callback server that creates an AF_INET6 service listener. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/lockd/svc.c | 13 +------------ fs/nfs/callback.c | 14 ++------------ 2 files changed, 3 insertions(+), 24 deletions(-) diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index d30920038cb6..566932b98fd3 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -52,17 +52,6 @@ static struct task_struct *nlmsvc_task; static struct svc_rqst *nlmsvc_rqst; unsigned long nlmsvc_timeout; -/* - * If the kernel has IPv6 support available, always listen for - * both AF_INET and AF_INET6 requests. - */ -#if (defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)) && \ - defined(CONFIG_SUNRPC_REGISTER_V4) -static const sa_family_t nlmsvc_family = AF_INET6; -#else /* (CONFIG_IPV6 || CONFIG_IPV6_MODULE) && CONFIG_SUNRPC_REGISTER_V4 */ -static const sa_family_t nlmsvc_family = AF_INET; -#endif /* (CONFIG_IPV6 || CONFIG_IPV6_MODULE) && CONFIG_SUNRPC_REGISTER_V4 */ - /* * These can be set at insmod time (useful for NFS as root filesystem), * and also changed through the sysctl interface. -- Jamie Lokier, Aug 2003 @@ -211,7 +200,7 @@ static int create_lockd_listener(struct svc_serv *serv, char *name, xprt = svc_find_xprt(serv, name, 0, 0); if (xprt == NULL) - return svc_create_xprt(serv, name, nlmsvc_family, + return svc_create_xprt(serv, name, PF_INET, port, SVC_SOCK_DEFAULTS); svc_xprt_put(xprt); diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index ddf4b4ae6967..0ef47dff89be 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -41,16 +41,6 @@ unsigned short nfs_callback_tcpport; static const int nfs_set_port_min = 0; static const int nfs_set_port_max = 65535; -/* - * If the kernel has IPv6 support available, always listen for - * both AF_INET and AF_INET6 requests. - */ -#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) -static const sa_family_t nfs_callback_family = AF_INET6; -#else -static const sa_family_t nfs_callback_family = AF_INET; -#endif - static int param_set_port(const char *val, struct kernel_param *kp) { char *endp; @@ -121,13 +111,13 @@ int nfs_callback_up(void) if (!serv) goto out_err; - ret = svc_create_xprt(serv, "tcp", nfs_callback_family, + ret = svc_create_xprt(serv, "tcp", PF_INET, nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS); if (ret <= 0) goto out_err; nfs_callback_tcpport = ret; dprintk("NFS: Callback listener port = %u (af %u)\n", - nfs_callback_tcpport, nfs_callback_family); + nfs_callback_tcpport, PF_INET); nfs_callback_info.rqst = svc_prepare_thread(serv, &serv->sv_pools[0]); if (IS_ERR(nfs_callback_info.rqst)) { From 7d21c0f9845f0ce4e81baac3519fbb2c6c2cc908 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 18 Mar 2009 20:46:44 -0400 Subject: [PATCH 208/478] SUNRPC: Set IPV6ONLY flag on PF_INET6 RPC listener sockets We are about to convert to using separate RPC listener sockets for PF_INET and PF_INET6. This echoes the way IPv6 is handled in user space by TI-RPC, and eliminates the need for ULPs to worry about mapped IPv4 AF_INET6 addresses when doing address comparisons. Start by setting the IPV6ONLY flag on PF_INET6 RPC listener sockets. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- net/sunrpc/svcsock.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index d00bc3307745..ac6cd65220c7 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -1144,13 +1144,11 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv, svc_tcp_init(svsk, serv); /* - * We start one listener per sv_serv. We want AF_INET - * requests to be automatically shunted to our PF_INET6 - * listener using a mapped IPv4 address. Make sure - * no-one starts an equivalent IPv4 listener, which - * would steal our incoming connections. + * If this is a PF_INET6 listener, we want to avoid + * getting requests from IPv4 remotes. Those should + * be shunted to a PF_INET listener via rpcbind. */ - val = 0; + val = 1; if (inet->sk_family == PF_INET6) kernel_setsockopt(sock, SOL_IPV6, IPV6_V6ONLY, (char *)&val, sizeof(val)); From fc28decdc93633a65d54e42498e9e819d466329c Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 18 Mar 2009 20:46:51 -0400 Subject: [PATCH 209/478] SUNRPC: Use IPv4 loopback for registering AF_INET6 kernel RPC services The kernel uses an IPv6 loopback address when registering its AF_INET6 RPC services so that it can tell whether the local portmapper is actually IPv6-enabled. Since the legacy portmapper doesn't listen on IPv6, however, this causes a long timeout on older systems if the kernel happens to try creating and registering an AF_INET6 RPC service. Originally I wanted to use a connected transport (either TCP or connected UDP) so that the upcall would fail immediately if the portmapper wasn't listening on IPv6, but we never agreed on what transport to use. In the end, it's of little consequence to the kernel whether the local portmapper is listening on IPv6. It's only important whether the portmapper supports rpcbind v4. And the kernel can't tell that at all if it is sending requests via IPv6 -- the portmapper will just ignore them. So, send both rpcbind v2 and v4 SET/UNSET requests via IPv4 loopback to maintain better backwards compatibility between new kernels and legacy user space, and prevent multi-second hangs in some cases when the kernel attempts to register RPC services. This patch is part of a series that addresses http://bugzilla.kernel.org/show_bug.cgi?id=12256 Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- net/sunrpc/rpcb_clnt.c | 23 ++++++----------------- 1 file changed, 6 insertions(+), 17 deletions(-) diff --git a/net/sunrpc/rpcb_clnt.c b/net/sunrpc/rpcb_clnt.c index 2caa7edeeaba..ebce7a5976c9 100644 --- a/net/sunrpc/rpcb_clnt.c +++ b/net/sunrpc/rpcb_clnt.c @@ -124,12 +124,6 @@ static const struct sockaddr_in rpcb_inaddr_loopback = { .sin_port = htons(RPCBIND_PORT), }; -static const struct sockaddr_in6 rpcb_in6addr_loopback = { - .sin6_family = AF_INET6, - .sin6_addr = IN6ADDR_LOOPBACK_INIT, - .sin6_port = htons(RPCBIND_PORT), -}; - static struct rpc_clnt *rpcb_create_local(struct sockaddr *addr, size_t addrlen, u32 version) { @@ -176,9 +170,10 @@ static struct rpc_clnt *rpcb_create(char *hostname, struct sockaddr *srvaddr, return rpc_create(&args); } -static int rpcb_register_call(struct sockaddr *addr, size_t addrlen, - u32 version, struct rpc_message *msg) +static int rpcb_register_call(u32 version, struct rpc_message *msg) { + struct sockaddr *addr = (struct sockaddr *)&rpcb_inaddr_loopback; + size_t addrlen = sizeof(rpcb_inaddr_loopback); struct rpc_clnt *rpcb_clnt; int result, error = 0; @@ -254,9 +249,7 @@ int rpcb_register(u32 prog, u32 vers, int prot, unsigned short port) if (port) msg.rpc_proc = &rpcb_procedures2[RPCBPROC_SET]; - return rpcb_register_call((struct sockaddr *)&rpcb_inaddr_loopback, - sizeof(rpcb_inaddr_loopback), - RPCBVERS_2, &msg); + return rpcb_register_call(RPCBVERS_2, &msg); } /* @@ -284,9 +277,7 @@ static int rpcb_register_netid4(struct sockaddr_in *address_to_register, if (port) msg->rpc_proc = &rpcb_procedures4[RPCBPROC_SET]; - return rpcb_register_call((struct sockaddr *)&rpcb_inaddr_loopback, - sizeof(rpcb_inaddr_loopback), - RPCBVERS_4, msg); + return rpcb_register_call(RPCBVERS_4, msg); } /* @@ -318,9 +309,7 @@ static int rpcb_register_netid6(struct sockaddr_in6 *address_to_register, if (port) msg->rpc_proc = &rpcb_procedures4[RPCBPROC_SET]; - return rpcb_register_call((struct sockaddr *)&rpcb_in6addr_loopback, - sizeof(rpcb_in6addr_loopback), - RPCBVERS_4, msg); + return rpcb_register_call(RPCBVERS_4, msg); } /** From ba5c35e0c7e30b095636cd58b0854fdbd3c32947 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 18 Mar 2009 20:46:59 -0400 Subject: [PATCH 210/478] SUNRPC: Don't return EPROTONOSUPPORT in svc_register()'s helpers The RPC client returns -EPROTONOSUPPORT if there is a protocol version mismatch (ie the remote RPC server doesn't support the RPC protocol version sent by the client). Helpers for the svc_register() function return -EPROTONOSUPPORT if they don't recognize the passed-in IPPROTO_ value. These are two entirely different failure modes. Have the helpers return -ENOPROTOOPT instead of -EPROTONOSUPPORT. This will allow callers to determine more precisely what the underlying problem is, and decide to report or recover appropriately. This patch is part of a series that addresses http://bugzilla.kernel.org/show_bug.cgi?id=12256 Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- net/sunrpc/svc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index d72ff44826d8..17e0d7265dfd 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -749,7 +749,7 @@ static int __svc_rpcb_register4(const u32 program, const u32 version, netid = RPCBIND_NETID_TCP; break; default: - return -EPROTONOSUPPORT; + return -ENOPROTOOPT; } return rpcb_v4_register(program, version, @@ -785,7 +785,7 @@ static int __svc_rpcb_register6(const u32 program, const u32 version, netid = RPCBIND_NETID_TCP6; break; default: - return -EPROTONOSUPPORT; + return -ENOPROTOOPT; } return rpcb_v4_register(program, version, From 3aba45536fe8f92aa07bcdfd2fb1cf17eec7d786 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 18 Mar 2009 20:47:06 -0400 Subject: [PATCH 211/478] SUNRPC: Clean up address type casts in rpcb_v4_register() Clean up: Simplify rpcb_v4_register() and its helpers by moving the details of sockaddr type casting to rpcb_v4_register()'s helper functions. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- net/sunrpc/rpcb_clnt.c | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/net/sunrpc/rpcb_clnt.c b/net/sunrpc/rpcb_clnt.c index ebce7a5976c9..44d0732ba874 100644 --- a/net/sunrpc/rpcb_clnt.c +++ b/net/sunrpc/rpcb_clnt.c @@ -170,7 +170,7 @@ static struct rpc_clnt *rpcb_create(char *hostname, struct sockaddr *srvaddr, return rpc_create(&args); } -static int rpcb_register_call(u32 version, struct rpc_message *msg) +static int rpcb_register_call(const u32 version, struct rpc_message *msg) { struct sockaddr *addr = (struct sockaddr *)&rpcb_inaddr_loopback; size_t addrlen = sizeof(rpcb_inaddr_loopback); @@ -255,17 +255,17 @@ int rpcb_register(u32 prog, u32 vers, int prot, unsigned short port) /* * Fill in AF_INET family-specific arguments to register */ -static int rpcb_register_netid4(struct sockaddr_in *address_to_register, +static int rpcb_register_netid4(const struct sockaddr *sap, struct rpc_message *msg) { + const struct sockaddr_in *sin = (const struct sockaddr_in *)sap; struct rpcbind_args *map = msg->rpc_argp; - unsigned short port = ntohs(address_to_register->sin_port); + unsigned short port = ntohs(sin->sin_port); char buf[32]; /* Construct AF_INET universal address */ snprintf(buf, sizeof(buf), "%pI4.%u.%u", - &address_to_register->sin_addr.s_addr, - port >> 8, port & 0xff); + &sin->sin_addr.s_addr, port >> 8, port & 0xff); map->r_addr = buf; dprintk("RPC: %sregistering [%u, %u, %s, '%s'] with " @@ -283,21 +283,21 @@ static int rpcb_register_netid4(struct sockaddr_in *address_to_register, /* * Fill in AF_INET6 family-specific arguments to register */ -static int rpcb_register_netid6(struct sockaddr_in6 *address_to_register, +static int rpcb_register_netid6(const struct sockaddr *sap, struct rpc_message *msg) { + const struct sockaddr_in6 *sin6 = (const struct sockaddr_in6 *)sap; struct rpcbind_args *map = msg->rpc_argp; - unsigned short port = ntohs(address_to_register->sin6_port); + unsigned short port = ntohs(sin6->sin6_port); char buf[64]; /* Construct AF_INET6 universal address */ - if (ipv6_addr_any(&address_to_register->sin6_addr)) + if (ipv6_addr_any(&sin6->sin6_addr)) snprintf(buf, sizeof(buf), "::.%u.%u", port >> 8, port & 0xff); else snprintf(buf, sizeof(buf), "%pI6.%u.%u", - &address_to_register->sin6_addr, - port >> 8, port & 0xff); + &sin6->sin6_addr, port >> 8, port & 0xff); map->r_addr = buf; dprintk("RPC: %sregistering [%u, %u, %s, '%s'] with " @@ -369,11 +369,9 @@ int rpcb_v4_register(const u32 program, const u32 version, switch (address->sa_family) { case AF_INET: - return rpcb_register_netid4((struct sockaddr_in *)address, - &msg); + return rpcb_register_netid4(address, &msg); case AF_INET6: - return rpcb_register_netid6((struct sockaddr_in6 *)address, - &msg); + return rpcb_register_netid6(address, &msg); } return -EAFNOSUPPORT; From 126e4bc3b3b446482696377f67a634c76eaf2e9c Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 18 Mar 2009 20:47:14 -0400 Subject: [PATCH 212/478] SUNRPC: rpcbind actually interprets r_owner string RFC 1833 has little to say about the contents of r_owner; it only specifies that it is a string, and states that it is used to control who can UNSET an entry. Our port of rpcbind (from Sun) assumes this string contains a numeric UID value, not alphabetical or symbolic characters, but checks this value only for AF_LOCAL RPCB_SET or RPCB_UNSET requests. In all other cases, rpcbind ignores the contents of the r_owner string. The reference user space implementation of rpcb_set(3) uses a numeric UID for all SET/UNSET requests (even via the network) and an empty string for all other requests. We emulate that behavior here to maintain bug-for-bug compatibility. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- net/sunrpc/rpcb_clnt.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/net/sunrpc/rpcb_clnt.c b/net/sunrpc/rpcb_clnt.c index 44d0732ba874..d550d0b967db 100644 --- a/net/sunrpc/rpcb_clnt.c +++ b/net/sunrpc/rpcb_clnt.c @@ -63,9 +63,16 @@ enum { * r_owner * * The "owner" is allowed to unset a service in the rpcbind database. - * We always use the following (arbitrary) fixed string. + * + * For AF_LOCAL SET/UNSET requests, rpcbind treats this string as a + * UID which it maps to a local user name via a password lookup. + * In all other cases it is ignored. + * + * For SET/UNSET requests, user space provides a value, even for + * network requests, and GETADDR uses an empty string. We follow + * those precedents here. */ -#define RPCB_OWNER_STRING "rpcb" +#define RPCB_OWNER_STRING "0" #define RPCB_MAXOWNERLEN sizeof(RPCB_OWNER_STRING) static void rpcb_getport_done(struct rpc_task *, void *); @@ -566,7 +573,7 @@ void rpcb_getport_async(struct rpc_task *task) map->r_xprt = xprt_get(xprt); map->r_netid = rpc_peeraddr2str(clnt, RPC_DISPLAY_NETID); map->r_addr = rpc_peeraddr2str(rpcb_clnt, RPC_DISPLAY_UNIVERSAL_ADDR); - map->r_owner = RPCB_OWNER_STRING; /* ignored for GETADDR */ + map->r_owner = ""; map->r_status = -EIO; child = rpcb_call_async(rpcb_clnt, map, proc); From 1673d0de40ab46cac3b456ad50e1c8d6a31bfd66 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 18 Mar 2009 20:47:21 -0400 Subject: [PATCH 213/478] SUNRPC: Allow callers to pass rpcb_v4_register a NULL address The user space TI-RPC library uses an empty string for the universal address when unregistering all target addresses for [program, version]. The kernel's rpcb client should behave the same way. Here, we are switching between several registration methods based on the protocol family of the incoming address. Rename the other rpcbind v4 registration functions to make it clear that they, as well, are switched on protocol family. In /etc/netconfig, this is either "inet" or "inet6". NB: The loopback protocol families are not supported in the kernel. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- net/sunrpc/rpcb_clnt.c | 38 ++++++++++++++++++++++++++++---------- 1 file changed, 28 insertions(+), 10 deletions(-) diff --git a/net/sunrpc/rpcb_clnt.c b/net/sunrpc/rpcb_clnt.c index d550d0b967db..8ea8907d4b8d 100644 --- a/net/sunrpc/rpcb_clnt.c +++ b/net/sunrpc/rpcb_clnt.c @@ -262,8 +262,8 @@ int rpcb_register(u32 prog, u32 vers, int prot, unsigned short port) /* * Fill in AF_INET family-specific arguments to register */ -static int rpcb_register_netid4(const struct sockaddr *sap, - struct rpc_message *msg) +static int rpcb_register_inet4(const struct sockaddr *sap, + struct rpc_message *msg) { const struct sockaddr_in *sin = (const struct sockaddr_in *)sap; struct rpcbind_args *map = msg->rpc_argp; @@ -290,8 +290,8 @@ static int rpcb_register_netid4(const struct sockaddr *sap, /* * Fill in AF_INET6 family-specific arguments to register */ -static int rpcb_register_netid6(const struct sockaddr *sap, - struct rpc_message *msg) +static int rpcb_register_inet6(const struct sockaddr *sap, + struct rpc_message *msg) { const struct sockaddr_in6 *sin6 = (const struct sockaddr_in6 *)sap; struct rpcbind_args *map = msg->rpc_argp; @@ -319,6 +319,20 @@ static int rpcb_register_netid6(const struct sockaddr *sap, return rpcb_register_call(RPCBVERS_4, msg); } +static int rpcb_unregister_all_protofamilies(struct rpc_message *msg) +{ + struct rpcbind_args *map = msg->rpc_argp; + + dprintk("RPC: unregistering [%u, %u, '%s'] with " + "local rpcbind\n", + map->r_prog, map->r_vers, map->r_netid); + + map->r_addr = ""; + msg->rpc_proc = &rpcb_procedures4[RPCBPROC_UNSET]; + + return rpcb_register_call(RPCBVERS_4, msg); +} + /** * rpcb_v4_register - set or unset a port registration with the local rpcbind * @program: RPC program number of service to (un)register @@ -336,10 +350,11 @@ static int rpcb_register_netid6(const struct sockaddr *sap, * invoke this function once for each [program, version, address, * netid] tuple they wish to advertise. * - * Callers may also unregister RPC services that are no longer - * available by setting the port number in the passed-in address - * to zero. Callers pass a netid of "" to unregister all - * transport netids associated with [program, version, address]. + * Callers may also unregister RPC services that are registered at a + * specific address by setting the port number in @address to zero. + * They may unregister all registered protocol families at once for + * a service by passing a NULL @address argument. If @netid is "" + * then all netids for [program, version, address] are unregistered. * * This function uses rpcbind protocol version 4 to contact the * local rpcbind daemon. The local rpcbind daemon must support @@ -374,11 +389,14 @@ int rpcb_v4_register(const u32 program, const u32 version, .rpc_argp = &map, }; + if (address == NULL) + return rpcb_unregister_all_protofamilies(&msg); + switch (address->sa_family) { case AF_INET: - return rpcb_register_netid4(address, &msg); + return rpcb_register_inet4(address, &msg); case AF_INET6: - return rpcb_register_netid6(address, &msg); + return rpcb_register_inet6(address, &msg); } return -EAFNOSUPPORT; From d5a8620f7c8a5bcade730e2fa1224191f289fb00 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 18 Mar 2009 20:47:29 -0400 Subject: [PATCH 214/478] SUNRPC: Simplify svc_unregister() Our initial implementation of svc_unregister() assumed that PMAP_UNSET cleared all rpcbind registrations for a [program, version] tuple. However, we now have evidence that PMAP_UNSET clears only "inet" entries, and not "inet6" entries, in the rpcbind database. For backwards compatibility with the legacy portmapper, the svc_unregister() function also must work if user space doesn't support rpcbind version 4 at all. Thus we'll send an rpcbind v4 UNSET, and if that fails, we'll send a PMAP_UNSET. This simplifies the code in svc_unregister() and provides better backwards compatibility with legacy user space that does not support rpcbind version 4. We can get rid of the conditional compilation in here as well. This patch is part of a series that addresses http://bugzilla.kernel.org/show_bug.cgi?id=12256 Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- net/sunrpc/svc.c | 39 ++++++++++++++++----------------------- 1 file changed, 16 insertions(+), 23 deletions(-) diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index 17e0d7265dfd..bd0ee312dac9 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -896,38 +896,31 @@ int svc_register(const struct svc_serv *serv, const int family, return error; } -#ifdef CONFIG_SUNRPC_REGISTER_V4 - -static void __svc_unregister(const u32 program, const u32 version, - const char *progname) -{ - struct sockaddr_in6 sin6 = { - .sin6_family = AF_INET6, - .sin6_addr = IN6ADDR_ANY_INIT, - .sin6_port = 0, - }; - int error; - - error = rpcb_v4_register(program, version, - (struct sockaddr *)&sin6, ""); - dprintk("svc: %s(%sv%u), error %d\n", - __func__, progname, version, error); -} - -#else /* CONFIG_SUNRPC_REGISTER_V4 */ - +/* + * If user space is running rpcbind, it should take the v4 UNSET + * and clear everything for this [program, version]. If user space + * is running portmap, it will reject the v4 UNSET, but won't have + * any "inet6" entries anyway. So a PMAP_UNSET should be sufficient + * in this case to clear all existing entries for [program, version]. + */ static void __svc_unregister(const u32 program, const u32 version, const char *progname) { int error; - error = rpcb_register(program, version, 0, 0); + error = rpcb_v4_register(program, version, NULL, ""); + + /* + * User space didn't support rpcbind v4, so retry this + * request with the legacy rpcbind v2 protocol. + */ + if (error == -EPROTONOSUPPORT) + error = rpcb_register(program, version, 0, 0); + dprintk("svc: %s(%sv%u), error %d\n", __func__, progname, version, error); } -#endif /* CONFIG_SUNRPC_REGISTER_V4 */ - /* * All netids, bind addresses and ports registered for [program, version] * are removed from the local rpcbind database (if the service is not From cadc0fa534e51e20fdffe1623913c163a18d71b1 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 18 Mar 2009 20:47:36 -0400 Subject: [PATCH 215/478] SUNRPC: Simplify kernel RPC service registration The kernel registers RPC services with the local portmapper with an rpcbind SET upcall to the local portmapper. Traditionally, this used rpcbind v2 (PMAP), but registering RPC services that support IPv6 requires rpcbind v3 or v4. Since we now want separate PF_INET and PF_INET6 listeners for each kernel RPC service, svc_register() will do only one of those registrations at a time. For PF_INET, it tries an rpcb v4 SET upcall first; if that fails, it does a legacy portmap SET. This makes it entirely backwards compatible with legacy user space, but allows a proper v4 SET to be used if rpcbind is available. For PF_INET6, it does an rpcb v4 SET upcall. If that fails, it fails the registration, and thus the transport creation. This let's the kernel detect if user space is able to support IPv6 RPC services, and thus whether it should maintain a PF_INET6 listener for each service at all. This provides complete backwards compatibilty with legacy user space that only supports rpcbind v2. The only down-side is that registering a new kernel RPC service may take an extra exchange with the local portmapper on legacy systems, but this is an infrequent operation and is done over UDP (no lingering sockets in TIMEWAIT), so it shouldn't be consequential. This patch is part of a series that addresses http://bugzilla.kernel.org/show_bug.cgi?id=12256 Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- net/sunrpc/svc.c | 79 +++++++++++++++++++++--------------------------- 1 file changed, 34 insertions(+), 45 deletions(-) diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index bd0ee312dac9..142f64745fba 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -718,8 +718,6 @@ svc_exit_thread(struct svc_rqst *rqstp) } EXPORT_SYMBOL_GPL(svc_exit_thread); -#ifdef CONFIG_SUNRPC_REGISTER_V4 - /* * Register an "inet" protocol family netid with the local * rpcbind daemon via an rpcbind v4 SET request. @@ -734,12 +732,13 @@ static int __svc_rpcb_register4(const u32 program, const u32 version, const unsigned short protocol, const unsigned short port) { - struct sockaddr_in sin = { + const struct sockaddr_in sin = { .sin_family = AF_INET, .sin_addr.s_addr = htonl(INADDR_ANY), .sin_port = htons(port), }; - char *netid; + const char *netid; + int error; switch (protocol) { case IPPROTO_UDP: @@ -752,10 +751,20 @@ static int __svc_rpcb_register4(const u32 program, const u32 version, return -ENOPROTOOPT; } - return rpcb_v4_register(program, version, - (struct sockaddr *)&sin, netid); + error = rpcb_v4_register(program, version, + (const struct sockaddr *)&sin, netid); + + /* + * User space didn't support rpcbind v4, so retry this + * registration request with the legacy rpcbind v2 protocol. + */ + if (error == -EPROTONOSUPPORT) + error = rpcb_register(program, version, protocol, port); + + return error; } +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) /* * Register an "inet6" protocol family netid with the local * rpcbind daemon via an rpcbind v4 SET request. @@ -770,12 +779,13 @@ static int __svc_rpcb_register6(const u32 program, const u32 version, const unsigned short protocol, const unsigned short port) { - struct sockaddr_in6 sin6 = { + const struct sockaddr_in6 sin6 = { .sin6_family = AF_INET6, .sin6_addr = IN6ADDR_ANY_INIT, .sin6_port = htons(port), }; - char *netid; + const char *netid; + int error; switch (protocol) { case IPPROTO_UDP: @@ -788,9 +798,19 @@ static int __svc_rpcb_register6(const u32 program, const u32 version, return -ENOPROTOOPT; } - return rpcb_v4_register(program, version, - (struct sockaddr *)&sin6, netid); + error = rpcb_v4_register(program, version, + (const struct sockaddr *)&sin6, netid); + + /* + * User space didn't support rpcbind version 4, so we won't + * use a PF_INET6 listener. + */ + if (error == -EPROTONOSUPPORT) + error = -EAFNOSUPPORT; + + return error; } +#endif /* defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) */ /* * Register a kernel RPC service via rpcbind version 4. @@ -809,48 +829,17 @@ static int __svc_register(const u32 program, const u32 version, case PF_INET: return __svc_rpcb_register4(program, version, protocol, port); + break; +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) case PF_INET6: - error = __svc_rpcb_register6(program, version, + return__svc_rpcb_register6(program, version, protocol, port); - if (error < 0) - return error; - - /* - * Work around bug in some versions of Linux rpcbind - * which don't allow registration of both inet and - * inet6 netids. - * - * Error return ignored for now. - */ - __svc_rpcb_register4(program, version, - protocol, port); - return 0; +#endif /* defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) */ } return -EAFNOSUPPORT; } -#else /* CONFIG_SUNRPC_REGISTER_V4 */ - -/* - * Register a kernel RPC service via rpcbind version 2. - * - * Returns zero on success; a negative errno value is returned - * if any error occurs. - */ -static int __svc_register(const u32 program, const u32 version, - const int family, - const unsigned short protocol, - const unsigned short port) -{ - if (family != PF_INET) - return -EAFNOSUPPORT; - - return rpcb_register(program, version, protocol, port); -} - -#endif /* CONFIG_SUNRPC_REGISTER_V4 */ - /** * svc_register - register an RPC service with the local portmapper * @serv: svc_serv struct for the service to register From 363f724cdd3d2ae554e261be995abdeb15f7bdd9 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 18 Mar 2009 20:47:44 -0400 Subject: [PATCH 216/478] SUNRPC: rpcb_register() should handle errors silently Move error reporting for RPC registration to rpcb_register's caller. This way the caller can choose to recover silently from certain errors, but report errors it does not recognize. Error reporting for kernel RPC service registration is now handled in one place. This patch is part of a series that addresses http://bugzilla.kernel.org/show_bug.cgi?id=12256 Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- net/sunrpc/rpcb_clnt.c | 2 +- net/sunrpc/svc.c | 18 +++++++++++------- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/net/sunrpc/rpcb_clnt.c b/net/sunrpc/rpcb_clnt.c index 8ea8907d4b8d..beee6da33035 100644 --- a/net/sunrpc/rpcb_clnt.c +++ b/net/sunrpc/rpcb_clnt.c @@ -194,7 +194,7 @@ static int rpcb_register_call(const u32 version, struct rpc_message *msg) error = PTR_ERR(rpcb_clnt); if (error < 0) { - printk(KERN_WARNING "RPC: failed to contact local rpcbind " + dprintk("RPC: failed to contact local rpcbind " "server (errno %d).\n", -error); return error; } diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index 142f64745fba..8ba654bdd608 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -818,26 +818,30 @@ static int __svc_rpcb_register6(const u32 program, const u32 version, * Returns zero on success; a negative errno value is returned * if any error occurs. */ -static int __svc_register(const u32 program, const u32 version, +static int __svc_register(const char *progname, + const u32 program, const u32 version, const int family, const unsigned short protocol, const unsigned short port) { - int error; + int error = -EAFNOSUPPORT; switch (family) { case PF_INET: - return __svc_rpcb_register4(program, version, + error = __svc_rpcb_register4(program, version, protocol, port); break; #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) case PF_INET6: - return__svc_rpcb_register6(program, version, + error = __svc_rpcb_register6(program, version, protocol, port); #endif /* defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) */ } - return -EAFNOSUPPORT; + if (error < 0) + printk(KERN_WARNING "svc: failed to register %sv%u RPC " + "service (errno %d).\n", progname, version, -error); + return error; } /** @@ -875,8 +879,8 @@ int svc_register(const struct svc_serv *serv, const int family, if (progp->pg_vers[i]->vs_hidden) continue; - error = __svc_register(progp->pg_prog, i, - family, proto, port); + error = __svc_register(progp->pg_name, progp->pg_prog, + i, family, proto, port); if (error < 0) break; } From 9355982830ad67dca35e0f3d43319f3d438f82b4 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 18 Mar 2009 20:47:51 -0400 Subject: [PATCH 217/478] SUNRPC: Remove CONFIG_SUNRPC_REGISTER_V4 We just augmented the kernel's RPC service registration code so that it automatically adjusts to what is supported in user space. Thus we no longer need the kernel configuration option to enable registering RPC services with v4 -- it's all done automatically. This patch is part of a series that addresses http://bugzilla.kernel.org/show_bug.cgi?id=12256 Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- net/sunrpc/Kconfig | 22 ---------------------- 1 file changed, 22 deletions(-) diff --git a/net/sunrpc/Kconfig b/net/sunrpc/Kconfig index 5592883e1e4a..afd91c78ce8e 100644 --- a/net/sunrpc/Kconfig +++ b/net/sunrpc/Kconfig @@ -17,28 +17,6 @@ config SUNRPC_XPRT_RDMA If unsure, say N. -config SUNRPC_REGISTER_V4 - bool "Register local RPC services via rpcbind v4 (EXPERIMENTAL)" - depends on SUNRPC && EXPERIMENTAL - default n - help - Sun added support for registering RPC services at an IPv6 - address by creating two new versions of the rpcbind protocol - (RFC 1833). - - This option enables support in the kernel RPC server for - registering kernel RPC services via version 4 of the rpcbind - protocol. If you enable this option, you must run a portmapper - daemon that supports rpcbind protocol version 4. - - Serving NFS over IPv6 from knfsd (the kernel's NFS server) - requires that you enable this option and use a portmapper that - supports rpcbind version 4. - - If unsure, say N to get traditional behavior (register kernel - RPC services using only rpcbind version 2). Distributions - using the legacy Linux portmapper daemon must say N here. - config RPCSEC_GSS_KRB5 tristate "Secure RPC: Kerberos V mechanism (EXPERIMENTAL)" depends on SUNRPC && EXPERIMENTAL From eb16e907781a9da7f272a3e8284c26bc4e4aeb9d Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 18 Mar 2009 20:47:59 -0400 Subject: [PATCH 218/478] lockd: Start PF_INET6 listener only if IPv6 support is available Apparently a lot of people need to disable IPv6 completely on their distributor-built systems, which have CONFIG_IPV6_MODULE enabled at build time. They do this by blacklisting the ipv6.ko module. This causes the creation of the lockd service listener to fail if CONFIG_IPV6_MODULE is set, but the module cannot be loaded. Now that the kernel's PF_INET6 RPC listeners are completely separate from PF_INET listeners, we can always start PF_INET. Then lockd can try to start PF_INET6, but it isn't required to be available. Note this has the added benefit that NLM callbacks from AF_INET6 servers will never come from AF_INET remotes. We no longer have to worry about matching mapped IPv4 addresses to AF_INET when comparing addresses. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/lockd/clntlock.c | 51 +-------------------------------------------- fs/lockd/svc.c | 30 ++++++++++++++++++-------- 2 files changed, 22 insertions(+), 59 deletions(-) diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c index aedc47a264c1..1f3b0fc0d351 100644 --- a/fs/lockd/clntlock.c +++ b/fs/lockd/clntlock.c @@ -139,55 +139,6 @@ int nlmclnt_block(struct nlm_wait *block, struct nlm_rqst *req, long timeout) return 0; } -#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) -static const struct in6_addr *nlmclnt_map_v4addr(const struct sockaddr *sap, - struct in6_addr *addr_mapped) -{ - const struct sockaddr_in *sin = (const struct sockaddr_in *)sap; - - switch (sap->sa_family) { - case AF_INET6: - return &((const struct sockaddr_in6 *)sap)->sin6_addr; - case AF_INET: - ipv6_addr_set_v4mapped(sin->sin_addr.s_addr, addr_mapped); - return addr_mapped; - } - - return NULL; -} - -/* - * If lockd is using a PF_INET6 listener, all incoming requests appear - * to come from AF_INET6 remotes. The address of AF_INET remotes are - * mapped to AF_INET6 automatically by the network layer. In case the - * user passed an AF_INET server address at mount time, ensure both - * addresses are AF_INET6 before comparing them. - */ -static int nlmclnt_cmp_addr(const struct nlm_host *host, - const struct sockaddr *sap) -{ - const struct in6_addr *addr1; - const struct in6_addr *addr2; - struct in6_addr addr1_mapped; - struct in6_addr addr2_mapped; - - addr1 = nlmclnt_map_v4addr(nlm_addr(host), &addr1_mapped); - if (likely(addr1 != NULL)) { - addr2 = nlmclnt_map_v4addr(sap, &addr2_mapped); - if (likely(addr2 != NULL)) - return ipv6_addr_equal(addr1, addr2); - } - - return 0; -} -#else /* !(CONFIG_IPV6 || CONFIG_IPV6_MODULE) */ -static int nlmclnt_cmp_addr(const struct nlm_host *host, - const struct sockaddr *sap) -{ - return nlm_cmp_addr(nlm_addr(host), sap); -} -#endif /* !(CONFIG_IPV6 || CONFIG_IPV6_MODULE) */ - /* * The server lockd has called us back to tell us the lock was granted */ @@ -215,7 +166,7 @@ __be32 nlmclnt_grant(const struct sockaddr *addr, const struct nlm_lock *lock) */ if (fl_blocked->fl_u.nfs_fl.owner->pid != lock->svid) continue; - if (!nlmclnt_cmp_addr(block->b_host, addr)) + if (!nlm_cmp_addr(nlm_addr(block->b_host), addr)) continue; if (nfs_compare_fh(NFS_FH(fl_blocked->fl_file->f_path.dentry->d_inode) ,fh) != 0) continue; diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index 566932b98fd3..abf83881f68a 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -193,20 +193,30 @@ lockd(void *vrqstp) return 0; } -static int create_lockd_listener(struct svc_serv *serv, char *name, - unsigned short port) +static int create_lockd_listener(struct svc_serv *serv, const char *name, + const int family, const unsigned short port) { struct svc_xprt *xprt; - xprt = svc_find_xprt(serv, name, 0, 0); + xprt = svc_find_xprt(serv, name, family, 0); if (xprt == NULL) - return svc_create_xprt(serv, name, PF_INET, - port, SVC_SOCK_DEFAULTS); - + return svc_create_xprt(serv, name, family, port, + SVC_SOCK_DEFAULTS); svc_xprt_put(xprt); return 0; } +static int create_lockd_family(struct svc_serv *serv, const int family) +{ + int err; + + err = create_lockd_listener(serv, "udp", family, nlm_udpport); + if (err < 0) + return err; + + return create_lockd_listener(serv, "tcp", family, nlm_tcpport); +} + /* * Ensure there are active UDP and TCP listeners for lockd. * @@ -222,13 +232,15 @@ static int make_socks(struct svc_serv *serv) static int warned; int err; - err = create_lockd_listener(serv, "udp", nlm_udpport); + err = create_lockd_family(serv, PF_INET); if (err < 0) goto out_err; - err = create_lockd_listener(serv, "tcp", nlm_tcpport); - if (err < 0) +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + err = create_lockd_family(serv, PF_INET6); + if (err < 0 && err != -EAFNOSUPPORT) goto out_err; +#endif /* CONFIG_IPV6 || CONFIG_IPV6_MODULE */ warned = 0; return 0; From f738f5170367b367e38b2d75a413e7b3c52d46a5 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 18 Mar 2009 20:48:06 -0400 Subject: [PATCH 219/478] NFS: Start PF_INET6 callback listener only if IPv6 support is available Apparently a lot of people need to disable IPv6 completely on their distributor-built systems, which have CONFIG_IPV6_MODULE enabled at build time. They do this by blacklisting the ipv6.ko module. This causes the creation of the NFSv4 callback service listener to fail if CONFIG_IPV6_MODULE is set, but the module cannot be loaded. Now that the kernel's PF_INET6 RPC listeners are completely separate from PF_INET listeners, we can always start PF_INET. Then the NFS client can try to start a PF_INET6 listener, but it isn't required to be available. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/nfs/callback.c | 12 ++++++++++++ fs/nfs/callback.h | 1 + fs/nfs/nfs4state.c | 10 ++++++++-- 3 files changed, 21 insertions(+), 2 deletions(-) diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index 0ef47dff89be..a886e692ddd0 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -38,6 +38,7 @@ static struct svc_program nfs4_callback_program; unsigned int nfs_callback_set_tcpport; unsigned short nfs_callback_tcpport; +unsigned short nfs_callback_tcpport6; static const int nfs_set_port_min = 0; static const int nfs_set_port_max = 65535; @@ -119,6 +120,17 @@ int nfs_callback_up(void) dprintk("NFS: Callback listener port = %u (af %u)\n", nfs_callback_tcpport, PF_INET); +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + ret = svc_create_xprt(serv, "tcp", PF_INET6, + nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS); + if (ret > 0) { + nfs_callback_tcpport6 = ret; + dprintk("NFS: Callback listener port = %u (af %u)\n", + nfs_callback_tcpport6, PF_INET6); + } else if (ret != -EAFNOSUPPORT) + goto out_err; +#endif /* defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) */ + nfs_callback_info.rqst = svc_prepare_thread(serv, &serv->sv_pools[0]); if (IS_ERR(nfs_callback_info.rqst)) { ret = PTR_ERR(nfs_callback_info.rqst); diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h index bb25d2135ff1..e110e286a262 100644 --- a/fs/nfs/callback.h +++ b/fs/nfs/callback.h @@ -72,5 +72,6 @@ extern void nfs_callback_down(void); extern unsigned int nfs_callback_set_tcpport; extern unsigned short nfs_callback_tcpport; +extern unsigned short nfs_callback_tcpport6; #endif /* __LINUX_FS_NFS_CALLBACK_H */ diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 2022fe47966f..0298e909559f 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -62,8 +62,14 @@ static LIST_HEAD(nfs4_clientid_list); static int nfs4_init_client(struct nfs_client *clp, struct rpc_cred *cred) { - int status = nfs4_proc_setclientid(clp, NFS4_CALLBACK, - nfs_callback_tcpport, cred); + unsigned short port; + int status; + + port = nfs_callback_tcpport; + if (clp->cl_addr.ss_family == AF_INET6) + port = nfs_callback_tcpport6; + + status = nfs4_proc_setclientid(clp, NFS4_CALLBACK, port, cred); if (status == 0) status = nfs4_proc_setclientid_confirm(clp, cred); if (status == 0) From 3c8c45dfab78a1919f6f8a3ea46998c487eb7e12 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 18 Mar 2009 20:48:14 -0400 Subject: [PATCH 220/478] NFS: Simplify logic to compare socket addresses in client.c Callback requests from IPv4 servers are now always guaranteed to be AF_INET, and never mapped IPv4 AF_INET6 addresses. Both nfs_match_client() and nfs_find_client() can now share the same address comparison logic, so fold them together. We can also dispense with of most of the conditional compilation in here. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/nfs/client.c | 118 ++++++++++++++++++++++-------------------------- 1 file changed, 53 insertions(+), 65 deletions(-) diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 574158ae2398..855daac0f246 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -224,38 +224,6 @@ void nfs_put_client(struct nfs_client *clp) } #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) -static const struct in6_addr *nfs_map_ipv4_addr(const struct sockaddr *sa, struct in6_addr *addr_mapped) -{ - switch (sa->sa_family) { - default: - return NULL; - case AF_INET6: - return &((const struct sockaddr_in6 *)sa)->sin6_addr; - break; - case AF_INET: - ipv6_addr_set_v4mapped(((const struct sockaddr_in *)sa)->sin_addr.s_addr, - addr_mapped); - return addr_mapped; - } -} - -static int nfs_sockaddr_match_ipaddr(const struct sockaddr *sa1, - const struct sockaddr *sa2) -{ - const struct in6_addr *addr1; - const struct in6_addr *addr2; - struct in6_addr addr1_mapped; - struct in6_addr addr2_mapped; - - addr1 = nfs_map_ipv4_addr(sa1, &addr1_mapped); - if (likely(addr1 != NULL)) { - addr2 = nfs_map_ipv4_addr(sa2, &addr2_mapped); - if (likely(addr2 != NULL)) - return ipv6_addr_equal(addr1, addr2); - } - return 0; -} - /* * Test if two ip6 socket addresses refer to the same socket by * comparing relevant fields. The padding bytes specifically, are not @@ -267,38 +235,21 @@ static int nfs_sockaddr_match_ipaddr(const struct sockaddr *sa1, * * The caller should ensure both socket addresses are AF_INET6. */ -static int nfs_sockaddr_cmp_ip6(const struct sockaddr *sa1, - const struct sockaddr *sa2) +static int nfs_sockaddr_match_ipaddr6(const struct sockaddr *sa1, + const struct sockaddr *sa2) { - const struct sockaddr_in6 *saddr1 = (const struct sockaddr_in6 *)sa1; - const struct sockaddr_in6 *saddr2 = (const struct sockaddr_in6 *)sa2; + const struct sockaddr_in6 *sin1 = (const struct sockaddr_in6 *)sa1; + const struct sockaddr_in6 *sin2 = (const struct sockaddr_in6 *)sa2; - if (!ipv6_addr_equal(&saddr1->sin6_addr, - &saddr1->sin6_addr)) + if (ipv6_addr_scope(&sin1->sin6_addr) == IPV6_ADDR_SCOPE_LINKLOCAL && + sin1->sin6_scope_id != sin2->sin6_scope_id) return 0; - if (ipv6_addr_scope(&saddr1->sin6_addr) == IPV6_ADDR_SCOPE_LINKLOCAL && - saddr1->sin6_scope_id != saddr2->sin6_scope_id) - return 0; - return saddr1->sin6_port == saddr2->sin6_port; -} -#else -static int nfs_sockaddr_match_ipaddr4(const struct sockaddr_in *sa1, - const struct sockaddr_in *sa2) -{ - return sa1->sin_addr.s_addr == sa2->sin_addr.s_addr; -} -static int nfs_sockaddr_match_ipaddr(const struct sockaddr *sa1, - const struct sockaddr *sa2) -{ - if (unlikely(sa1->sa_family != AF_INET || sa2->sa_family != AF_INET)) - return 0; - return nfs_sockaddr_match_ipaddr4((const struct sockaddr_in *)sa1, - (const struct sockaddr_in *)sa2); + return ipv6_addr_equal(&sin1->sin6_addr, &sin1->sin6_addr); } - -static int nfs_sockaddr_cmp_ip6(const struct sockaddr * sa1, - const struct sockaddr * sa2) +#else /* !defined(CONFIG_IPV6) && !defined(CONFIG_IPV6_MODULE) */ +static int nfs_sockaddr_match_ipaddr6(const struct sockaddr *sa1, + const struct sockaddr *sa2) { return 0; } @@ -311,20 +262,57 @@ static int nfs_sockaddr_cmp_ip6(const struct sockaddr * sa1, * * The caller should ensure both socket addresses are AF_INET. */ +static int nfs_sockaddr_match_ipaddr4(const struct sockaddr *sa1, + const struct sockaddr *sa2) +{ + const struct sockaddr_in *sin1 = (const struct sockaddr_in *)sa1; + const struct sockaddr_in *sin2 = (const struct sockaddr_in *)sa2; + + return sin1->sin_addr.s_addr == sin2->sin_addr.s_addr; +} + +static int nfs_sockaddr_cmp_ip6(const struct sockaddr *sa1, + const struct sockaddr *sa2) +{ + const struct sockaddr_in6 *sin1 = (const struct sockaddr_in6 *)sa1; + const struct sockaddr_in6 *sin2 = (const struct sockaddr_in6 *)sa2; + + return nfs_sockaddr_match_ipaddr6(sa1, sa2) && + (sin1->sin6_port == sin2->sin6_port); +} + static int nfs_sockaddr_cmp_ip4(const struct sockaddr *sa1, const struct sockaddr *sa2) { - const struct sockaddr_in *saddr1 = (const struct sockaddr_in *)sa1; - const struct sockaddr_in *saddr2 = (const struct sockaddr_in *)sa2; + const struct sockaddr_in *sin1 = (const struct sockaddr_in *)sa1; + const struct sockaddr_in *sin2 = (const struct sockaddr_in *)sa2; - if (saddr1->sin_addr.s_addr != saddr2->sin_addr.s_addr) - return 0; - return saddr1->sin_port == saddr2->sin_port; + return nfs_sockaddr_match_ipaddr4(sa1, sa2) && + (sin1->sin_port == sin2->sin_port); } /* * Test if two socket addresses represent the same actual socket, - * by comparing (only) relevant fields. + * by comparing (only) relevant fields, excluding the port number. + */ +static int nfs_sockaddr_match_ipaddr(const struct sockaddr *sa1, + const struct sockaddr *sa2) +{ + if (sa1->sa_family != sa2->sa_family) + return 0; + + switch (sa1->sa_family) { + case AF_INET: + return nfs_sockaddr_match_ipaddr4(sa1, sa2); + case AF_INET6: + return nfs_sockaddr_match_ipaddr6(sa1, sa2); + } + return 0; +} + +/* + * Test if two socket addresses represent the same actual socket, + * by comparing (only) relevant fields, including the port number. */ static int nfs_sockaddr_cmp(const struct sockaddr *sa1, const struct sockaddr *sa2) From 32ec7fd08b597586774b92ac1cd2678021ccac1b Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Sat, 28 Mar 2009 13:53:26 -0700 Subject: [PATCH 221/478] x86, setup: preemptively save/restore edi and ebp around INT 15 E820 Impact: BIOS bugproofing Since there are BIOSes known to clobber %ebx and %esi for INT 15 E820, assume there is something out there clobbering %edi and/or %ebp too, and don't wait for it to fail. Signed-off-by: H. Peter Anvin --- arch/x86/boot/memory.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/arch/x86/boot/memory.c b/arch/x86/boot/memory.c index a99dbbe77a0c..fcdb10add9c8 100644 --- a/arch/x86/boot/memory.c +++ b/arch/x86/boot/memory.c @@ -20,7 +20,7 @@ static int detect_memory_e820(void) { int count = 0; u32 next = 0; - u32 size, id; + u32 size, id, edi; u8 err; struct e820entry *desc = boot_params.e820_map; @@ -29,10 +29,11 @@ static int detect_memory_e820(void) /* Important: %edx and %esi are clobbered by some BIOSes, so they must be either used for the error output - or explicitly marked clobbered. */ - asm("int $0x15; setc %0" + or explicitly marked clobbered. Given that, assume there + is something out there clobbering %ebp and %edi, too. */ + asm("pushl %%ebp; int $0x15; popl %%ebp; setc %0" : "=d" (err), "+b" (next), "=a" (id), "+c" (size), - "=m" (*desc) + "=D" (edi), "=m" (*desc) : "D" (desc), "d" (SMAP), "a" (0xe820) : "esi"); From c549e71d073a6e9a4847497344db28a784061455 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Sat, 28 Mar 2009 13:53:26 -0700 Subject: [PATCH 222/478] x86, setup: ACPI 3, BIOS workaround for E820-probing code Impact: ACPI 3 spec compliance, BIOS bug workaround The ACPI 3 spec added another field to the E820 buffer -- which is backwards incompatible, since it contains a validity bit. Furthermore, there has been at least one report of a BIOS which assumes that the buffer it is pointed at is the same buffer as for the previous E820 call. Therefore, read the data into a temporary buffer and copy the standard part of it if and only if the valid bit is set. Signed-off-by: H. Peter Anvin --- arch/x86/boot/memory.c | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/arch/x86/boot/memory.c b/arch/x86/boot/memory.c index fcdb10add9c8..d5d2360763dc 100644 --- a/arch/x86/boot/memory.c +++ b/arch/x86/boot/memory.c @@ -2,6 +2,7 @@ * * Copyright (C) 1991, 1992 Linus Torvalds * Copyright 2007 rPath, Inc. - All Rights Reserved + * Copyright 2009 Intel Corporation; author H. Peter Anvin * * This file is part of the Linux kernel, and is made available under * the terms of the GNU General Public License version 2. @@ -16,6 +17,11 @@ #define SMAP 0x534d4150 /* ASCII "SMAP" */ +struct e820_ext_entry { + struct e820entry std; + u32 ext_flags; +} __attribute__((packed)); + static int detect_memory_e820(void) { int count = 0; @@ -23,9 +29,10 @@ static int detect_memory_e820(void) u32 size, id, edi; u8 err; struct e820entry *desc = boot_params.e820_map; + static struct e820_ext_entry buf; /* static so it is zeroed */ do { - size = sizeof(struct e820entry); + size = sizeof buf; /* Important: %edx and %esi are clobbered by some BIOSes, so they must be either used for the error output @@ -33,8 +40,8 @@ static int detect_memory_e820(void) is something out there clobbering %ebp and %edi, too. */ asm("pushl %%ebp; int $0x15; popl %%ebp; setc %0" : "=d" (err), "+b" (next), "=a" (id), "+c" (size), - "=D" (edi), "=m" (*desc) - : "D" (desc), "d" (SMAP), "a" (0xe820) + "=D" (edi), "+m" (buf) + : "D" (&buf), "d" (SMAP), "a" (0xe820) : "esi"); /* BIOSes which terminate the chain with CF = 1 as opposed @@ -53,8 +60,14 @@ static int detect_memory_e820(void) break; } + /* ACPI 3.0 added the extended flags support. If bit 0 + in the extended flags is zero, we're supposed to simply + ignore the entry -- a backwards incompatible change! */ + if (size > 20 && !(buf.ext_flags & 1)) + continue; + + *desc++ = buf.std; count++; - desc++; } while (next && count < ARRAY_SIZE(boot_params.e820_map)); return boot_params.e820_entries = count; From eeafda70bf2807544e96fa4e52b2433cd470ff46 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sun, 29 Mar 2009 12:30:05 -0700 Subject: [PATCH 223/478] PCI: fix HT MSI mapping fix Impact: fix bug This patch reworks the nv_msi_ht_cap_quirk() and will only try to avoid to enable ht_msi on device following that root dev, and don't touch that root dev, but only do that trick with end_device on the chain. Reported-by: Prakash Punnoor Tested-by: Prakash Punnoor Signed-off-by: Yinghai Lu Signed-off-by: Jesse Barnes --- drivers/pci/quirks.c | 32 ++++++++++++++++++++++++++++++-- 1 file changed, 30 insertions(+), 2 deletions(-) diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c index faf02dd00015..9b2f0d96900d 100644 --- a/drivers/pci/quirks.c +++ b/drivers/pci/quirks.c @@ -2206,6 +2206,33 @@ static int __devinit host_bridge_with_leaf(struct pci_dev *host_bridge) return found; } +#define PCI_HT_CAP_SLAVE_CTRL0 4 /* link control */ +#define PCI_HT_CAP_SLAVE_CTRL1 8 /* link control to */ + +static int __devinit is_end_of_ht_chain(struct pci_dev *dev) +{ + int pos, ctrl_off; + int end = 0; + u16 flags, ctrl; + + pos = pci_find_ht_capability(dev, HT_CAPTYPE_SLAVE); + + if (!pos) + goto out; + + pci_read_config_word(dev, pos + PCI_CAP_FLAGS, &flags); + + ctrl_off = ((flags >> 10) & 1) ? + PCI_HT_CAP_SLAVE_CTRL0 : PCI_HT_CAP_SLAVE_CTRL1; + pci_read_config_word(dev, pos + ctrl_off, &ctrl); + + if (ctrl & (1 << 6)) + end = 1; + +out: + return end; +} + static void __devinit nv_ht_enable_msi_mapping(struct pci_dev *dev) { struct pci_dev *host_bridge; @@ -2230,8 +2257,9 @@ static void __devinit nv_ht_enable_msi_mapping(struct pci_dev *dev) if (!found) return; - /* don't enable host_bridge with leaf directly here */ - if (host_bridge == dev && host_bridge_with_leaf(host_bridge)) + /* don't enable end_device/host_bridge with leaf directly here */ + if (host_bridge == dev && is_end_of_ht_chain(host_bridge) && + host_bridge_with_leaf(host_bridge)) goto out; /* root did that ! */ From 5886cea45d1b230f86fd24de708b0d9f14ff88db Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Tue, 31 Mar 2009 19:13:13 +0200 Subject: [PATCH 224/478] [PATCH] sysrq: include interrupt.h instead of irq.h With "cpumask: update irq_desc to use cpumask_var_t" we get this build failure on s390: CC drivers/char/sysrq.o In file included from drivers/char/sysrq.c:38: include/linux/irq.h: In function 'init_alloc_desc_masks': include/linux/irq.h:442: error: dereferencing pointer to incomplete type drivers/char/sysrq.c should include interrupt.h instead of irq.h. Cc: Mike Travis Cc: Ingo Molnar Signed-off-by: Heiko Carstens Signed-off-by: Martin Schwidefsky --- drivers/char/sysrq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/char/sysrq.c b/drivers/char/sysrq.c index 33a9351c896d..3df694e54545 100644 --- a/drivers/char/sysrq.c +++ b/drivers/char/sysrq.c @@ -35,7 +35,7 @@ #include #include #include -#include +#include #include #include From 33b26d7951619e4debf3a82f30a03423a36a1412 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Tue, 31 Mar 2009 19:16:02 +0200 Subject: [PATCH 225/478] [S390] fix hypfs build failure Fix build breakage below which probably was introduced with ("rcu: don't include unnecessary headers, allow kmemtrace w/ tracepoints"). CC arch/s390/hypfs/hypfs_diag.o arch/s390/hypfs/hypfs_diag.c: In function 'diag204_free_buffer': arch/s390/hypfs/hypfs_diag.c:364: error: implicit declaration of function 'free_pages' arch/s390/hypfs/hypfs_diag.c: In function 'diag204_alloc_rbuf': arch/s390/hypfs/hypfs_diag.c:384: error: implicit declaration of function '__get_free_pages' arch/s390/hypfs/hypfs_diag.c:384: error: 'GFP_KERNEL' undeclared (first use in this function) arch/s390/hypfs/hypfs_diag.c:384: error: (Each undeclared identifier is reported only once arch/s390/hypfs/hypfs_diag.c:384: error: for each function it appears in.) Reported-by: Sachin Sant Signed-off-by: Heiko Carstens Signed-off-by: Martin Schwidefsky --- arch/s390/hypfs/hypfs_diag.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/s390/hypfs/hypfs_diag.c b/arch/s390/hypfs/hypfs_diag.c index b1e892a43816..704dd396257b 100644 --- a/arch/s390/hypfs/hypfs_diag.c +++ b/arch/s390/hypfs/hypfs_diag.c @@ -12,6 +12,8 @@ #include #include +#include +#include #include #include #include From 156013ffd1225ef862853a4340b46f76845f8db1 Mon Sep 17 00:00:00 2001 From: Sebastian Ott Date: Tue, 31 Mar 2009 19:16:03 +0200 Subject: [PATCH 226/478] [S390] cio: wake up on failed recognition Wake up even on failed device recognition, since this may be triggered from a user trying to force a device online. With this patch a write to the online sysfs attribute will not block for ever but return with -EAGAIN in this case. Signed-off-by: Sebastian Ott Signed-off-by: Martin Schwidefsky --- drivers/s390/cio/device.c | 10 ++++++++-- drivers/s390/cio/device_fsm.c | 4 ++-- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/drivers/s390/cio/device.c b/drivers/s390/cio/device.c index c4d2f667a2f6..9f016fef6101 100644 --- a/drivers/s390/cio/device.c +++ b/drivers/s390/cio/device.c @@ -482,17 +482,21 @@ static int online_store_recog_and_online(struct ccw_device *cdev) } wait_event(cdev->private->wait_q, cdev->private->flags.recog_done); + if (cdev->private->state != DEV_STATE_OFFLINE) + /* recognition failed */ + return -EAGAIN; } if (cdev->drv && cdev->drv->set_online) ccw_device_set_online(cdev); return 0; } + static int online_store_handle_online(struct ccw_device *cdev, int force) { int ret; ret = online_store_recog_and_online(cdev); - if (ret) + if (ret && !force) return ret; if (force && cdev->private->state == DEV_STATE_BOXED) { ret = ccw_device_stlck(cdev); @@ -500,7 +504,9 @@ static int online_store_handle_online(struct ccw_device *cdev, int force) return ret; if (cdev->id.cu_type == 0) cdev->private->state = DEV_STATE_NOT_OPER; - online_store_recog_and_online(cdev); + ret = online_store_recog_and_online(cdev); + if (ret) + return ret; } return 0; } diff --git a/drivers/s390/cio/device_fsm.c b/drivers/s390/cio/device_fsm.c index 87b4bfca080f..ccd72f95765f 100644 --- a/drivers/s390/cio/device_fsm.c +++ b/drivers/s390/cio/device_fsm.c @@ -260,6 +260,7 @@ ccw_device_recog_done(struct ccw_device *cdev, int state) if (state == DEV_STATE_NOT_OPER) { cdev->private->flags.recog_done = 1; cdev->private->state = DEV_STATE_DISCONNECTED; + wake_up(&cdev->private->wait_q); return; } /* Boxed devices don't need extra treatment. */ @@ -311,8 +312,7 @@ ccw_device_recog_done(struct ccw_device *cdev, int state) } cdev->private->state = state; io_subchannel_recog_done(cdev); - if (state != DEV_STATE_NOT_OPER) - wake_up(&cdev->private->wait_q); + wake_up(&cdev->private->wait_q); } /* From c4621a62649a56f155a96dfc5de479be226f0768 Mon Sep 17 00:00:00 2001 From: Sebastian Ott Date: Tue, 31 Mar 2009 19:16:04 +0200 Subject: [PATCH 227/478] [S390] cio: introduce ccw_device_schedule_sch_unregister Introduce ccw_device_schedule_sch_unregister as a wrapper for queuing ccw_device_call_sch_unregister on the slow_path_wq. This wrapper will be used in the next patch. Signed-off-by: Sebastian Ott Signed-off-by: Martin Schwidefsky --- drivers/s390/cio/device.c | 21 +++++++++++---------- drivers/s390/cio/device.h | 1 + 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/drivers/s390/cio/device.c b/drivers/s390/cio/device.c index 9f016fef6101..cdbf664ed446 100644 --- a/drivers/s390/cio/device.c +++ b/drivers/s390/cio/device.c @@ -310,8 +310,6 @@ static void ccw_device_remove_orphan_cb(struct work_struct *work) put_device(&cdev->dev); } -static void ccw_device_call_sch_unregister(struct work_struct *work); - static void ccw_device_remove_disconnected(struct ccw_device *cdev) { @@ -335,11 +333,10 @@ ccw_device_remove_disconnected(struct ccw_device *cdev) spin_unlock_irqrestore(cdev->ccwlock, flags); PREPARE_WORK(&cdev->private->kick_work, ccw_device_remove_orphan_cb); + queue_work(slow_path_wq, &cdev->private->kick_work); } else /* Deregister subchannel, which will kill the ccw device. */ - PREPARE_WORK(&cdev->private->kick_work, - ccw_device_call_sch_unregister); - queue_work(slow_path_wq, &cdev->private->kick_work); + ccw_device_schedule_sch_unregister(cdev); } /** @@ -1020,6 +1017,13 @@ static void ccw_device_call_sch_unregister(struct work_struct *work) put_device(&sch->dev); } +void ccw_device_schedule_sch_unregister(struct ccw_device *cdev) +{ + PREPARE_WORK(&cdev->private->kick_work, + ccw_device_call_sch_unregister); + queue_work(slow_path_wq, &cdev->private->kick_work); +} + /* * subchannel recognition done. Called from the state machine. */ @@ -1036,9 +1040,7 @@ io_subchannel_recog_done(struct ccw_device *cdev) /* Remove device found not operational. */ if (!get_device(&cdev->dev)) break; - PREPARE_WORK(&cdev->private->kick_work, - ccw_device_call_sch_unregister); - queue_work(slow_path_wq, &cdev->private->kick_work); + ccw_device_schedule_sch_unregister(cdev); if (atomic_dec_and_test(&ccw_device_init_count)) wake_up(&ccw_device_init_wq); break; @@ -1557,8 +1559,7 @@ static int purge_fn(struct device *dev, void *data) goto out; CIO_MSG_EVENT(3, "ccw: purging 0.%x.%04x\n", priv->dev_id.ssid, priv->dev_id.devno); - PREPARE_WORK(&cdev->private->kick_work, ccw_device_call_sch_unregister); - queue_work(slow_path_wq, &cdev->private->kick_work); + ccw_device_schedule_sch_unregister(cdev); out: /* Abort loop in case of pending signal. */ diff --git a/drivers/s390/cio/device.h b/drivers/s390/cio/device.h index 85e01846ca65..f1cbbd94ad4e 100644 --- a/drivers/s390/cio/device.h +++ b/drivers/s390/cio/device.h @@ -87,6 +87,7 @@ int ccw_device_is_orphan(struct ccw_device *); int ccw_device_recognition(struct ccw_device *); int ccw_device_online(struct ccw_device *); int ccw_device_offline(struct ccw_device *); +void ccw_device_schedule_sch_unregister(struct ccw_device *); int ccw_purge_blacklisted(void); /* Function prototypes for device status and basic sense stuff. */ From 47593bfa1056d306fde067b28dd8617009be4121 Mon Sep 17 00:00:00 2001 From: Sebastian Ott Date: Tue, 31 Mar 2009 19:16:05 +0200 Subject: [PATCH 228/478] [S390] cio: introduce notifier for boxed state If a ccw device did not respond in time during internal io, we set it into boxed state. With this patch we have the following behaviour: * the ccw driver will get a notification if the device was online and goes into the boxed state * if the device was disconnected and got boxed nothing special is to be done (it will be handled in reprobing later) * if the device got boxed while initial sensing it will be unregistered Signed-off-by: Sebastian Ott Signed-off-by: Martin Schwidefsky --- arch/s390/include/asm/cio.h | 2 ++ drivers/s390/block/dasd.c | 1 + drivers/s390/cio/device.c | 4 ++-- drivers/s390/cio/device_fsm.c | 29 ++++++++++++++++++----------- drivers/s390/scsi/zfcp_ccw.c | 5 +++++ 5 files changed, 28 insertions(+), 13 deletions(-) diff --git a/arch/s390/include/asm/cio.h b/arch/s390/include/asm/cio.h index 6dccb071aec3..619bf94b11f1 100644 --- a/arch/s390/include/asm/cio.h +++ b/arch/s390/include/asm/cio.h @@ -456,6 +456,8 @@ struct ciw { #define CIO_OPER 0x0004 /* Sick revalidation of device. */ #define CIO_REVALIDATE 0x0008 +/* Device did not respond in time. */ +#define CIO_BOXED 0x0010 /** * struct ccw_dev_id - unique identifier for ccw devices diff --git a/drivers/s390/block/dasd.c b/drivers/s390/block/dasd.c index 2fd64e5a9ab2..0570794ccf1c 100644 --- a/drivers/s390/block/dasd.c +++ b/drivers/s390/block/dasd.c @@ -2363,6 +2363,7 @@ int dasd_generic_notify(struct ccw_device *cdev, int event) ret = 0; switch (event) { case CIO_GONE: + case CIO_BOXED: case CIO_NO_PATH: /* First of all call extended error reporting. */ dasd_eer_write(device, NULL, DASD_EER_NOPATH); diff --git a/drivers/s390/cio/device.c b/drivers/s390/cio/device.c index cdbf664ed446..868f8c6b053a 100644 --- a/drivers/s390/cio/device.c +++ b/drivers/s390/cio/device.c @@ -1035,6 +1035,8 @@ io_subchannel_recog_done(struct ccw_device *cdev) return; } switch (cdev->private->state) { + case DEV_STATE_BOXED: + /* Device did not respond in time. */ case DEV_STATE_NOT_OPER: cdev->private->flags.recog_done = 1; /* Remove device found not operational. */ @@ -1044,8 +1046,6 @@ io_subchannel_recog_done(struct ccw_device *cdev) if (atomic_dec_and_test(&ccw_device_init_count)) wake_up(&ccw_device_init_wq); break; - case DEV_STATE_BOXED: - /* Device did not respond in time. */ case DEV_STATE_OFFLINE: /* * We can't register the device in interrupt context so diff --git a/drivers/s390/cio/device_fsm.c b/drivers/s390/cio/device_fsm.c index ccd72f95765f..e46049261561 100644 --- a/drivers/s390/cio/device_fsm.c +++ b/drivers/s390/cio/device_fsm.c @@ -256,14 +256,12 @@ ccw_device_recog_done(struct ccw_device *cdev, int state) old_lpm = 0; if (sch->lpm != old_lpm) __recover_lost_chpids(sch, old_lpm); - if (cdev->private->state == DEV_STATE_DISCONNECTED_SENSE_ID) { - if (state == DEV_STATE_NOT_OPER) { - cdev->private->flags.recog_done = 1; - cdev->private->state = DEV_STATE_DISCONNECTED; - wake_up(&cdev->private->wait_q); - return; - } - /* Boxed devices don't need extra treatment. */ + if (cdev->private->state == DEV_STATE_DISCONNECTED_SENSE_ID && + (state == DEV_STATE_NOT_OPER || state == DEV_STATE_BOXED)) { + cdev->private->flags.recog_done = 1; + cdev->private->state = DEV_STATE_DISCONNECTED; + wake_up(&cdev->private->wait_q); + return; } notify = 0; same_dev = 0; /* Keep the compiler quiet... */ @@ -275,7 +273,7 @@ ccw_device_recog_done(struct ccw_device *cdev, int state) sch->schid.ssid, sch->schid.sch_no); break; case DEV_STATE_OFFLINE: - if (cdev->private->state == DEV_STATE_DISCONNECTED_SENSE_ID) { + if (cdev->online) { same_dev = ccw_device_handle_oper(cdev); notify = 1; } @@ -308,6 +306,12 @@ ccw_device_recog_done(struct ccw_device *cdev, int state) " subchannel 0.%x.%04x\n", cdev->private->dev_id.devno, sch->schid.ssid, sch->schid.sch_no); + if (cdev->id.cu_type != 0) { /* device was recognized before */ + cdev->private->flags.recog_done = 1; + cdev->private->state = DEV_STATE_BOXED; + wake_up(&cdev->private->wait_q); + return; + } break; } cdev->private->state = state; @@ -390,10 +394,13 @@ ccw_device_done(struct ccw_device *cdev, int state) cdev->private->state = state; - - if (state == DEV_STATE_BOXED) + if (state == DEV_STATE_BOXED) { CIO_MSG_EVENT(0, "Boxed device %04x on subchannel %04x\n", cdev->private->dev_id.devno, sch->schid.sch_no); + if (cdev->online && !ccw_device_notify(cdev, CIO_BOXED)) + ccw_device_schedule_sch_unregister(cdev); + cdev->private->flags.donotify = 0; + } if (cdev->private->flags.donotify) { cdev->private->flags.donotify = 0; diff --git a/drivers/s390/scsi/zfcp_ccw.c b/drivers/s390/scsi/zfcp_ccw.c index 1fe1e2eda512..cfb0dcb6e3ff 100644 --- a/drivers/s390/scsi/zfcp_ccw.c +++ b/drivers/s390/scsi/zfcp_ccw.c @@ -176,6 +176,11 @@ static int zfcp_ccw_notify(struct ccw_device *ccw_device, int event) zfcp_erp_adapter_reopen(adapter, ZFCP_STATUS_COMMON_ERP_FAILED, "ccnoti4", NULL); break; + case CIO_BOXED: + dev_warn(&adapter->ccw_device->dev, + "The ccw device did not respond in time.\n"); + zfcp_erp_adapter_shutdown(adapter, 0, "ccnoti5", NULL); + break; } return 1; } From b5cd99e6b002776c0e946f38292adbb0258b7983 Mon Sep 17 00:00:00 2001 From: Sebastian Ott Date: Tue, 31 Mar 2009 19:16:06 +0200 Subject: [PATCH 229/478] [S390] cio: disallow online setting of device in transient state Return -EAGAIN on writes to sysfs online attribute if the corresponding ccw device is in transient state. Signed-off-by: Sebastian Ott Signed-off-by: Martin Schwidefsky --- drivers/s390/cio/device.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/s390/cio/device.c b/drivers/s390/cio/device.c index 868f8c6b053a..e47aa3f04769 100644 --- a/drivers/s390/cio/device.c +++ b/drivers/s390/cio/device.c @@ -515,7 +515,11 @@ static ssize_t online_store (struct device *dev, struct device_attribute *attr, int force, ret; unsigned long i; - if (atomic_cmpxchg(&cdev->private->onoff, 0, 1) != 0) + if ((cdev->private->state != DEV_STATE_OFFLINE && + cdev->private->state != DEV_STATE_ONLINE && + cdev->private->state != DEV_STATE_BOXED && + cdev->private->state != DEV_STATE_DISCONNECTED) || + atomic_cmpxchg(&cdev->private->onoff, 0, 1) != 0) return -EAGAIN; if (cdev->drv && !try_module_get(cdev->drv->owner)) { From 99f6a570eedc885675b6aa36b7acdbdcc3a7f55b Mon Sep 17 00:00:00 2001 From: Sebastian Ott Date: Tue, 31 Mar 2009 19:16:07 +0200 Subject: [PATCH 230/478] [S390] cio: online_store - trigger recognition for boxed devices Start a new device recognition if someone writes to sysfs online attribute of a boxed ccw device. The current test will fail, since cu_type != 0 for devices which were recognized before. Signed-off-by: Sebastian Ott Signed-off-by: Martin Schwidefsky --- drivers/s390/cio/device.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/s390/cio/device.c b/drivers/s390/cio/device.c index e47aa3f04769..35441fa16be1 100644 --- a/drivers/s390/cio/device.c +++ b/drivers/s390/cio/device.c @@ -468,7 +468,7 @@ static int online_store_recog_and_online(struct ccw_device *cdev) int ret; /* Do device recognition, if needed. */ - if (cdev->id.cu_type == 0) { + if (cdev->private->state == DEV_STATE_BOXED) { ret = ccw_device_recognition(cdev); if (ret) { CIO_MSG_EVENT(0, "Couldn't start recognition " From 9010941c5483a7a5bb1f7d97ee62491fb078bb51 Mon Sep 17 00:00:00 2001 From: Elias Oltmanns Date: Tue, 31 Mar 2009 20:14:56 +0200 Subject: [PATCH 231/478] ide: Fix code dealing with sleeping devices in do_ide_request() Unfortunately, I missed a catch when reviewing the patch committed as 201bffa4. Here is the fix to the currently broken handling of sleeping devices. In particular, this is required to get the disk shock protection code working again. Reported-by: Christian Thaeter Cc: stable@kernel.org Signed-off-by: Elias Oltmanns Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/ide-io.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c index 1adc5e2e7fb3..3c52317d8524 100644 --- a/drivers/ide/ide-io.c +++ b/drivers/ide/ide-io.c @@ -481,11 +481,10 @@ repeat: prev_port = hwif->host->cur_port; hwif->rq = NULL; - if (drive->dev_flags & IDE_DFLAG_SLEEPING) { - if (time_before(drive->sleep, jiffies)) { - ide_unlock_port(hwif); - goto plug_device; - } + if (drive->dev_flags & IDE_DFLAG_SLEEPING && + time_after(drive->sleep, jiffies)) { + ide_unlock_port(hwif); + goto plug_device; } if ((hwif->host->host_flags & IDE_HFLAG_SERIALIZE) && From 479edf065576aeed7ac99d10838bb3b4f870b5f9 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Tue, 31 Mar 2009 20:14:57 +0200 Subject: [PATCH 232/478] ide: drivers/ide/ide-atapi.c needs On m68k: | drivers/ide/ide-atapi.c: In function 'ide_io_buffers': | drivers/ide/ide-atapi.c:87: error: implicit declaration of function 'sg_page' | drivers/ide/ide-atapi.c:87: warning: passing argument 1 of 'PageHighMem' makes pointer from integer without a cast | drivers/ide/ide-atapi.c:91: warning: passing argument 1 of 'kmap_atomic' makes pointer from integer without a cast | drivers/ide/ide-atapi.c:96: error: implicit declaration of function 'sg_virt' | drivers/ide/ide-atapi.c:96: warning: assignment makes pointer from integer without a cast | drivers/ide/ide-atapi.c:107: error: implicit declaration of function 'sg_next' | drivers/ide/ide-atapi.c:107: warning: assignment makes pointer from integer without a cast [bart: Dmitri Vorobiev submitted similar patch fixing MIPS] Signed-off-by: Geert Uytterhoeven Cc: Dmitri Vorobiev Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/ide-atapi.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c index 2fb5d28a9be5..d937e45a6777 100644 --- a/drivers/ide/ide-atapi.c +++ b/drivers/ide/ide-atapi.c @@ -6,6 +6,8 @@ #include #include #include +#include + #include #ifdef DEBUG From da19620d99377a52b953245089f831a9c3f049c2 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Tue, 31 Mar 2009 20:14:57 +0200 Subject: [PATCH 233/478] at91_ide: fix ->ftf_flags handling Fix some incorrect IDE_FTFLAG_* changes which slipped in commit "ide: add "flagged" taskfile flags to struct ide_taskfile (v2)" (commit 19710d25d50ae0be05eebe4231ed8918b1092d82) few days ago. Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/at91_ide.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/ide/at91_ide.c b/drivers/ide/at91_ide.c index 27547121daff..6bb45a2b24c2 100644 --- a/drivers/ide/at91_ide.c +++ b/drivers/ide/at91_ide.c @@ -192,10 +192,10 @@ static void at91_ide_tf_load(ide_drive_t *drive, struct ide_cmd *cmd) struct ide_taskfile *tf = &cmd->tf; u8 HIHI = (cmd->tf_flags & IDE_TFLAG_LBA48) ? 0xE0 : 0xEF; - if (cmd->tf_flags & IDE_FTFLAG_FLAGGED) + if (cmd->ftf_flags & IDE_FTFLAG_FLAGGED) HIHI = 0xFF; - if (cmd->tf_flags & IDE_FTFLAG_OUT_DATA) { + if (cmd->ftf_flags & IDE_FTFLAG_OUT_DATA) { u16 data = (tf->hob_data << 8) | tf->data; at91_ide_output_data(drive, NULL, &data, 2); @@ -233,7 +233,7 @@ static void at91_ide_tf_read(ide_drive_t *drive, struct ide_cmd *cmd) struct ide_io_ports *io_ports = &hwif->io_ports; struct ide_taskfile *tf = &cmd->tf; - if (cmd->tf_flags & IDE_FTFLAG_IN_DATA) { + if (cmd->ftf_flags & IDE_FTFLAG_IN_DATA) { u16 data; at91_ide_input_data(drive, NULL, &data, 2); From 2eba08270990b99fb5429b76ee97184ddd272f7f Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 31 Mar 2009 20:14:58 +0200 Subject: [PATCH 234/478] ide-atapi: start DMA after issuing a packet command MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Apparently¹, some ATAPI devices want to see the packet command first before enabling DMA otherwise they simply hang indefinitely. Reorder the two steps and start DMA only after having issued the command first. [1] http://marc.info/?l=linux-kernel&m=123835520317235&w=2 Signed-off-by: Borislav Petkov Reported-by: Michael Roth Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/ide-atapi.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c index d937e45a6777..f591166d2c93 100644 --- a/drivers/ide/ide-atapi.c +++ b/drivers/ide/ide-atapi.c @@ -613,6 +613,10 @@ static ide_startstop_t ide_transfer_pc(ide_drive_t *drive) : ide_pc_intr), timeout); + /* Send the actual packet */ + if ((drive->atapi_flags & IDE_AFLAG_ZIP_DRIVE) == 0) + hwif->tp_ops->output_data(drive, NULL, rq->cmd, cmd_len); + /* Begin DMA, if necessary */ if (dev_is_idecd(drive)) { if (drive->dma) @@ -624,10 +628,6 @@ static ide_startstop_t ide_transfer_pc(ide_drive_t *drive) } } - /* Send the actual packet */ - if ((drive->atapi_flags & IDE_AFLAG_ZIP_DRIVE) == 0) - hwif->tp_ops->output_data(drive, NULL, rq->cmd, cmd_len); - return ide_started; } From 7a00798b1a7502ff31736152b23189138db0b978 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Tue, 31 Mar 2009 20:14:59 +0200 Subject: [PATCH 235/478] ide: add support for arbitrary transfer lengths to ide_pio_bytes() Add support for arbitrary transfer lengths to ide_pio_bytes() and then inline ide_pio_multi() into ide_pio_datablock(). There should be no functional changes caused by this patch. Cc: Borislav Petkov Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/ide-taskfile.c | 73 +++++++++++++++++++------------------- 1 file changed, 37 insertions(+), 36 deletions(-) diff --git a/drivers/ide/ide-taskfile.c b/drivers/ide/ide-taskfile.c index 84532be97c00..8d7e87d04b3b 100644 --- a/drivers/ide/ide-taskfile.c +++ b/drivers/ide/ide-taskfile.c @@ -189,7 +189,7 @@ static u8 wait_drive_not_busy(ide_drive_t *drive) } static void ide_pio_bytes(ide_drive_t *drive, struct ide_cmd *cmd, - unsigned int write, unsigned int nr_bytes) + unsigned int write, unsigned int len) { ide_hwif_t *hwif = drive->hwif; struct scatterlist *sg = hwif->sg_table; @@ -202,56 +202,55 @@ static void ide_pio_bytes(ide_drive_t *drive, struct ide_cmd *cmd, u8 *buf; cursg = cmd->cursg; - if (!cursg) { - cursg = sg; - cmd->cursg = sg; - } + if (cursg == NULL) + cursg = cmd->cursg = sg; - page = sg_page(cursg); - offset = cursg->offset + cmd->cursg_ofs; + while (len) { + unsigned nr_bytes = min(len, cursg->length - cmd->cursg_ofs); - /* get the current page and offset */ - page = nth_page(page, (offset >> PAGE_SHIFT)); - offset %= PAGE_SIZE; + if (nr_bytes > PAGE_SIZE) + nr_bytes = PAGE_SIZE; + + page = sg_page(cursg); + offset = cursg->offset + cmd->cursg_ofs; + + /* get the current page and offset */ + page = nth_page(page, (offset >> PAGE_SHIFT)); + offset %= PAGE_SIZE; #ifdef CONFIG_HIGHMEM - local_irq_save(flags); + local_irq_save(flags); #endif - buf = kmap_atomic(page, KM_BIO_SRC_IRQ) + offset; + buf = kmap_atomic(page, KM_BIO_SRC_IRQ) + offset; - cmd->nleft -= nr_bytes; - cmd->cursg_ofs += nr_bytes; + cmd->nleft -= nr_bytes; + cmd->cursg_ofs += nr_bytes; - if (cmd->cursg_ofs == cursg->length) { - cmd->cursg = sg_next(cmd->cursg); - cmd->cursg_ofs = 0; - } + if (cmd->cursg_ofs == cursg->length) { + cursg = cmd->cursg = sg_next(cmd->cursg); + cmd->cursg_ofs = 0; + } - /* do the actual data transfer */ - if (write) - hwif->tp_ops->output_data(drive, cmd, buf, nr_bytes); - else - hwif->tp_ops->input_data(drive, cmd, buf, nr_bytes); + /* do the actual data transfer */ + if (write) + hwif->tp_ops->output_data(drive, cmd, buf, nr_bytes); + else + hwif->tp_ops->input_data(drive, cmd, buf, nr_bytes); - kunmap_atomic(buf, KM_BIO_SRC_IRQ); + kunmap_atomic(buf, KM_BIO_SRC_IRQ); #ifdef CONFIG_HIGHMEM - local_irq_restore(flags); + local_irq_restore(flags); #endif -} -static void ide_pio_multi(ide_drive_t *drive, struct ide_cmd *cmd, - unsigned int write) -{ - unsigned int nsect; - - nsect = min_t(unsigned int, cmd->nleft >> 9, drive->mult_count); - while (nsect--) - ide_pio_bytes(drive, cmd, write, SECTOR_SIZE); + len -= nr_bytes; + } } static void ide_pio_datablock(ide_drive_t *drive, struct ide_cmd *cmd, unsigned int write) { + unsigned int nr_bytes; + u8 saved_io_32bit = drive->io_32bit; if (cmd->tf_flags & IDE_TFLAG_FS) @@ -263,9 +262,11 @@ static void ide_pio_datablock(ide_drive_t *drive, struct ide_cmd *cmd, touch_softlockup_watchdog(); if (cmd->tf_flags & IDE_TFLAG_MULTI_PIO) - ide_pio_multi(drive, cmd, write); + nr_bytes = min_t(unsigned, cmd->nleft, drive->mult_count << 9); else - ide_pio_bytes(drive, cmd, write, SECTOR_SIZE); + nr_bytes = SECTOR_SIZE; + + ide_pio_bytes(drive, cmd, write, nr_bytes); drive->io_32bit = saved_io_32bit; } From f2bc316736e69e5623443a010f9581a01429c075 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Tue, 31 Mar 2009 20:14:59 +0200 Subject: [PATCH 236/478] ide: use PageHighMem() instead of ifdefs in ide_pio_bytes() Use PageHighMem() instead of ifdefs in ide_pio_bytes() (=> local IRQs won't be disabled when not necessary). Cc: Borislav Petkov Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/ide-taskfile.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/drivers/ide/ide-taskfile.c b/drivers/ide/ide-taskfile.c index 8d7e87d04b3b..0e333ecf2ad6 100644 --- a/drivers/ide/ide-taskfile.c +++ b/drivers/ide/ide-taskfile.c @@ -195,9 +195,7 @@ static void ide_pio_bytes(ide_drive_t *drive, struct ide_cmd *cmd, struct scatterlist *sg = hwif->sg_table; struct scatterlist *cursg = cmd->cursg; struct page *page; -#ifdef CONFIG_HIGHMEM unsigned long flags; -#endif unsigned int offset; u8 *buf; @@ -218,9 +216,9 @@ static void ide_pio_bytes(ide_drive_t *drive, struct ide_cmd *cmd, page = nth_page(page, (offset >> PAGE_SHIFT)); offset %= PAGE_SIZE; -#ifdef CONFIG_HIGHMEM - local_irq_save(flags); -#endif + if (PageHighMem(page)) + local_irq_save(flags); + buf = kmap_atomic(page, KM_BIO_SRC_IRQ) + offset; cmd->nleft -= nr_bytes; @@ -238,9 +236,9 @@ static void ide_pio_bytes(ide_drive_t *drive, struct ide_cmd *cmd, hwif->tp_ops->input_data(drive, cmd, buf, nr_bytes); kunmap_atomic(buf, KM_BIO_SRC_IRQ); -#ifdef CONFIG_HIGHMEM - local_irq_restore(flags); -#endif + + if (PageHighMem(page)) + local_irq_restore(flags); len -= nr_bytes; } From 116e690f4e69ce0458a9be7010c80b59eb7a99d8 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Tue, 31 Mar 2009 20:14:59 +0200 Subject: [PATCH 237/478] ide-cd: remove dead URLs Cc: Borislav Petkov Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/ide-cd.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c index 3f630e4080d4..a71ca2a9ab4b 100644 --- a/drivers/ide/ide-cd.c +++ b/drivers/ide/ide-cd.c @@ -12,12 +12,9 @@ * See Documentation/cdrom/ide-cd for usage information. * * Suggestions are welcome. Patches that work are more welcome though. ;-) - * For those wishing to work on this driver, please be sure you download - * and comply with the latest Mt. Fuji (SFF8090 version 4) and ATAPI - * (SFF-8020i rev 2.6) standards. These documents can be obtained by - * anonymous ftp from: - * ftp://fission.dt.wdc.com/pub/standards/SFF_atapi/spec/SFF8020-r2.6/PS/8020r26.ps - * ftp://ftp.avc-pioneer.com/Mtfuji4/Spec/Fuji4r10.pdf + * + * Documentation: + * Mt. Fuji (SFF8090 version 4) and ATAPI (SFF-8020i rev 2.6) standards. * * For historical changelog please see: * Documentation/ide/ChangeLog.ide-cd.1994-2004 From bf12a9c1c95e1b0204fc2fc9fe625a056e284f5a Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Tue, 31 Mar 2009 20:15:00 +0200 Subject: [PATCH 238/478] ide-cd: use ide_end_rq() also for failed non-fs requests Use ide_end_rq() also for failed non-fs requests on completion of REQUEST SENSE requests + use blk_rq_bytes() while at it. Cc: Borislav Petkov Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/ide-cd.c | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c index a71ca2a9ab4b..6f64fb2f63d0 100644 --- a/drivers/ide/ide-cd.c +++ b/drivers/ide/ide-cd.c @@ -265,18 +265,10 @@ static void cdrom_end_request(ide_drive_t *drive, int uptodate) failed->sense_len = rq->sense_len; } cdrom_analyze_sense_data(drive, failed, sense); - /* - * now end the failed request - */ - if (blk_fs_request(failed)) { - if (ide_end_rq(drive, failed, -EIO, - failed->hard_nr_sectors << 9)) - BUG(); - } else { - if (blk_end_request(failed, -EIO, - failed->data_len)) - BUG(); - } + + if (ide_end_rq(drive, failed, -EIO, + blk_rq_bytes(failed))) + BUG(); } else cdrom_analyze_sense_data(drive, NULL, sense); } From 13eae6a48fc57495eb9e733430b8fc20df7bf415 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Tue, 31 Mar 2009 20:15:00 +0200 Subject: [PATCH 239/478] ide-cd: remove dead code from cdrom_decode_status() There should be no functional changes caused by this patch. Cc: Borislav Petkov Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/ide-cd.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c index 6f64fb2f63d0..4a289711c551 100644 --- a/drivers/ide/ide-cd.c +++ b/drivers/ide/ide-cd.c @@ -321,12 +321,6 @@ static int cdrom_decode_status(ide_drive_t *drive, int good_stat, int *stat_ret) err = ide_read_error(drive); sense_key = err >> 4; - if (rq == NULL) { - printk(KERN_ERR PFX "%s: missing rq in %s\n", - drive->name, __func__); - return 1; - } - ide_debug_log(IDE_DBG_RQ, "stat: 0x%x, good_stat: 0x%x, cmd[0]: 0x%x, " "rq->cmd_type: 0x%x, err: 0x%x", stat, good_stat, rq->cmd[0], rq->cmd_type, From 1ab6d7451684078bfc4fbabc432f0ef8a809e975 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Tue, 31 Mar 2009 20:15:00 +0200 Subject: [PATCH 240/478] ide-cd: remove needless ide_dump_status_no_sense() wrapper It makes no sense to check for BSY bit being set as earlier OK_STAT() check in cdrom_end_request() makes sure that BSY bit is cleared. There should be no functional changes caused by this patch. Cc: Borislav Petkov Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/ide-cd.c | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c index 4a289711c551..19ccadead5e8 100644 --- a/drivers/ide/ide-cd.c +++ b/drivers/ide/ide-cd.c @@ -290,13 +290,6 @@ static void cdrom_end_request(ide_drive_t *drive, int uptodate) ide_complete_rq(drive, uptodate ? 0 : -EIO, nsectors << 9); } -static void ide_dump_status_no_sense(ide_drive_t *drive, const char *msg, u8 st) -{ - if (st & 0x80) - return; - ide_dump_status(drive, msg, st); -} - /* * Returns: * 0: if the request should be continued. @@ -439,21 +432,19 @@ static int cdrom_decode_status(ide_drive_t *drive, int good_stat, int *stat_ret) * No point in retrying after an illegal request or data * protect error. */ - ide_dump_status_no_sense(drive, "command error", stat); + ide_dump_status(drive, "command error", stat); do_end_request = 1; } else if (sense_key == MEDIUM_ERROR) { /* * No point in re-trying a zillion times on a bad * sector. If we got here the error is not correctable. */ - ide_dump_status_no_sense(drive, - "media error (bad sector)", - stat); + ide_dump_status(drive, "media error (bad sector)", + stat); do_end_request = 1; } else if (sense_key == BLANK_CHECK) { /* disk appears blank ?? */ - ide_dump_status_no_sense(drive, "media error (blank)", - stat); + ide_dump_status(drive, "media error (blank)", stat); do_end_request = 1; } else if ((err & ~ATA_ABORTED) != 0) { /* go to the default handler for other errors */ From 6041e8fba8b9a9a64bd7402be700b0f1247a9c55 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Tue, 31 Mar 2009 20:15:01 +0200 Subject: [PATCH 241/478] ide-cd: remove no longer needed 'ignore' module parameter ide-scsi is gone... Cc: Borislav Petkov Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/ide-cd.c | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c index 19ccadead5e8..9d3150f549fe 100644 --- a/drivers/ide/ide-cd.c +++ b/drivers/ide/ide-cd.c @@ -1942,9 +1942,6 @@ static struct block_device_operations idecd_ops = { }; /* module options */ -static char *ignore; -module_param(ignore, charp, 0400); - static unsigned long debug_mask; module_param(debug_mask, ulong, 0644); @@ -1965,15 +1962,6 @@ static int ide_cd_probe(ide_drive_t *drive) if (drive->media != ide_cdrom && drive->media != ide_optical) goto failed; - /* skip drives that we were told to ignore */ - if (ignore != NULL) { - if (strstr(ignore, drive->name)) { - printk(KERN_INFO PFX "ignoring drive %s\n", - drive->name); - goto failed; - } - } - drive->debug_mask = debug_mask; drive->irq_handler = cdrom_newpc_intr; From 299c4852fc6995e0665d246927d25cefd4dad754 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Tue, 31 Mar 2009 20:15:01 +0200 Subject: [PATCH 242/478] ide-cd: factor out failed request completion from cdrom_end_request() There should be no functional changes caused by this patch. Cc: Borislav Petkov Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/ide-cd.c | 47 +++++++++++++++++++++++--------------------- 1 file changed, 25 insertions(+), 22 deletions(-) diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c index 9d3150f549fe..eb0c2fe2b874 100644 --- a/drivers/ide/ide-cd.c +++ b/drivers/ide/ide-cd.c @@ -242,6 +242,29 @@ static void cdrom_queue_request_sense(ide_drive_t *drive, void *sense, elv_add_request(drive->queue, rq, ELEVATOR_INSERT_FRONT, 0); } +static void ide_cd_complete_failed_rq(ide_drive_t *drive, struct request *rq) +{ + /* + * For REQ_TYPE_SENSE, "rq->buffer" points to the original + * failed request + */ + struct request *failed = (struct request *)rq->buffer; + struct cdrom_info *info = drive->driver_data; + void *sense = &info->sense_data; + + if (failed) { + if (failed->sense) { + sense = failed->sense; + failed->sense_len = rq->sense_len; + } + cdrom_analyze_sense_data(drive, failed, sense); + + if (ide_end_rq(drive, failed, -EIO, blk_rq_bytes(failed))) + BUG(); + } else + cdrom_analyze_sense_data(drive, NULL, sense); +} + static void cdrom_end_request(ide_drive_t *drive, int uptodate) { struct request *rq = drive->hwif->rq; @@ -250,28 +273,8 @@ static void cdrom_end_request(ide_drive_t *drive, int uptodate) ide_debug_log(IDE_DBG_FUNC, "cmd: 0x%x, uptodate: 0x%x, nsectors: %d", rq->cmd[0], uptodate, nsectors); - if (blk_sense_request(rq) && uptodate) { - /* - * For REQ_TYPE_SENSE, "rq->buffer" points to the original - * failed request - */ - struct request *failed = (struct request *) rq->buffer; - struct cdrom_info *info = drive->driver_data; - void *sense = &info->sense_data; - - if (failed) { - if (failed->sense) { - sense = failed->sense; - failed->sense_len = rq->sense_len; - } - cdrom_analyze_sense_data(drive, failed, sense); - - if (ide_end_rq(drive, failed, -EIO, - blk_rq_bytes(failed))) - BUG(); - } else - cdrom_analyze_sense_data(drive, NULL, sense); - } + if (blk_sense_request(rq) && uptodate) + ide_cd_complete_failed_rq(drive, rq); if (!rq->current_nr_sectors && blk_fs_request(rq)) uptodate = 1; From e0458ccb0668edbecbc1ae1c17ed58a6b1a4ff3e Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Tue, 31 Mar 2009 20:15:02 +0200 Subject: [PATCH 243/478] ide-cd: unify ide_cd_do_request() exit paths * Move cdrom_end_request() calls from cdrom_start_rw() and ide_cd_prepare_rw_request() to ide_cd_do_request(). * Unify ide_cd_do_request() exit paths. There should be no functional changes caused by this patch. Cc: Borislav Petkov Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/ide-cd.c | 28 ++++++++++++---------------- 1 file changed, 12 insertions(+), 16 deletions(-) diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c index eb0c2fe2b874..3e3058cff843 100644 --- a/drivers/ide/ide-cd.c +++ b/drivers/ide/ide-cd.c @@ -595,7 +595,6 @@ static ide_startstop_t ide_cd_prepare_rw_request(ide_drive_t *drive, printk(KERN_ERR PFX "%s: %s: buffer botch (%u)\n", drive->name, __func__, rq->current_nr_sectors); - cdrom_end_request(drive, 0); return ide_stopped; } rq->current_nr_sectors += nskip; @@ -972,10 +971,8 @@ static ide_startstop_t cdrom_start_rw(ide_drive_t *drive, struct request *rq) if (write) { /* disk has become write protected */ - if (get_disk_ro(cd->disk)) { - cdrom_end_request(drive, 0); + if (get_disk_ro(cd->disk)) return ide_stopped; - } } else { /* * We may be retrying this request after an error. Fix up any @@ -987,10 +984,8 @@ static ide_startstop_t cdrom_start_rw(ide_drive_t *drive, struct request *rq) /* use DMA, if possible / writes *must* be hardware frame aligned */ if ((rq->nr_sectors & (sectors_per_frame - 1)) || (rq->sector & (sectors_per_frame - 1))) { - if (write) { - cdrom_end_request(drive, 0); + if (write) return ide_stopped; - } drive->dma = 0; } else drive->dma = !!(drive->dev_flags & IDE_DFLAG_USING_DMA); @@ -1045,6 +1040,7 @@ static ide_startstop_t ide_cd_do_request(ide_drive_t *drive, struct request *rq, sector_t block) { struct ide_cmd cmd; + int uptodate = 0; ide_debug_log(IDE_DBG_RQ, "cmd: 0x%x, block: %llu", rq->cmd[0], (unsigned long long)block); @@ -1053,11 +1049,9 @@ static ide_startstop_t ide_cd_do_request(ide_drive_t *drive, struct request *rq, blk_dump_rq_flags(rq, "ide_cd_do_request"); if (blk_fs_request(rq)) { - if (cdrom_start_rw(drive, rq) == ide_stopped) - return ide_stopped; - - if (ide_cd_prepare_rw_request(drive, rq) == ide_stopped) - return ide_stopped; + if (cdrom_start_rw(drive, rq) == ide_stopped || + ide_cd_prepare_rw_request(drive, rq) == ide_stopped) + goto out_end; } else if (blk_sense_request(rq) || blk_pc_request(rq) || rq->cmd_type == REQ_TYPE_ATA_PC) { if (!rq->timeout) @@ -1066,12 +1060,11 @@ static ide_startstop_t ide_cd_do_request(ide_drive_t *drive, struct request *rq, cdrom_do_block_pc(drive, rq); } else if (blk_special_request(rq)) { /* right now this can only be a reset... */ - cdrom_end_request(drive, 1); - return ide_stopped; + uptodate = 1; + goto out_end; } else { blk_dump_rq_flags(rq, DRV_NAME " bad flags"); - cdrom_end_request(drive, 0); - return ide_stopped; + goto out_end; } memset(&cmd, 0, sizeof(cmd)); @@ -1082,6 +1075,9 @@ static ide_startstop_t ide_cd_do_request(ide_drive_t *drive, struct request *rq, cmd.rq = rq; return ide_issue_pc(drive, &cmd); +out_end: + cdrom_end_request(drive, uptodate); + return ide_stopped; } /* From 984c5e5974227d2d4dba58cdf19af641f89be83f Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Tue, 31 Mar 2009 20:15:03 +0200 Subject: [PATCH 244/478] ide-cd: move setting REQ_FAILED flag out from 'end_request' exit path Move setting REQ_FAILED flag out from 'end_request' exit path in cdrom_newpc_intr() and also rename 'end_request' to 'out_end'. There should be no functional changes caused by this patch. Cc: Borislav Petkov Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/ide-cd.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c index 3e3058cff843..b66da3f1678e 100644 --- a/drivers/ide/ide-cd.c +++ b/drivers/ide/ide-cd.c @@ -776,7 +776,9 @@ static ide_startstop_t cdrom_newpc_intr(ide_drive_t *drive) ide_complete_rq(drive, 0, 512); return ide_stopped; } - goto end_request; + if (blk_pc_request(rq) == 0 && uptodate == 0) + rq->cmd_flags |= REQ_FAILED; + goto out_end; } ide_read_bcount_and_ireason(drive, &len, &ireason); @@ -811,8 +813,10 @@ static ide_startstop_t cdrom_newpc_intr(ide_drive_t *drive) ide_cd_request_sense_fixup(drive, rq); /* complain if we still have data left to transfer */ uptodate = rq->data_len ? 0 : 1; + if (uptodate == 0) + rq->cmd_flags |= REQ_FAILED; } - goto end_request; + goto out_end; } /* check which way to transfer data */ @@ -939,7 +943,7 @@ static ide_startstop_t cdrom_newpc_intr(ide_drive_t *drive) ide_set_handler(drive, cdrom_newpc_intr, timeout); return ide_started; -end_request: +out_end: if (blk_pc_request(rq)) { unsigned int dlen = rq->data_len; @@ -951,8 +955,6 @@ end_request: hwif->rq = NULL; } else { - if (!uptodate) - rq->cmd_flags |= REQ_FAILED; cdrom_end_request(drive, uptodate); } return ide_stopped; From 8a116974852a727bdfe6b1b897102903a17228a5 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Tue, 31 Mar 2009 20:15:03 +0200 Subject: [PATCH 245/478] ide-cd: unify cdrom_newpc_intr() exit paths * Move cdrom_end_request() calls from cdrom_decode_status() and ide_cd_check_ireason() to cdrom_newpc_intr(). * Unify cdrom_newpc_intr() exit paths. There should be no functional changes caused by this patch. Cc: Borislav Petkov Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/ide-cd.c | 48 +++++++++++++++++++++----------------------- 1 file changed, 23 insertions(+), 25 deletions(-) diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c index b66da3f1678e..4c32e8db55c3 100644 --- a/drivers/ide/ide-cd.c +++ b/drivers/ide/ide-cd.c @@ -296,7 +296,8 @@ static void cdrom_end_request(ide_drive_t *drive, int uptodate) /* * Returns: * 0: if the request should be continued. - * 1: if the request was ended. + * 1: if the request will be going through error recovery. + * 2: if the request should be ended. */ static int cdrom_decode_status(ide_drive_t *drive, int good_stat, int *stat_ret) { @@ -329,10 +330,7 @@ static int cdrom_decode_status(ide_drive_t *drive, int good_stat, int *stat_ret) * Just give up. */ rq->cmd_flags |= REQ_FAILED; - cdrom_end_request(drive, 0); - ide_error(drive, "request sense failure", stat); - return 1; - + return 2; } else if (blk_pc_request(rq) || rq->cmd_type == REQ_TYPE_ATA_PC) { /* All other functions, except for READ. */ @@ -472,14 +470,12 @@ static int cdrom_decode_status(ide_drive_t *drive, int good_stat, int *stat_ret) */ if (stat & ATA_ERR) cdrom_queue_request_sense(drive, NULL, NULL); + return 1; } else { blk_dump_rq_flags(rq, PFX "bad rq"); - cdrom_end_request(drive, 0); + return 2; } - /* retry, or handle the next request */ - return 1; - end_request: if (stat & ATA_ERR) { struct request_queue *q = drive->queue; @@ -492,10 +488,9 @@ end_request: hwif->rq = NULL; cdrom_queue_request_sense(drive, rq->sense, rq); + return 1; } else - cdrom_end_request(drive, 0); - - return 1; + return 2; } /* @@ -539,7 +534,6 @@ static int ide_cd_check_ireason(ide_drive_t *drive, struct request *rq, if (rq->cmd_type == REQ_TYPE_ATA_PC) rq->cmd_flags |= REQ_FAILED; - cdrom_end_request(drive, 0); return -1; } @@ -741,7 +735,8 @@ static ide_startstop_t cdrom_newpc_intr(ide_drive_t *drive) xfer_func_t *xferfunc; ide_expiry_t *expiry = NULL; int dma_error = 0, dma, stat, thislen, uptodate = 0; - int write = (rq_data_dir(rq) == WRITE) ? 1 : 0; + int write = (rq_data_dir(rq) == WRITE) ? 1 : 0, rc; + int sense = blk_sense_request(rq); unsigned int timeout; u16 len; u8 ireason; @@ -761,8 +756,12 @@ static ide_startstop_t cdrom_newpc_intr(ide_drive_t *drive) } } - if (cdrom_decode_status(drive, 0, &stat)) + rc = cdrom_decode_status(drive, 0, &stat); + if (rc) { + if (rc == 2) + goto out_end; return ide_stopped; + } /* using dma, transfer is complete now */ if (dma) { @@ -807,8 +806,6 @@ static ide_startstop_t cdrom_newpc_intr(ide_drive_t *drive) rq->cmd_flags |= REQ_FAILED; uptodate = 0; } - cdrom_end_request(drive, uptodate); - return ide_stopped; } else if (!blk_pc_request(rq)) { ide_cd_request_sense_fixup(drive, rq); /* complain if we still have data left to transfer */ @@ -820,17 +817,16 @@ static ide_startstop_t cdrom_newpc_intr(ide_drive_t *drive) } /* check which way to transfer data */ - if (ide_cd_check_ireason(drive, rq, len, ireason, write)) - return ide_stopped; + rc = ide_cd_check_ireason(drive, rq, len, ireason, write); + if (rc) + goto out_end; if (blk_fs_request(rq)) { if (write == 0) { int nskip; - if (ide_cd_check_transfer_size(drive, len)) { - cdrom_end_request(drive, 0); - return ide_stopped; - } + if (ide_cd_check_transfer_size(drive, len)) + goto out_end; /* * First, figure out if we need to bit-bucket @@ -923,7 +919,7 @@ static ide_startstop_t cdrom_newpc_intr(ide_drive_t *drive) else rq->data += blen; } - if (!write && blk_sense_request(rq)) + if (sense && write == 0) rq->sense_len += blen; } @@ -944,7 +940,7 @@ static ide_startstop_t cdrom_newpc_intr(ide_drive_t *drive) return ide_started; out_end: - if (blk_pc_request(rq)) { + if (blk_pc_request(rq) && rc == 0) { unsigned int dlen = rq->data_len; if (dma) @@ -956,6 +952,8 @@ out_end: hwif->rq = NULL; } else { cdrom_end_request(drive, uptodate); + if (sense && rc == 2) + ide_error(drive, "request sense failure", stat); } return ide_stopped; } From f63174e7a7ba3afa7f53e61c59b3f1ca5d88f3fb Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Tue, 31 Mar 2009 20:15:04 +0200 Subject: [PATCH 246/478] ide-cd: remove cdrom_end_request() Inline cdrom_end_request() into cdrom_newpc_intr() and ide_cd_do_request(). There should be no functional changes caused by this patch. Cc: Borislav Petkov Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/ide-cd.c | 81 +++++++++++++++++++++++++------------------- 1 file changed, 47 insertions(+), 34 deletions(-) diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c index 4c32e8db55c3..c859eafe759b 100644 --- a/drivers/ide/ide-cd.c +++ b/drivers/ide/ide-cd.c @@ -265,34 +265,6 @@ static void ide_cd_complete_failed_rq(ide_drive_t *drive, struct request *rq) cdrom_analyze_sense_data(drive, NULL, sense); } -static void cdrom_end_request(ide_drive_t *drive, int uptodate) -{ - struct request *rq = drive->hwif->rq; - int nsectors = rq->hard_cur_sectors; - - ide_debug_log(IDE_DBG_FUNC, "cmd: 0x%x, uptodate: 0x%x, nsectors: %d", - rq->cmd[0], uptodate, nsectors); - - if (blk_sense_request(rq) && uptodate) - ide_cd_complete_failed_rq(drive, rq); - - if (!rq->current_nr_sectors && blk_fs_request(rq)) - uptodate = 1; - /* make sure it's fully ended */ - if (blk_pc_request(rq)) - nsectors = (rq->data_len + 511) >> 9; - if (!nsectors) - nsectors = 1; - - ide_debug_log(IDE_DBG_FUNC, "uptodate: 0x%x, nsectors: %d", - uptodate, nsectors); - - if (blk_fs_request(rq) == 0 && uptodate <= 0 && rq->errors == 0) - rq->errors = -EIO; - - ide_complete_rq(drive, uptodate ? 0 : -EIO, nsectors << 9); -} - /* * Returns: * 0: if the request should be continued. @@ -735,7 +707,7 @@ static ide_startstop_t cdrom_newpc_intr(ide_drive_t *drive) xfer_func_t *xferfunc; ide_expiry_t *expiry = NULL; int dma_error = 0, dma, stat, thislen, uptodate = 0; - int write = (rq_data_dir(rq) == WRITE) ? 1 : 0, rc; + int write = (rq_data_dir(rq) == WRITE) ? 1 : 0, rc, nsectors; int sense = blk_sense_request(rq); unsigned int timeout; u16 len; @@ -902,8 +874,14 @@ static ide_startstop_t cdrom_newpc_intr(ide_drive_t *drive) rq->current_nr_sectors -= (blen >> 9); rq->sector += (blen >> 9); - if (rq->current_nr_sectors == 0 && rq->nr_sectors) - cdrom_end_request(drive, 1); + if (rq->current_nr_sectors == 0 && rq->nr_sectors) { + nsectors = rq->hard_cur_sectors; + + if (nsectors == 0) + nsectors = 1; + + ide_complete_rq(drive, 0, nsectors << 9); + } } else { rq->data_len -= blen; @@ -951,7 +929,28 @@ out_end: hwif->rq = NULL; } else { - cdrom_end_request(drive, uptodate); + if (sense && uptodate) + ide_cd_complete_failed_rq(drive, rq); + + if (blk_fs_request(rq)) { + if (rq->current_nr_sectors == 0) + uptodate = 1; + } else { + if (uptodate <= 0 && rq->errors == 0) + rq->errors = -EIO; + } + + /* make sure it's fully ended */ + if (blk_pc_request(rq)) + nsectors = (rq->data_len + 511) >> 9; + else + nsectors = rq->hard_cur_sectors; + + if (nsectors == 0) + nsectors = 1; + + ide_complete_rq(drive, uptodate ? 0 : -EIO, nsectors << 9); + if (sense && rc == 2) ide_error(drive, "request sense failure", stat); } @@ -1040,7 +1039,7 @@ static ide_startstop_t ide_cd_do_request(ide_drive_t *drive, struct request *rq, sector_t block) { struct ide_cmd cmd; - int uptodate = 0; + int uptodate = 0, nsectors; ide_debug_log(IDE_DBG_RQ, "cmd: 0x%x, block: %llu", rq->cmd[0], (unsigned long long)block); @@ -1076,7 +1075,21 @@ static ide_startstop_t ide_cd_do_request(ide_drive_t *drive, struct request *rq, return ide_issue_pc(drive, &cmd); out_end: - cdrom_end_request(drive, uptodate); + if (blk_fs_request(rq)) { + if (rq->current_nr_sectors == 0) + uptodate = 1; + } else { + if (uptodate <= 0 && rq->errors == 0) + rq->errors = -EIO; + } + + nsectors = rq->hard_cur_sectors; + + if (nsectors == 0) + nsectors = 1; + + ide_complete_rq(drive, uptodate ? 0 : -EIO, nsectors << 9); + return ide_stopped; } From c4c69e21b51005e24e2fc4efc8a73460a5ab7799 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Tue, 31 Mar 2009 20:15:04 +0200 Subject: [PATCH 247/478] ide-cd: kill whole failed request in ide_cd_do_request() Untangling cdrom_end_request() uncovered an error in completing failed requests in ide_cd_do_request(). Fix it. Cc: Borislav Petkov Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/ide-cd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c index c859eafe759b..978e1c0c1722 100644 --- a/drivers/ide/ide-cd.c +++ b/drivers/ide/ide-cd.c @@ -1083,7 +1083,7 @@ out_end: rq->errors = -EIO; } - nsectors = rq->hard_cur_sectors; + nsectors = rq->hard_nr_sectors; if (nsectors == 0) nsectors = 1; From 5ed57ad705d6b58386ac43d2ca1c8fc66aee1101 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Tue, 31 Mar 2009 20:15:05 +0200 Subject: [PATCH 248/478] ide-cd: cleanup ide_cd_do_request() There should be no functional changes caused by this patch. Cc: Borislav Petkov Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/ide-cd.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c index 978e1c0c1722..30113e69c8bb 100644 --- a/drivers/ide/ide-cd.c +++ b/drivers/ide/ide-cd.c @@ -1049,8 +1049,11 @@ static ide_startstop_t ide_cd_do_request(ide_drive_t *drive, struct request *rq, if (blk_fs_request(rq)) { if (cdrom_start_rw(drive, rq) == ide_stopped || - ide_cd_prepare_rw_request(drive, rq) == ide_stopped) + ide_cd_prepare_rw_request(drive, rq) == ide_stopped) { + if (rq->current_nr_sectors == 0) + uptodate = 1; goto out_end; + } } else if (blk_sense_request(rq) || blk_pc_request(rq) || rq->cmd_type == REQ_TYPE_ATA_PC) { if (!rq->timeout) @@ -1063,6 +1066,8 @@ static ide_startstop_t ide_cd_do_request(ide_drive_t *drive, struct request *rq, goto out_end; } else { blk_dump_rq_flags(rq, DRV_NAME " bad flags"); + if (rq->errors == 0) + rq->errors = -EIO; goto out_end; } @@ -1075,14 +1080,6 @@ static ide_startstop_t ide_cd_do_request(ide_drive_t *drive, struct request *rq, return ide_issue_pc(drive, &cmd); out_end: - if (blk_fs_request(rq)) { - if (rq->current_nr_sectors == 0) - uptodate = 1; - } else { - if (uptodate <= 0 && rq->errors == 0) - rq->errors = -EIO; - } - nsectors = rq->hard_nr_sectors; if (nsectors == 0) From a08915ba594da66145f33a972db578a58b9135f1 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Tue, 31 Mar 2009 20:15:13 +0200 Subject: [PATCH 249/478] ide-cd: use scatterlists for PIO transfers (fs requests) * Export ide_pio_bytes(). * Add ->last_xfer_len field to struct ide_cmd. * Add ide_cd_error_cmd() helper to ide-cd. * Convert ide-cd to use scatterlists also for PIO transfers (fs requests only for now) and get rid of partial completions (except when the error happens -- which is still subject to change later because looking at ATAPI spec it seems that the device is free to error the whole transfer with setting the Error bit only on the last transfer chunk). * Update ide_cd_{prepare_rw,restore_request,do_request}() accordingly. * Inline ide_cd_restore_request() into cdrom_start_rw(). Cc: Borislav Petkov Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/ide-cd.c | 146 +++++++++++-------------------------- drivers/ide/ide-taskfile.c | 5 +- include/linux/ide.h | 4 + 3 files changed, 50 insertions(+), 105 deletions(-) diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c index 30113e69c8bb..5f15859c2c73 100644 --- a/drivers/ide/ide-cd.c +++ b/drivers/ide/ide-cd.c @@ -539,64 +539,12 @@ static ide_startstop_t ide_cd_prepare_rw_request(ide_drive_t *drive, { ide_debug_log(IDE_DBG_RQ, "rq->cmd_flags: 0x%x", rq->cmd_flags); - if (rq_data_dir(rq) == READ) { - unsigned short sectors_per_frame = - queue_hardsect_size(drive->queue) >> SECTOR_BITS; - int nskip = rq->sector & (sectors_per_frame - 1); - - /* - * If the requested sector doesn't start on a frame boundary, - * we must adjust the start of the transfer so that it does, - * and remember to skip the first few sectors. - * - * If the rq->current_nr_sectors field is larger than the size - * of the buffer, it will mean that we're to skip a number of - * sectors equal to the amount by which rq->current_nr_sectors - * is larger than the buffer size. - */ - if (nskip > 0) { - /* sanity check... */ - if (rq->current_nr_sectors != - bio_cur_sectors(rq->bio)) { - printk(KERN_ERR PFX "%s: %s: buffer botch (%u)\n", - drive->name, __func__, - rq->current_nr_sectors); - return ide_stopped; - } - rq->current_nr_sectors += nskip; - } - } - /* set up the command */ rq->timeout = ATAPI_WAIT_PC; return ide_started; } -/* - * Fix up a possibly partially-processed request so that we can start it over - * entirely, or even put it back on the request queue. - */ -static void ide_cd_restore_request(ide_drive_t *drive, struct request *rq) -{ - - ide_debug_log(IDE_DBG_FUNC, "enter"); - - if (rq->buffer != bio_data(rq->bio)) { - sector_t n = - (rq->buffer - (char *)bio_data(rq->bio)) / SECTOR_SIZE; - - rq->buffer = bio_data(rq->bio); - rq->nr_sectors += n; - rq->sector -= n; - } - rq->current_nr_sectors = bio_cur_sectors(rq->bio); - rq->hard_cur_sectors = rq->current_nr_sectors; - rq->hard_nr_sectors = rq->nr_sectors; - rq->hard_sector = rq->sector; - rq->q->prep_rq_fn(rq->q, rq); -} - static void ide_cd_request_sense_fixup(ide_drive_t *drive, struct request *rq) { ide_debug_log(IDE_DBG_FUNC, "rq->cmd[0]: 0x%x", rq->cmd[0]); @@ -690,6 +638,17 @@ int ide_cd_queue_pc(ide_drive_t *drive, const unsigned char *cmd, return (flags & REQ_FAILED) ? -EIO : 0; } +static void ide_cd_error_cmd(ide_drive_t *drive, struct ide_cmd *cmd) +{ + unsigned int nr_bytes = cmd->nbytes - cmd->nleft; + + if (cmd->tf_flags & IDE_TFLAG_WRITE) + nr_bytes -= cmd->last_xfer_len; + + if (nr_bytes > 0) + ide_complete_rq(drive, 0, nr_bytes); +} + /* * Called from blk_end_request_callback() after the data of the request is * completed and before the request itself is completed. By returning value '1', @@ -703,6 +662,7 @@ static int cdrom_newpc_intr_dummy_cb(struct request *rq) static ide_startstop_t cdrom_newpc_intr(ide_drive_t *drive) { ide_hwif_t *hwif = drive->hwif; + struct ide_cmd *cmd = &hwif->cmd; struct request *rq = hwif->rq; xfer_func_t *xferfunc; ide_expiry_t *expiry = NULL; @@ -769,11 +729,10 @@ static ide_startstop_t cdrom_newpc_intr(ide_drive_t *drive) * Otherwise, complete the command normally. */ uptodate = 1; - if (rq->current_nr_sectors > 0) { + if (cmd->nleft > 0) { printk(KERN_ERR PFX "%s: %s: data underrun " - "(%d blocks)\n", - drive->name, __func__, - rq->current_nr_sectors); + "(%u bytes)\n", drive->name, __func__, + cmd->nleft); if (!write) rq->cmd_flags |= REQ_FAILED; uptodate = 0; @@ -795,24 +754,10 @@ static ide_startstop_t cdrom_newpc_intr(ide_drive_t *drive) if (blk_fs_request(rq)) { if (write == 0) { - int nskip; - if (ide_cd_check_transfer_size(drive, len)) goto out_end; - - /* - * First, figure out if we need to bit-bucket - * any of the leading sectors. - */ - nskip = min_t(int, rq->current_nr_sectors - - bio_cur_sectors(rq->bio), - thislen >> 9); - if (nskip > 0) { - ide_pad_transfer(drive, write, nskip << 9); - rq->current_nr_sectors -= nskip; - thislen -= (nskip << 9); - } } + cmd->last_xfer_len = 0; } if (ireason == 0) { @@ -835,15 +780,15 @@ static ide_startstop_t cdrom_newpc_intr(ide_drive_t *drive) /* bio backed? */ if (rq->bio) { if (blk_fs_request(rq)) { - ptr = rq->buffer; - blen = rq->current_nr_sectors << 9; + blen = min_t(int, thislen, cmd->nleft); } else { ptr = bio_data(rq->bio); blen = bio_iovec(rq->bio)->bv_len; } } - if (!ptr) { + if ((blk_fs_request(rq) && cmd->nleft == 0) || + (blk_fs_request(rq) == 0 && ptr == NULL)) { if (blk_fs_request(rq) && !write) /* * If the buffers are full, pipe the rest into @@ -863,26 +808,16 @@ static ide_startstop_t cdrom_newpc_intr(ide_drive_t *drive) if (blen > thislen) blen = thislen; - xferfunc(drive, NULL, ptr, blen); + if (blk_fs_request(rq)) { + ide_pio_bytes(drive, cmd, write, blen); + cmd->last_xfer_len += blen; + } else + xferfunc(drive, NULL, ptr, blen); thislen -= blen; len -= blen; - if (blk_fs_request(rq)) { - rq->buffer += blen; - rq->nr_sectors -= (blen >> 9); - rq->current_nr_sectors -= (blen >> 9); - rq->sector += (blen >> 9); - - if (rq->current_nr_sectors == 0 && rq->nr_sectors) { - nsectors = rq->hard_cur_sectors; - - if (nsectors == 0) - nsectors = 1; - - ide_complete_rq(drive, 0, nsectors << 9); - } - } else { + if (blk_fs_request(rq) == 0) { rq->data_len -= blen; /* @@ -933,8 +868,10 @@ out_end: ide_cd_complete_failed_rq(drive, rq); if (blk_fs_request(rq)) { - if (rq->current_nr_sectors == 0) + if (cmd->nleft == 0) uptodate = 1; + if (uptodate == 0) + ide_cd_error_cmd(drive, cmd); } else { if (uptodate <= 0 && rq->errors == 0) rq->errors = -EIO; @@ -944,7 +881,7 @@ out_end: if (blk_pc_request(rq)) nsectors = (rq->data_len + 511) >> 9; else - nsectors = rq->hard_cur_sectors; + nsectors = rq->hard_nr_sectors; if (nsectors == 0) nsectors = 1; @@ -960,9 +897,10 @@ out_end: static ide_startstop_t cdrom_start_rw(ide_drive_t *drive, struct request *rq) { struct cdrom_info *cd = drive->driver_data; + struct request_queue *q = drive->queue; int write = rq_data_dir(rq) == WRITE; unsigned short sectors_per_frame = - queue_hardsect_size(drive->queue) >> SECTOR_BITS; + queue_hardsect_size(q) >> SECTOR_BITS; ide_debug_log(IDE_DBG_RQ, "rq->cmd[0]: 0x%x, write: 0x%x, " "secs_per_frame: %u", @@ -977,17 +915,16 @@ static ide_startstop_t cdrom_start_rw(ide_drive_t *drive, struct request *rq) * We may be retrying this request after an error. Fix up any * weirdness which might be present in the request packet. */ - ide_cd_restore_request(drive, rq); + q->prep_rq_fn(q, rq); } - /* use DMA, if possible / writes *must* be hardware frame aligned */ + /* fs requests *must* be hardware frame aligned */ if ((rq->nr_sectors & (sectors_per_frame - 1)) || - (rq->sector & (sectors_per_frame - 1))) { - if (write) - return ide_stopped; - drive->dma = 0; - } else - drive->dma = !!(drive->dev_flags & IDE_DFLAG_USING_DMA); + (rq->sector & (sectors_per_frame - 1))) + return ide_stopped; + + /* use DMA, if possible */ + drive->dma = !!(drive->dev_flags & IDE_DFLAG_USING_DMA); if (write) cd->devinfo.media_written = 1; @@ -1050,8 +987,6 @@ static ide_startstop_t ide_cd_do_request(ide_drive_t *drive, struct request *rq, if (blk_fs_request(rq)) { if (cdrom_start_rw(drive, rq) == ide_stopped || ide_cd_prepare_rw_request(drive, rq) == ide_stopped) { - if (rq->current_nr_sectors == 0) - uptodate = 1; goto out_end; } } else if (blk_sense_request(rq) || blk_pc_request(rq) || @@ -1078,6 +1013,11 @@ static ide_startstop_t ide_cd_do_request(ide_drive_t *drive, struct request *rq, cmd.rq = rq; + if (blk_fs_request(rq)) { + ide_init_sg_cmd(&cmd, rq->nr_sectors << 9); + ide_map_sg(drive, &cmd); + } + return ide_issue_pc(drive, &cmd); out_end: nsectors = rq->hard_nr_sectors; diff --git a/drivers/ide/ide-taskfile.c b/drivers/ide/ide-taskfile.c index 0e333ecf2ad6..a3b7a50562b2 100644 --- a/drivers/ide/ide-taskfile.c +++ b/drivers/ide/ide-taskfile.c @@ -188,8 +188,8 @@ static u8 wait_drive_not_busy(ide_drive_t *drive) return stat; } -static void ide_pio_bytes(ide_drive_t *drive, struct ide_cmd *cmd, - unsigned int write, unsigned int len) +void ide_pio_bytes(ide_drive_t *drive, struct ide_cmd *cmd, + unsigned int write, unsigned int len) { ide_hwif_t *hwif = drive->hwif; struct scatterlist *sg = hwif->sg_table; @@ -243,6 +243,7 @@ static void ide_pio_bytes(ide_drive_t *drive, struct ide_cmd *cmd, len -= nr_bytes; } } +EXPORT_SYMBOL_GPL(ide_pio_bytes); static void ide_pio_datablock(ide_drive_t *drive, struct ide_cmd *cmd, unsigned int write) diff --git a/include/linux/ide.h b/include/linux/ide.h index d5d832271f44..c2841c0c36c8 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -352,6 +352,8 @@ struct ide_cmd { unsigned int nbytes; unsigned int nleft; + unsigned int last_xfer_len; + struct scatterlist *cursg; unsigned int cursg_ofs; @@ -1226,6 +1228,8 @@ ide_startstop_t ide_issue_pc(ide_drive_t *, struct ide_cmd *); ide_startstop_t do_rw_taskfile(ide_drive_t *, struct ide_cmd *); +void ide_pio_bytes(ide_drive_t *, struct ide_cmd *, unsigned int, unsigned int); + void ide_finish_cmd(ide_drive_t *, struct ide_cmd *, u8); int ide_raw_taskfile(ide_drive_t *, struct ide_cmd *, u8 *, u16); From 06a449e30135aabb6686c95bf0c42b46d169a3b3 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Tue, 31 Mar 2009 20:15:13 +0200 Subject: [PATCH 250/478] ide-cd: fix non-SECTOR_SIZE-multiples PIO transfers for fs requests We now support arbitrary number of bytes per-IRQ also for fs requests so remove ide_cd_check_transfer_size() and IDE_AFLAG_LIMIT_NFRAMES. Cc: Borislav Petkov Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/ide-cd.c | 36 +----------------------------------- include/linux/ide.h | 5 ----- 2 files changed, 1 insertion(+), 40 deletions(-) diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c index 5f15859c2c73..c0cefe5becf3 100644 --- a/drivers/ide/ide-cd.c +++ b/drivers/ide/ide-cd.c @@ -509,31 +509,6 @@ static int ide_cd_check_ireason(ide_drive_t *drive, struct request *rq, return -1; } -/* - * Assume that the drive will always provide data in multiples of at least - * SECTOR_SIZE, as it gets hairy to keep track of the transfers otherwise. - */ -static int ide_cd_check_transfer_size(ide_drive_t *drive, int len) -{ - ide_debug_log(IDE_DBG_FUNC, "len: %d", len); - - if ((len % SECTOR_SIZE) == 0) - return 0; - - printk(KERN_ERR PFX "%s: %s: Bad transfer size %d\n", drive->name, - __func__, len); - - if (drive->atapi_flags & IDE_AFLAG_LIMIT_NFRAMES) - printk(KERN_ERR PFX "This drive is not supported by this " - "version of the driver\n"); - else { - printk(KERN_ERR PFX "Trying to limit transfer sizes\n"); - drive->atapi_flags |= IDE_AFLAG_LIMIT_NFRAMES; - } - - return 1; -} - static ide_startstop_t ide_cd_prepare_rw_request(ide_drive_t *drive, struct request *rq) { @@ -752,13 +727,7 @@ static ide_startstop_t cdrom_newpc_intr(ide_drive_t *drive) if (rc) goto out_end; - if (blk_fs_request(rq)) { - if (write == 0) { - if (ide_cd_check_transfer_size(drive, len)) - goto out_end; - } - cmd->last_xfer_len = 0; - } + cmd->last_xfer_len = 0; if (ireason == 0) { write = 1; @@ -1619,9 +1588,6 @@ static const struct ide_proc_devset *ide_cd_proc_devsets(ide_drive_t *drive) #endif static const struct cd_list_entry ide_cd_quirks_list[] = { - /* Limit transfer size per interrupt. */ - { "SAMSUNG CD-ROM SCR-2430", NULL, IDE_AFLAG_LIMIT_NFRAMES }, - { "SAMSUNG CD-ROM SCR-2432", NULL, IDE_AFLAG_LIMIT_NFRAMES }, /* SCR-3231 doesn't support the SET_CD_SPEED command. */ { "SAMSUNG CD-ROM SCR-3231", NULL, IDE_AFLAG_NO_SPEED_SELECT }, /* Old NEC260 (not R) was released before ATAPI 1.2 spec. */ diff --git a/include/linux/ide.h b/include/linux/ide.h index c2841c0c36c8..cb501bf78f7d 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -458,11 +458,6 @@ enum { IDE_AFLAG_TOCADDR_AS_BCD = (1 << 3), /* TOC track numbers are in BCD. */ IDE_AFLAG_TOCTRACKS_AS_BCD = (1 << 4), - /* - * Drive does not provide data in multiples of SECTOR_SIZE - * when more than one interrupt is needed. - */ - IDE_AFLAG_LIMIT_NFRAMES = (1 << 5), /* Saved TOC information is current. */ IDE_AFLAG_TOC_VALID = (1 << 6), /* We think that the drive door is locked. */ From 8652b31ab211b6fe2a4994cc47b61d7038c3489c Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Tue, 31 Mar 2009 20:15:14 +0200 Subject: [PATCH 251/478] ide-cd: merge ide_cd_prepare_rw_request() into cdrom_start_rw() There should be no functional changes caused by this patch. Cc: Borislav Petkov Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/ide-cd.c | 21 +++++---------------- 1 file changed, 5 insertions(+), 16 deletions(-) diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c index c0cefe5becf3..cf0707fe87e8 100644 --- a/drivers/ide/ide-cd.c +++ b/drivers/ide/ide-cd.c @@ -509,17 +509,6 @@ static int ide_cd_check_ireason(ide_drive_t *drive, struct request *rq, return -1; } -static ide_startstop_t ide_cd_prepare_rw_request(ide_drive_t *drive, - struct request *rq) -{ - ide_debug_log(IDE_DBG_RQ, "rq->cmd_flags: 0x%x", rq->cmd_flags); - - /* set up the command */ - rq->timeout = ATAPI_WAIT_PC; - - return ide_started; -} - static void ide_cd_request_sense_fixup(ide_drive_t *drive, struct request *rq) { ide_debug_log(IDE_DBG_FUNC, "rq->cmd[0]: 0x%x", rq->cmd[0]); @@ -871,9 +860,9 @@ static ide_startstop_t cdrom_start_rw(ide_drive_t *drive, struct request *rq) unsigned short sectors_per_frame = queue_hardsect_size(q) >> SECTOR_BITS; - ide_debug_log(IDE_DBG_RQ, "rq->cmd[0]: 0x%x, write: 0x%x, " + ide_debug_log(IDE_DBG_RQ, "rq->cmd[0]: 0x%x, rq->cmd_flags: 0x%x, " "secs_per_frame: %u", - rq->cmd[0], write, sectors_per_frame); + rq->cmd[0], rq->cmd_flags, sectors_per_frame); if (write) { /* disk has become write protected */ @@ -898,6 +887,8 @@ static ide_startstop_t cdrom_start_rw(ide_drive_t *drive, struct request *rq) if (write) cd->devinfo.media_written = 1; + rq->timeout = ATAPI_WAIT_PC; + return ide_started; } @@ -954,10 +945,8 @@ static ide_startstop_t ide_cd_do_request(ide_drive_t *drive, struct request *rq, blk_dump_rq_flags(rq, "ide_cd_do_request"); if (blk_fs_request(rq)) { - if (cdrom_start_rw(drive, rq) == ide_stopped || - ide_cd_prepare_rw_request(drive, rq) == ide_stopped) { + if (cdrom_start_rw(drive, rq) == ide_stopped) goto out_end; - } } else if (blk_sense_request(rq) || blk_pc_request(rq) || rq->cmd_type == REQ_TYPE_ATA_PC) { if (!rq->timeout) From c7ec89994fec4353d5b4251213bdfa7b1a68c26b Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Tue, 31 Mar 2009 20:15:15 +0200 Subject: [PATCH 252/478] ide-cd: use scatterlists for PIO transfers (non-fs requests) (v2) Convert ide-cd to use scatterlists for PIO transfers and get rid of partial completions (except on error) also for non-fs requests. v2: Do not map dataless commands to an sg since it oopses on the virt_to_page() translation check when DEBUG_VIRTUAL is enabled. (from Borislav Petkov, reported/bisected-by Tetsuo Handa). Cc: Tetsuo Handa Acked-by: Borislav Petkov Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/ide-cd.c | 100 +++++++++++++------------------------------ 1 file changed, 30 insertions(+), 70 deletions(-) diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c index cf0707fe87e8..7fdfb55011a6 100644 --- a/drivers/ide/ide-cd.c +++ b/drivers/ide/ide-cd.c @@ -509,8 +509,10 @@ static int ide_cd_check_ireason(ide_drive_t *drive, struct request *rq, return -1; } -static void ide_cd_request_sense_fixup(ide_drive_t *drive, struct request *rq) +static void ide_cd_request_sense_fixup(ide_drive_t *drive, struct ide_cmd *cmd) { + struct request *rq = cmd->rq; + ide_debug_log(IDE_DBG_FUNC, "rq->cmd[0]: 0x%x", rq->cmd[0]); /* @@ -518,11 +520,14 @@ static void ide_cd_request_sense_fixup(ide_drive_t *drive, struct request *rq) * and some drives don't send them. Sigh. */ if (rq->cmd[0] == GPCMD_REQUEST_SENSE && - rq->data_len > 0 && rq->data_len <= 5) - while (rq->data_len > 0) { - *(u8 *)rq->data++ = 0; - --rq->data_len; + cmd->nleft > 0 && cmd->nleft <= 5) { + unsigned int ofs = cmd->nbytes - cmd->nleft; + + while (cmd->nleft > 0) { + *((u8 *)rq->data + ofs++) = 0; + cmd->nleft--; } + } } int ide_cd_queue_pc(ide_drive_t *drive, const unsigned char *cmd, @@ -613,22 +618,11 @@ static void ide_cd_error_cmd(ide_drive_t *drive, struct ide_cmd *cmd) ide_complete_rq(drive, 0, nr_bytes); } -/* - * Called from blk_end_request_callback() after the data of the request is - * completed and before the request itself is completed. By returning value '1', - * blk_end_request_callback() returns immediately without completing it. - */ -static int cdrom_newpc_intr_dummy_cb(struct request *rq) -{ - return 1; -} - static ide_startstop_t cdrom_newpc_intr(ide_drive_t *drive) { ide_hwif_t *hwif = drive->hwif; struct ide_cmd *cmd = &hwif->cmd; struct request *rq = hwif->rq; - xfer_func_t *xferfunc; ide_expiry_t *expiry = NULL; int dma_error = 0, dma, stat, thislen, uptodate = 0; int write = (rq_data_dir(rq) == WRITE) ? 1 : 0, rc, nsectors; @@ -678,7 +672,7 @@ static ide_startstop_t cdrom_newpc_intr(ide_drive_t *drive) ide_read_bcount_and_ireason(drive, &len, &ireason); - thislen = blk_fs_request(rq) ? len : rq->data_len; + thislen = blk_fs_request(rq) ? len : cmd->nleft; if (thislen > len) thislen = len; @@ -702,9 +696,9 @@ static ide_startstop_t cdrom_newpc_intr(ide_drive_t *drive) uptodate = 0; } } else if (!blk_pc_request(rq)) { - ide_cd_request_sense_fixup(drive, rq); + ide_cd_request_sense_fixup(drive, cmd); /* complain if we still have data left to transfer */ - uptodate = rq->data_len ? 0 : 1; + uptodate = cmd->nleft ? 0 : 1; if (uptodate == 0) rq->cmd_flags |= REQ_FAILED; } @@ -718,35 +712,15 @@ static ide_startstop_t cdrom_newpc_intr(ide_drive_t *drive) cmd->last_xfer_len = 0; - if (ireason == 0) { - write = 1; - xferfunc = hwif->tp_ops->output_data; - } else { - write = 0; - xferfunc = hwif->tp_ops->input_data; - } - ide_debug_log(IDE_DBG_PC, "data transfer, rq->cmd_type: 0x%x, " "ireason: 0x%x", rq->cmd_type, ireason); /* transfer data */ while (thislen > 0) { - u8 *ptr = blk_fs_request(rq) ? NULL : rq->data; - int blen = rq->data_len; + int blen = min_t(int, thislen, cmd->nleft); - /* bio backed? */ - if (rq->bio) { - if (blk_fs_request(rq)) { - blen = min_t(int, thislen, cmd->nleft); - } else { - ptr = bio_data(rq->bio); - blen = bio_iovec(rq->bio)->bv_len; - } - } - - if ((blk_fs_request(rq) && cmd->nleft == 0) || - (blk_fs_request(rq) == 0 && ptr == NULL)) { + if (cmd->nleft == 0) { if (blk_fs_request(rq) && !write) /* * If the buffers are full, pipe the rest into @@ -763,33 +737,12 @@ static ide_startstop_t cdrom_newpc_intr(ide_drive_t *drive) break; } - if (blen > thislen) - blen = thislen; - - if (blk_fs_request(rq)) { - ide_pio_bytes(drive, cmd, write, blen); - cmd->last_xfer_len += blen; - } else - xferfunc(drive, NULL, ptr, blen); + ide_pio_bytes(drive, cmd, write, blen); + cmd->last_xfer_len += blen; thislen -= blen; len -= blen; - if (blk_fs_request(rq) == 0) { - rq->data_len -= blen; - - /* - * The request can't be completed until DRQ is cleared. - * So complete the data, but don't complete the request - * using the dummy function for the callback feature - * of blk_end_request_callback(). - */ - if (rq->bio) - blk_end_request_callback(rq, 0, blen, - cdrom_newpc_intr_dummy_cb); - else - rq->data += blen; - } if (sense && write == 0) rq->sense_len += blen; } @@ -814,8 +767,7 @@ out_end: if (blk_pc_request(rq) && rc == 0) { unsigned int dlen = rq->data_len; - if (dma) - rq->data_len = 0; + rq->data_len = 0; if (blk_end_request(rq, 0, dlen)) BUG(); @@ -828,13 +780,14 @@ out_end: if (blk_fs_request(rq)) { if (cmd->nleft == 0) uptodate = 1; - if (uptodate == 0) - ide_cd_error_cmd(drive, cmd); } else { if (uptodate <= 0 && rq->errors == 0) rq->errors = -EIO; } + if (uptodate == 0) + ide_cd_error_cmd(drive, cmd); + /* make sure it's fully ended */ if (blk_pc_request(rq)) nsectors = (rq->data_len + 511) >> 9; @@ -844,6 +797,12 @@ out_end: if (nsectors == 0) nsectors = 1; + if (blk_fs_request(rq) == 0) { + rq->data_len -= (cmd->nbytes - cmd->nleft); + if (uptodate == 0 && (cmd->tf_flags & IDE_TFLAG_WRITE)) + rq->data_len += cmd->last_xfer_len; + } + ide_complete_rq(drive, uptodate ? 0 : -EIO, nsectors << 9); if (sense && rc == 2) @@ -971,8 +930,9 @@ static ide_startstop_t ide_cd_do_request(ide_drive_t *drive, struct request *rq, cmd.rq = rq; - if (blk_fs_request(rq)) { - ide_init_sg_cmd(&cmd, rq->nr_sectors << 9); + if (blk_fs_request(rq) || rq->data_len) { + ide_init_sg_cmd(&cmd, blk_fs_request(rq) ? (rq->nr_sectors << 9) + : rq->data_len); ide_map_sg(drive, &cmd); } From 4a3d8cf48c7baf3439aed06c847cd4562adfc468 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Tue, 31 Mar 2009 20:15:15 +0200 Subject: [PATCH 253/478] ide-cd: use common completion path for DMA requests in cdrom_newpc_intr() Use the following facts: - rq->nr_sectors should now be always equal to (non-zero) rq->hard_nr_sectors for fs requests - REQ_TYPE_ATA_PC requests have never bio attached to them - rq->hard_nr_sectors == 0 for REQ_TYPE_ATA_PC requests - DMA is used only for fs, pc and REQ_TYPE_ATA_PC requests - 'uptodate' is ignored for pc requests ('rc == 0' case) and use the common completion path also for DMA requests. There should be no functional changes caused by this patch Cc: Borislav Petkov Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/ide-cd.c | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c index 7fdfb55011a6..11d8d5b870b7 100644 --- a/drivers/ide/ide-cd.c +++ b/drivers/ide/ide-cd.c @@ -657,16 +657,7 @@ static ide_startstop_t cdrom_newpc_intr(ide_drive_t *drive) if (dma) { if (dma_error) return ide_error(drive, "dma error", stat); - if (blk_fs_request(rq)) { - ide_complete_rq(drive, 0, rq->nr_sectors - ? (rq->nr_sectors << 9) : ide_rq_bytes(rq)); - return ide_stopped; - } else if (rq->cmd_type == REQ_TYPE_ATA_PC && !rq->bio) { - ide_complete_rq(drive, 0, 512); - return ide_stopped; - } - if (blk_pc_request(rq) == 0 && uptodate == 0) - rq->cmd_flags |= REQ_FAILED; + uptodate = 1; goto out_end; } From 14fa91ccbafa02a71cfb53f9c830b8c0c65119d0 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Tue, 31 Mar 2009 20:15:16 +0200 Subject: [PATCH 254/478] ide-cd: unify transfer padding in cdrom_newpc_intr() * 'thislen' is always <= cmd->nleft for non-fs requests so the transfer padding inside the 'while (thislen > 0)' loop can happen only for fs requests -- then move it out of the loop and unify with the transfer padding for non-fs requests ('thislen' == 'len' for fs requests). * blk_dump_rq_flags() dumps all request flags so it is enough to pass only the function name to it. * Update my Copyrights while at it. Cc: Borislav Petkov Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/ide-cd.c | 29 +++++++++++------------------ 1 file changed, 11 insertions(+), 18 deletions(-) diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c index 11d8d5b870b7..0201201c6e48 100644 --- a/drivers/ide/ide-cd.c +++ b/drivers/ide/ide-cd.c @@ -4,7 +4,7 @@ * Copyright (C) 1994-1996 Scott Snyder * Copyright (C) 1996-1998 Erik Andersen * Copyright (C) 1998-2000 Jens Axboe - * Copyright (C) 2005, 2007 Bartlomiej Zolnierkiewicz + * Copyright (C) 2005, 2007-2009 Bartlomiej Zolnierkiewicz * * May be copied or modified under the terms of the GNU General Public * License. See linux/COPYING for more information. @@ -711,22 +711,8 @@ static ide_startstop_t cdrom_newpc_intr(ide_drive_t *drive) while (thislen > 0) { int blen = min_t(int, thislen, cmd->nleft); - if (cmd->nleft == 0) { - if (blk_fs_request(rq) && !write) - /* - * If the buffers are full, pipe the rest into - * oblivion. - */ - ide_pad_transfer(drive, 0, thislen); - else { - printk(KERN_ERR PFX "%s: confused, missing data\n", - drive->name); - blk_dump_rq_flags(rq, rq_data_dir(rq) - ? "cdrom_newpc_intr, write" - : "cdrom_newpc_intr, read"); - } + if (cmd->nleft == 0) break; - } ide_pio_bytes(drive, cmd, write, blen); cmd->last_xfer_len += blen; @@ -739,8 +725,15 @@ static ide_startstop_t cdrom_newpc_intr(ide_drive_t *drive) } /* pad, if necessary */ - if (!blk_fs_request(rq) && len > 0) - ide_pad_transfer(drive, write, len); + if (len > 0) { + if (blk_fs_request(rq) == 0 || write == 0) + ide_pad_transfer(drive, write, len); + else { + printk(KERN_ERR PFX "%s: confused, missing data\n", + drive->name); + blk_dump_rq_flags(rq, "cdrom_newpc_intr"); + } + } if (blk_pc_request(rq)) { timeout = rq->timeout; From e698ea83a8531a6740dc657329dcf0728392d6ac Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Tue, 31 Mar 2009 20:15:16 +0200 Subject: [PATCH 255/478] ide-cd: minor ide_cdrom_setup() cleanup Cache drive->queue in local variable and use max(). There should be no functional changes caused by this patch. Acked-by: Sergei Shtylyov Acked-by: Borislav Petkov Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/ide-cd.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c index 0201201c6e48..5319e7a73708 100644 --- a/drivers/ide/ide-cd.c +++ b/drivers/ide/ide-cd.c @@ -1581,18 +1581,18 @@ static int ide_cdrom_setup(ide_drive_t *drive) { struct cdrom_info *cd = drive->driver_data; struct cdrom_device_info *cdi = &cd->devinfo; + struct request_queue *q = drive->queue; u16 *id = drive->id; char *fw_rev = (char *)&id[ATA_ID_FW_REV]; int nslots; ide_debug_log(IDE_DBG_PROBE, "enter"); - blk_queue_prep_rq(drive->queue, ide_cdrom_prep_fn); - blk_queue_dma_alignment(drive->queue, 31); - blk_queue_update_dma_pad(drive->queue, 15); - drive->queue->unplug_delay = (1 * HZ) / 1000; - if (!drive->queue->unplug_delay) - drive->queue->unplug_delay = 1; + blk_queue_prep_rq(q, ide_cdrom_prep_fn); + blk_queue_dma_alignment(q, 31); + blk_queue_update_dma_pad(q, 15); + + q->unplug_delay = max((1 * HZ) / 1000, 1); drive->dev_flags |= IDE_DFLAG_MEDIA_CHANGED; drive->atapi_flags = IDE_AFLAG_NO_EJECT | ide_cd_flags(id); @@ -1610,8 +1610,7 @@ static int ide_cdrom_setup(ide_drive_t *drive) nslots = ide_cdrom_probe_capabilities(drive); - /* set correct block size */ - blk_queue_hardsect_size(drive->queue, CD_FRAMESIZE); + blk_queue_hardsect_size(q, CD_FRAMESIZE); if (ide_cdrom_register(drive, nslots)) { printk(KERN_ERR PFX "%s: %s failed to register device with the" From 35c9b4daf4c94b30e5cede597d98016ebf31b5ad Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Tue, 31 Mar 2009 20:15:19 +0200 Subject: [PATCH 256/478] ide: add ->dma_clear method and remove ->dma_timeout one All custom ->dma_timeout implementations call the generic one thus it is possible to have only an optional method for resetting DMA engine instead: * Add ->dma_clear method and convert hpt366, pdc202xx_old and sl82c105 host drivers to use it. * Always use ide_dma_timeout() in ide_dma_timeout_retry() and remove ->dma_timeout method. * Make ide_dma_timeout() static. There should be no functional changes caused by this patch. Acked-by: Sergei Shtylyov Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/alim15x3.c | 1 - drivers/ide/au1xxx-ide.c | 1 - drivers/ide/cmd64x.c | 3 --- drivers/ide/cs5536.c | 1 - drivers/ide/hpt366.c | 10 +--------- drivers/ide/icside.c | 1 - drivers/ide/ide-dma-sff.c | 3 +-- drivers/ide/ide-dma.c | 10 ++++++---- drivers/ide/it821x.c | 3 +-- drivers/ide/ns87415.c | 1 - drivers/ide/pdc202xx_old.c | 10 ++-------- drivers/ide/pmac.c | 1 - drivers/ide/sc1200.c | 1 - drivers/ide/scc_pata.c | 1 - drivers/ide/sgiioc4.c | 1 - drivers/ide/siimage.c | 1 - drivers/ide/sl82c105.c | 7 +++---- drivers/ide/tc86c001.c | 1 - drivers/ide/trm290.c | 1 - drivers/ide/tx4939ide.c | 1 - include/linux/ide.h | 4 ++-- 21 files changed, 16 insertions(+), 47 deletions(-) diff --git a/drivers/ide/alim15x3.c b/drivers/ide/alim15x3.c index d516168464fc..d3faf0b97f42 100644 --- a/drivers/ide/alim15x3.c +++ b/drivers/ide/alim15x3.c @@ -509,7 +509,6 @@ static const struct ide_dma_ops ali_dma_ops = { .dma_test_irq = ide_dma_test_irq, .dma_lost_irq = ide_dma_lost_irq, .dma_timer_expiry = ide_dma_sff_timer_expiry, - .dma_timeout = ide_dma_timeout, .dma_sff_read_status = ide_dma_sff_read_status, }; diff --git a/drivers/ide/au1xxx-ide.c b/drivers/ide/au1xxx-ide.c index d3a9d6c15328..0c08c5e01f2a 100644 --- a/drivers/ide/au1xxx-ide.c +++ b/drivers/ide/au1xxx-ide.c @@ -353,7 +353,6 @@ static const struct ide_dma_ops au1xxx_dma_ops = { .dma_end = auide_dma_end, .dma_test_irq = auide_dma_test_irq, .dma_lost_irq = ide_dma_lost_irq, - .dma_timeout = ide_dma_timeout, }; static int auide_ddma_init(ide_hwif_t *hwif, const struct ide_port_info *d) diff --git a/drivers/ide/cmd64x.c b/drivers/ide/cmd64x.c index bf0e3f470824..f0a49d2ff711 100644 --- a/drivers/ide/cmd64x.c +++ b/drivers/ide/cmd64x.c @@ -384,7 +384,6 @@ static const struct ide_dma_ops cmd64x_dma_ops = { .dma_test_irq = cmd64x_dma_test_irq, .dma_lost_irq = ide_dma_lost_irq, .dma_timer_expiry = ide_dma_sff_timer_expiry, - .dma_timeout = ide_dma_timeout, .dma_sff_read_status = ide_dma_sff_read_status, }; @@ -396,7 +395,6 @@ static const struct ide_dma_ops cmd646_rev1_dma_ops = { .dma_test_irq = ide_dma_test_irq, .dma_lost_irq = ide_dma_lost_irq, .dma_timer_expiry = ide_dma_sff_timer_expiry, - .dma_timeout = ide_dma_timeout, .dma_sff_read_status = ide_dma_sff_read_status, }; @@ -408,7 +406,6 @@ static const struct ide_dma_ops cmd648_dma_ops = { .dma_test_irq = cmd648_dma_test_irq, .dma_lost_irq = ide_dma_lost_irq, .dma_timer_expiry = ide_dma_sff_timer_expiry, - .dma_timeout = ide_dma_timeout, .dma_sff_read_status = ide_dma_sff_read_status, }; diff --git a/drivers/ide/cs5536.c b/drivers/ide/cs5536.c index d5dcf4899607..353a35bbba63 100644 --- a/drivers/ide/cs5536.c +++ b/drivers/ide/cs5536.c @@ -236,7 +236,6 @@ static const struct ide_dma_ops cs5536_dma_ops = { .dma_test_irq = ide_dma_test_irq, .dma_lost_irq = ide_dma_lost_irq, .dma_timer_expiry = ide_dma_sff_timer_expiry, - .dma_timeout = ide_dma_timeout, }; static const struct ide_port_info cs5536_info = { diff --git a/drivers/ide/hpt366.c b/drivers/ide/hpt366.c index dbaf184ed9c5..a0eb87f59134 100644 --- a/drivers/ide/hpt366.c +++ b/drivers/ide/hpt366.c @@ -835,12 +835,6 @@ static int hpt370_dma_end(ide_drive_t *drive) return ide_dma_end(drive); } -static void hpt370_dma_timeout(ide_drive_t *drive) -{ - hpt370_irq_timeout(drive); - ide_dma_timeout(drive); -} - /* returns 1 if DMA IRQ issued, 0 otherwise */ static int hpt374_dma_test_irq(ide_drive_t *drive) { @@ -1423,7 +1417,6 @@ static const struct ide_dma_ops hpt37x_dma_ops = { .dma_test_irq = hpt374_dma_test_irq, .dma_lost_irq = ide_dma_lost_irq, .dma_timer_expiry = ide_dma_sff_timer_expiry, - .dma_timeout = ide_dma_timeout, .dma_sff_read_status = ide_dma_sff_read_status, }; @@ -1435,7 +1428,7 @@ static const struct ide_dma_ops hpt370_dma_ops = { .dma_test_irq = ide_dma_test_irq, .dma_lost_irq = ide_dma_lost_irq, .dma_timer_expiry = ide_dma_sff_timer_expiry, - .dma_timeout = hpt370_dma_timeout, + .dma_clear = hpt370_irq_timeout, .dma_sff_read_status = ide_dma_sff_read_status, }; @@ -1447,7 +1440,6 @@ static const struct ide_dma_ops hpt36x_dma_ops = { .dma_test_irq = ide_dma_test_irq, .dma_lost_irq = hpt366_dma_lost_irq, .dma_timer_expiry = ide_dma_sff_timer_expiry, - .dma_timeout = ide_dma_timeout, .dma_sff_read_status = ide_dma_sff_read_status, }; diff --git a/drivers/ide/icside.c b/drivers/ide/icside.c index 51ce404fe532..f069f122ee6e 100644 --- a/drivers/ide/icside.c +++ b/drivers/ide/icside.c @@ -377,7 +377,6 @@ static const struct ide_dma_ops icside_v6_dma_ops = { .dma_start = icside_dma_start, .dma_end = icside_dma_end, .dma_test_irq = icside_dma_test_irq, - .dma_timeout = ide_dma_timeout, .dma_lost_irq = ide_dma_lost_irq, }; #else diff --git a/drivers/ide/ide-dma-sff.c b/drivers/ide/ide-dma-sff.c index 75a9ea2e4c82..7836d7e03fff 100644 --- a/drivers/ide/ide-dma-sff.c +++ b/drivers/ide/ide-dma-sff.c @@ -338,9 +338,8 @@ const struct ide_dma_ops sff_dma_ops = { .dma_start = ide_dma_start, .dma_end = ide_dma_end, .dma_test_irq = ide_dma_test_irq, - .dma_timer_expiry = ide_dma_sff_timer_expiry, - .dma_timeout = ide_dma_timeout, .dma_lost_irq = ide_dma_lost_irq, + .dma_timer_expiry = ide_dma_sff_timer_expiry, .dma_sff_read_status = ide_dma_sff_read_status, }; EXPORT_SYMBOL_GPL(sff_dma_ops); diff --git a/drivers/ide/ide-dma.c b/drivers/ide/ide-dma.c index 3dbf80c15491..dc5d9bc4ced0 100644 --- a/drivers/ide/ide-dma.c +++ b/drivers/ide/ide-dma.c @@ -460,7 +460,7 @@ void ide_dma_lost_irq(ide_drive_t *drive) } EXPORT_SYMBOL_GPL(ide_dma_lost_irq); -void ide_dma_timeout(ide_drive_t *drive) +static void ide_dma_timeout(ide_drive_t *drive) { ide_hwif_t *hwif = drive->hwif; @@ -473,7 +473,6 @@ void ide_dma_timeout(ide_drive_t *drive) hwif->dma_ops->dma_end(drive); } -EXPORT_SYMBOL_GPL(ide_dma_timeout); /* * un-busy the port etc, and clear any pending DMA status. we want to @@ -483,6 +482,7 @@ EXPORT_SYMBOL_GPL(ide_dma_timeout); ide_startstop_t ide_dma_timeout_retry(ide_drive_t *drive, int error) { ide_hwif_t *hwif = drive->hwif; + const struct ide_dma_ops *dma_ops = hwif->dma_ops; struct request *rq; ide_startstop_t ret = ide_stopped; @@ -492,12 +492,14 @@ ide_startstop_t ide_dma_timeout_retry(ide_drive_t *drive, int error) if (error < 0) { printk(KERN_WARNING "%s: DMA timeout error\n", drive->name); - (void)hwif->dma_ops->dma_end(drive); + (void)dma_ops->dma_end(drive); ret = ide_error(drive, "dma timeout error", hwif->tp_ops->read_status(hwif)); } else { printk(KERN_WARNING "%s: DMA timeout retry\n", drive->name); - hwif->dma_ops->dma_timeout(drive); + if (dma_ops->dma_clear) + dma_ops->dma_clear(drive); + ide_dma_timeout(drive); } /* diff --git a/drivers/ide/it821x.c b/drivers/ide/it821x.c index 0d4ac65cf949..51aa745246dc 100644 --- a/drivers/ide/it821x.c +++ b/drivers/ide/it821x.c @@ -511,9 +511,8 @@ static struct ide_dma_ops it821x_pass_through_dma_ops = { .dma_start = it821x_dma_start, .dma_end = it821x_dma_end, .dma_test_irq = ide_dma_test_irq, - .dma_timer_expiry = ide_dma_sff_timer_expiry, - .dma_timeout = ide_dma_timeout, .dma_lost_irq = ide_dma_lost_irq, + .dma_timer_expiry = ide_dma_sff_timer_expiry, .dma_sff_read_status = ide_dma_sff_read_status, }; diff --git a/drivers/ide/ns87415.c b/drivers/ide/ns87415.c index 7b65fe5bf449..9039a373020f 100644 --- a/drivers/ide/ns87415.c +++ b/drivers/ide/ns87415.c @@ -306,7 +306,6 @@ static const struct ide_dma_ops ns87415_dma_ops = { .dma_test_irq = ide_dma_test_irq, .dma_lost_irq = ide_dma_lost_irq, .dma_timer_expiry = ide_dma_sff_timer_expiry, - .dma_timeout = ide_dma_timeout, .dma_sff_read_status = superio_dma_sff_read_status, }; diff --git a/drivers/ide/pdc202xx_old.c b/drivers/ide/pdc202xx_old.c index f7536d1943f7..248a54bd2386 100644 --- a/drivers/ide/pdc202xx_old.c +++ b/drivers/ide/pdc202xx_old.c @@ -258,12 +258,6 @@ static void pdc202xx_dma_lost_irq(ide_drive_t *drive) ide_dma_lost_irq(drive); } -static void pdc202xx_dma_timeout(ide_drive_t *drive) -{ - pdc202xx_reset(drive); - ide_dma_timeout(drive); -} - static int init_chipset_pdc202xx(struct pci_dev *dev) { unsigned long dmabase = pci_resource_start(dev, 4); @@ -336,7 +330,7 @@ static const struct ide_dma_ops pdc20246_dma_ops = { .dma_test_irq = pdc202xx_dma_test_irq, .dma_lost_irq = pdc202xx_dma_lost_irq, .dma_timer_expiry = ide_dma_sff_timer_expiry, - .dma_timeout = pdc202xx_dma_timeout, + .dma_clear = pdc202xx_reset, .dma_sff_read_status = ide_dma_sff_read_status, }; @@ -348,7 +342,7 @@ static const struct ide_dma_ops pdc2026x_dma_ops = { .dma_test_irq = pdc202xx_dma_test_irq, .dma_lost_irq = pdc202xx_dma_lost_irq, .dma_timer_expiry = ide_dma_sff_timer_expiry, - .dma_timeout = pdc202xx_dma_timeout, + .dma_clear = pdc202xx_reset, .dma_sff_read_status = ide_dma_sff_read_status, }; diff --git a/drivers/ide/pmac.c b/drivers/ide/pmac.c index 2bfcfedaa076..d15cc46a66e3 100644 --- a/drivers/ide/pmac.c +++ b/drivers/ide/pmac.c @@ -1650,7 +1650,6 @@ static const struct ide_dma_ops pmac_dma_ops = { .dma_start = pmac_ide_dma_start, .dma_end = pmac_ide_dma_end, .dma_test_irq = pmac_ide_dma_test_irq, - .dma_timeout = ide_dma_timeout, .dma_lost_irq = pmac_ide_dma_lost_irq, }; diff --git a/drivers/ide/sc1200.c b/drivers/ide/sc1200.c index 1c3a82914999..371549d18a01 100644 --- a/drivers/ide/sc1200.c +++ b/drivers/ide/sc1200.c @@ -291,7 +291,6 @@ static const struct ide_dma_ops sc1200_dma_ops = { .dma_test_irq = ide_dma_test_irq, .dma_lost_irq = ide_dma_lost_irq, .dma_timer_expiry = ide_dma_sff_timer_expiry, - .dma_timeout = ide_dma_timeout, .dma_sff_read_status = ide_dma_sff_read_status, }; diff --git a/drivers/ide/scc_pata.c b/drivers/ide/scc_pata.c index 0cc137cfe76d..64534d150b0c 100644 --- a/drivers/ide/scc_pata.c +++ b/drivers/ide/scc_pata.c @@ -872,7 +872,6 @@ static const struct ide_dma_ops scc_dma_ops = { .dma_end = scc_dma_end, .dma_test_irq = scc_dma_test_irq, .dma_lost_irq = ide_dma_lost_irq, - .dma_timeout = ide_dma_timeout, .dma_timer_expiry = ide_dma_sff_timer_expiry, .dma_sff_read_status = scc_dma_sff_read_status, }; diff --git a/drivers/ide/sgiioc4.c b/drivers/ide/sgiioc4.c index b12de8346c73..44df0c750bab 100644 --- a/drivers/ide/sgiioc4.c +++ b/drivers/ide/sgiioc4.c @@ -533,7 +533,6 @@ static const struct ide_dma_ops sgiioc4_dma_ops = { .dma_end = sgiioc4_dma_end, .dma_test_irq = sgiioc4_dma_test_irq, .dma_lost_irq = sgiioc4_dma_lost_irq, - .dma_timeout = ide_dma_timeout, }; static const struct ide_port_info sgiioc4_port_info __devinitconst = { diff --git a/drivers/ide/siimage.c b/drivers/ide/siimage.c index 075cb1243b2a..e4973cd1fba9 100644 --- a/drivers/ide/siimage.c +++ b/drivers/ide/siimage.c @@ -715,7 +715,6 @@ static const struct ide_dma_ops sil_dma_ops = { .dma_end = ide_dma_end, .dma_test_irq = siimage_dma_test_irq, .dma_timer_expiry = ide_dma_sff_timer_expiry, - .dma_timeout = ide_dma_timeout, .dma_lost_irq = ide_dma_lost_irq, .dma_sff_read_status = ide_dma_sff_read_status, }; diff --git a/drivers/ide/sl82c105.c b/drivers/ide/sl82c105.c index d25137b04e7a..d6f8977191c8 100644 --- a/drivers/ide/sl82c105.c +++ b/drivers/ide/sl82c105.c @@ -189,14 +189,13 @@ static void sl82c105_dma_start(ide_drive_t *drive) ide_dma_start(drive); } -static void sl82c105_dma_timeout(ide_drive_t *drive) +static void sl82c105_dma_clear(ide_drive_t *drive) { struct pci_dev *dev = to_pci_dev(drive->hwif->dev); - DBG(("sl82c105_dma_timeout(drive:%s)\n", drive->name)); + DBG(("sl82c105_dma_clear(drive:%s)\n", drive->name)); sl82c105_reset_host(dev); - ide_dma_timeout(drive); } static int sl82c105_dma_end(ide_drive_t *drive) @@ -298,7 +297,7 @@ static const struct ide_dma_ops sl82c105_dma_ops = { .dma_test_irq = ide_dma_test_irq, .dma_lost_irq = sl82c105_dma_lost_irq, .dma_timer_expiry = ide_dma_sff_timer_expiry, - .dma_timeout = sl82c105_dma_timeout, + .dma_clear = sl82c105_dma_clear, .dma_sff_read_status = ide_dma_sff_read_status, }; diff --git a/drivers/ide/tc86c001.c b/drivers/ide/tc86c001.c index 427d4b3c2c63..b4cf42dc8a6f 100644 --- a/drivers/ide/tc86c001.c +++ b/drivers/ide/tc86c001.c @@ -187,7 +187,6 @@ static const struct ide_dma_ops tc86c001_dma_ops = { .dma_test_irq = ide_dma_test_irq, .dma_lost_irq = ide_dma_lost_irq, .dma_timer_expiry = ide_dma_sff_timer_expiry, - .dma_timeout = ide_dma_timeout, .dma_sff_read_status = ide_dma_sff_read_status, }; diff --git a/drivers/ide/trm290.c b/drivers/ide/trm290.c index ed1496845a93..d6a950828e9f 100644 --- a/drivers/ide/trm290.c +++ b/drivers/ide/trm290.c @@ -314,7 +314,6 @@ static struct ide_dma_ops trm290_dma_ops = { .dma_end = trm290_dma_end, .dma_test_irq = trm290_dma_test_irq, .dma_lost_irq = ide_dma_lost_irq, - .dma_timeout = ide_dma_timeout, }; static const struct ide_port_info trm290_chipset __devinitdata = { diff --git a/drivers/ide/tx4939ide.c b/drivers/ide/tx4939ide.c index e0e0a803dde3..53f99853b065 100644 --- a/drivers/ide/tx4939ide.c +++ b/drivers/ide/tx4939ide.c @@ -632,7 +632,6 @@ static const struct ide_dma_ops tx4939ide_dma_ops = { .dma_test_irq = tx4939ide_dma_test_irq, .dma_lost_irq = ide_dma_lost_irq, .dma_timer_expiry = ide_dma_sff_timer_expiry, - .dma_timeout = ide_dma_timeout, .dma_sff_read_status = tx4939ide_dma_sff_read_status, }; diff --git a/include/linux/ide.h b/include/linux/ide.h index cb501bf78f7d..d3035f2f1250 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -716,8 +716,9 @@ struct ide_dma_ops { int (*dma_end)(struct ide_drive_s *); int (*dma_test_irq)(struct ide_drive_s *); void (*dma_lost_irq)(struct ide_drive_s *); + /* below ones are optional */ int (*dma_timer_expiry)(struct ide_drive_s *); - void (*dma_timeout)(struct ide_drive_s *); + void (*dma_clear)(struct ide_drive_s *); /* * The following method is optional and only required to be * implemented for the SFF-8038i compatible controllers. @@ -1461,7 +1462,6 @@ static inline int config_drive_for_dma(ide_drive_t *drive) { return 0; } #endif /* CONFIG_BLK_DEV_IDEDMA_SFF */ void ide_dma_lost_irq(ide_drive_t *); -void ide_dma_timeout(ide_drive_t *); ide_startstop_t ide_dma_timeout_retry(ide_drive_t *, int); #else From 1cee52de28aa687760ad262ad0834d1bf6c6d2ac Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Tue, 31 Mar 2009 20:15:19 +0200 Subject: [PATCH 257/478] ide: inline ide_dma_timeout() into ide_dma_timeout_retry() Since ide_dma_timeout() is only used by ide_dma_timeout_retry() inline it there. There should be no functional changes caused by this patch. Acked-by: Sergei Shtylyov Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/ide-dma.c | 21 ++++++--------------- 1 file changed, 6 insertions(+), 15 deletions(-) diff --git a/drivers/ide/ide-dma.c b/drivers/ide/ide-dma.c index dc5d9bc4ced0..4e2005071113 100644 --- a/drivers/ide/ide-dma.c +++ b/drivers/ide/ide-dma.c @@ -460,20 +460,6 @@ void ide_dma_lost_irq(ide_drive_t *drive) } EXPORT_SYMBOL_GPL(ide_dma_lost_irq); -static void ide_dma_timeout(ide_drive_t *drive) -{ - ide_hwif_t *hwif = drive->hwif; - - printk(KERN_ERR "%s: timeout waiting for DMA\n", drive->name); - - if (hwif->dma_ops->dma_test_irq(drive)) - return; - - ide_dump_status(drive, "DMA timeout", hwif->tp_ops->read_status(hwif)); - - hwif->dma_ops->dma_end(drive); -} - /* * un-busy the port etc, and clear any pending DMA status. we want to * retry the current request in pio mode instead of risking tossing it @@ -499,7 +485,12 @@ ide_startstop_t ide_dma_timeout_retry(ide_drive_t *drive, int error) printk(KERN_WARNING "%s: DMA timeout retry\n", drive->name); if (dma_ops->dma_clear) dma_ops->dma_clear(drive); - ide_dma_timeout(drive); + printk(KERN_ERR "%s: timeout waiting for DMA\n", drive->name); + if (dma_ops->dma_test_irq(drive) == 0) { + ide_dump_status(drive, "DMA timeout", + hwif->tp_ops->read_status(hwif)); + (void)dma_ops->dma_end(drive); + } } /* From 4453011f959a5f5c6c7a33aea54fe17f5e43a867 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Tue, 31 Mar 2009 20:15:20 +0200 Subject: [PATCH 258/478] ide: destroy DMA mappings after ending DMA (v2) Move ide_destroy_dmatable() call out from ->dma_end method to {ide_pc,cdrom_newpc,ide_dma}_intr(), ide_dma_timeout_retry() and sgiioc4_resetproc(). This causes minor/safe behavior changes w.r.t.: * cmd64x.c::cmd64{8,x}_dma_end() * cs5536.c::cs5536_dma_end() * icside.c::icside_dma_end() * it821x.c::it821x_dma_end() * scc_pata.c::__scc_dma_end() * sl82c105.c::sl82c105_dma_end() * tx4939ide.c::tx4939ide_dma_end() v2: * Fix build for CONFIG_BLK_DEV_IDEDMA=n (reported by Randy Dunlap). Cc: Randy Dunlap Acked-by: Sergei Shtylyov Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/au1xxx-ide.c | 2 -- drivers/ide/cmd64x.c | 2 -- drivers/ide/icside.c | 3 --- drivers/ide/ide-atapi.c | 7 +++++-- drivers/ide/ide-cd.c | 1 + drivers/ide/ide-dma-sff.c | 2 -- drivers/ide/ide-dma.c | 3 +++ drivers/ide/ns87415.c | 2 -- drivers/ide/pmac.c | 2 -- drivers/ide/sc1200.c | 1 - drivers/ide/scc_pata.c | 2 -- drivers/ide/sgiioc4.c | 2 +- drivers/ide/trm290.c | 3 +-- drivers/ide/tx4939ide.c | 4 +--- include/linux/ide.h | 1 + 15 files changed, 13 insertions(+), 24 deletions(-) diff --git a/drivers/ide/au1xxx-ide.c b/drivers/ide/au1xxx-ide.c index 0c08c5e01f2a..ba2a211758a9 100644 --- a/drivers/ide/au1xxx-ide.c +++ b/drivers/ide/au1xxx-ide.c @@ -280,8 +280,6 @@ static int auide_build_dmatable(ide_drive_t *drive, struct ide_cmd *cmd) static int auide_dma_end(ide_drive_t *drive) { - ide_destroy_dmatable(drive); - return 0; } diff --git a/drivers/ide/cmd64x.c b/drivers/ide/cmd64x.c index f0a49d2ff711..f2edf280ef8b 100644 --- a/drivers/ide/cmd64x.c +++ b/drivers/ide/cmd64x.c @@ -327,8 +327,6 @@ static int cmd646_1_dma_end(ide_drive_t *drive) outb(dma_cmd & ~1, hwif->dma_base + ATA_DMA_CMD); /* clear the INTR & ERROR bits */ outb(dma_stat | 6, hwif->dma_base + ATA_DMA_STATUS); - /* and free any DMA resources */ - ide_destroy_dmatable(drive); /* verify good DMA status */ return (dma_stat & 7) != 4; } diff --git a/drivers/ide/icside.c b/drivers/ide/icside.c index f069f122ee6e..9bf57d7c8e57 100644 --- a/drivers/ide/icside.c +++ b/drivers/ide/icside.c @@ -291,9 +291,6 @@ static int icside_dma_end(ide_drive_t *drive) disable_dma(ec->dma); - /* Teardown mappings after DMA has completed. */ - ide_destroy_dmatable(drive); - return get_dma_residue(ec->dma) != 0; } diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c index f591166d2c93..1481f71f8173 100644 --- a/drivers/ide/ide-atapi.c +++ b/drivers/ide/ide-atapi.c @@ -342,8 +342,11 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive) stat = tp_ops->read_status(hwif); if (pc->flags & PC_FLAG_DMA_IN_PROGRESS) { - if (hwif->dma_ops->dma_end(drive) || - (drive->media == ide_tape && (stat & ATA_ERR))) { + int rc = hwif->dma_ops->dma_end(drive); + + ide_destroy_dmatable(drive); + + if (rc || (drive->media == ide_tape && (stat & ATA_ERR))) { if (drive->media == ide_floppy) printk(KERN_ERR "%s: DMA %s error\n", drive->name, rq_data_dir(pc->rq) diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c index 5319e7a73708..4a0d66ee9547 100644 --- a/drivers/ide/ide-cd.c +++ b/drivers/ide/ide-cd.c @@ -639,6 +639,7 @@ static ide_startstop_t cdrom_newpc_intr(ide_drive_t *drive) if (dma) { drive->dma = 0; dma_error = hwif->dma_ops->dma_end(drive); + ide_destroy_dmatable(drive); if (dma_error) { printk(KERN_ERR PFX "%s: DMA %s error\n", drive->name, write ? "write" : "read"); diff --git a/drivers/ide/ide-dma-sff.c b/drivers/ide/ide-dma-sff.c index 7836d7e03fff..f8adbb5eb339 100644 --- a/drivers/ide/ide-dma-sff.c +++ b/drivers/ide/ide-dma-sff.c @@ -310,8 +310,6 @@ int ide_dma_end(ide_drive_t *drive) /* clear INTR & ERROR bits */ ide_dma_sff_write_status(hwif, dma_stat | ATA_DMA_ERR | ATA_DMA_INTR); - /* purge DMA mappings */ - ide_destroy_dmatable(drive); wmb(); /* verify good DMA status */ diff --git a/drivers/ide/ide-dma.c b/drivers/ide/ide-dma.c index 4e2005071113..b430898bbcd6 100644 --- a/drivers/ide/ide-dma.c +++ b/drivers/ide/ide-dma.c @@ -92,6 +92,7 @@ ide_startstop_t ide_dma_intr(ide_drive_t *drive) u8 stat = 0, dma_stat = 0; dma_stat = hwif->dma_ops->dma_end(drive); + ide_destroy_dmatable(drive); stat = hwif->tp_ops->read_status(hwif); if (OK_STAT(stat, DRIVE_READY, drive->bad_wstat | ATA_DRQ)) { @@ -479,6 +480,7 @@ ide_startstop_t ide_dma_timeout_retry(ide_drive_t *drive, int error) if (error < 0) { printk(KERN_WARNING "%s: DMA timeout error\n", drive->name); (void)dma_ops->dma_end(drive); + ide_destroy_dmatable(drive); ret = ide_error(drive, "dma timeout error", hwif->tp_ops->read_status(hwif)); } else { @@ -490,6 +492,7 @@ ide_startstop_t ide_dma_timeout_retry(ide_drive_t *drive, int error) ide_dump_status(drive, "DMA timeout", hwif->tp_ops->read_status(hwif)); (void)dma_ops->dma_end(drive); + ide_destroy_dmatable(drive); } } diff --git a/drivers/ide/ns87415.c b/drivers/ide/ns87415.c index 9039a373020f..9ad71a74f93f 100644 --- a/drivers/ide/ns87415.c +++ b/drivers/ide/ns87415.c @@ -210,8 +210,6 @@ static int ns87415_dma_end(ide_drive_t *drive) /* from ERRATA: clear the INTR & ERROR bits */ dma_cmd = inb(hwif->dma_base + ATA_DMA_CMD); outb(dma_cmd | 6, hwif->dma_base + ATA_DMA_CMD); - /* and free any DMA resources */ - ide_destroy_dmatable(drive); /* verify good DMA status */ return (dma_stat & 7) != 4; } diff --git a/drivers/ide/pmac.c b/drivers/ide/pmac.c index d15cc46a66e3..5643a8b957bf 100644 --- a/drivers/ide/pmac.c +++ b/drivers/ide/pmac.c @@ -1562,8 +1562,6 @@ pmac_ide_dma_end (ide_drive_t *drive) dstat = readl(&dma->status); writel(((RUN|WAKE|DEAD) << 16), &dma->control); - ide_destroy_dmatable(drive); - /* verify good dma status. we don't check for ACTIVE beeing 0. We should... * in theory, but with ATAPI decices doing buffer underruns, that would * cause us to disable DMA, which isn't what we want diff --git a/drivers/ide/sc1200.c b/drivers/ide/sc1200.c index 371549d18a01..d9c47034bedd 100644 --- a/drivers/ide/sc1200.c +++ b/drivers/ide/sc1200.c @@ -184,7 +184,6 @@ static int sc1200_dma_end(ide_drive_t *drive) outb(inb(dma_base)&~1, dma_base); /* !! DO THIS HERE !! stop DMA */ drive->waiting_for_dma = 0; - ide_destroy_dmatable(drive); /* purge DMA mappings */ return (dma_stat & 7) != 4; /* verify good DMA status */ } diff --git a/drivers/ide/scc_pata.c b/drivers/ide/scc_pata.c index 64534d150b0c..693536ebe331 100644 --- a/drivers/ide/scc_pata.c +++ b/drivers/ide/scc_pata.c @@ -365,8 +365,6 @@ static int __scc_dma_end(ide_drive_t *drive) dma_stat = scc_dma_sff_read_status(hwif); /* clear the INTR & ERROR bits */ scc_ide_outb(dma_stat | 6, hwif->dma_base + 4); - /* purge DMA mappings */ - ide_destroy_dmatable(drive); /* verify good DMA status */ wmb(); return (dma_stat & 7) != 4 ? (0x10 | dma_stat) : 0; diff --git a/drivers/ide/sgiioc4.c b/drivers/ide/sgiioc4.c index 44df0c750bab..457a762a1f29 100644 --- a/drivers/ide/sgiioc4.c +++ b/drivers/ide/sgiioc4.c @@ -259,7 +259,6 @@ static int sgiioc4_dma_end(ide_drive_t *drive) } drive->waiting_for_dma = 0; - ide_destroy_dmatable(drive); return dma_stat; } @@ -284,6 +283,7 @@ static void sgiioc4_resetproc(ide_drive_t * drive) { sgiioc4_dma_end(drive); + ide_destroy_dmatable(drive); sgiioc4_clearirq(drive); } diff --git a/drivers/ide/trm290.c b/drivers/ide/trm290.c index d6a950828e9f..8dd3d8226870 100644 --- a/drivers/ide/trm290.c +++ b/drivers/ide/trm290.c @@ -216,8 +216,7 @@ static int trm290_dma_end(ide_drive_t *drive) u16 status; drive->waiting_for_dma = 0; - /* purge DMA mappings */ - ide_destroy_dmatable(drive); + status = inw(drive->hwif->dma_base + 2); return status != 0x00ff; diff --git a/drivers/ide/tx4939ide.c b/drivers/ide/tx4939ide.c index 53f99853b065..f62ced855cf3 100644 --- a/drivers/ide/tx4939ide.c +++ b/drivers/ide/tx4939ide.c @@ -335,11 +335,9 @@ static int tx4939ide_dma_end(ide_drive_t *drive) /* read and clear the INTR & ERROR bits */ dma_stat = tx4939ide_clear_dma_status(base); - /* purge DMA mappings */ - ide_destroy_dmatable(drive); - /* verify good DMA status */ wmb(); + /* verify good DMA status */ if ((dma_stat & (ATA_DMA_INTR | ATA_DMA_ERR | ATA_DMA_ACTIVE)) == 0 && (ctl & (TX4939IDE_INT_XFEREND | TX4939IDE_INT_HOST)) == (TX4939IDE_INT_XFEREND | TX4939IDE_INT_HOST)) diff --git a/include/linux/ide.h b/include/linux/ide.h index d3035f2f1250..b6c4942fde11 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -1479,6 +1479,7 @@ static inline ide_startstop_t ide_dma_timeout_retry(ide_drive_t *drive, int erro static inline void ide_release_dma_engine(ide_hwif_t *hwif) { ; } static inline int ide_build_sglist(ide_drive_t *drive, struct ide_cmd *cmd) { return 0; } +static inline void ide_destroy_dmatable(ide_drive_t *drive) { ; } #endif /* CONFIG_BLK_DEV_IDEDMA */ #ifdef CONFIG_BLK_DEV_IDEACPI From 5ae5412d9a23b05ab08461b202bad21ad8f6b66d Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Tue, 31 Mar 2009 20:15:20 +0200 Subject: [PATCH 259/478] ide: add ide_dma_prepare() helper * Add ide_dma_prepare() helper. * Convert ide_issue_pc() and do_rw_taskfile() to use it. * Make ide_build_sglist() static. There should be no functional changes caused by this patch. Acked-by: Sergei Shtylyov Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/ide-atapi.c | 18 ++++-------------- drivers/ide/ide-dma.c | 11 ++++++++++- drivers/ide/ide-taskfile.c | 4 +--- include/linux/ide.h | 7 ++++--- 4 files changed, 19 insertions(+), 21 deletions(-) diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c index 1481f71f8173..89d2339bdef3 100644 --- a/drivers/ide/ide-atapi.c +++ b/drivers/ide/ide-atapi.c @@ -638,7 +638,6 @@ ide_startstop_t ide_issue_pc(ide_drive_t *drive, struct ide_cmd *cmd) { struct ide_atapi_pc *pc; ide_hwif_t *hwif = drive->hwif; - const struct ide_dma_ops *dma_ops = hwif->dma_ops; ide_expiry_t *expiry = NULL; struct request *rq = hwif->rq; unsigned int timeout; @@ -652,12 +651,8 @@ ide_startstop_t ide_issue_pc(ide_drive_t *drive, struct ide_cmd *cmd) expiry = ide_cd_expiry; timeout = ATAPI_WAIT_PC; - if (drive->dma) { - if (ide_build_sglist(drive, cmd)) - drive->dma = !dma_ops->dma_setup(drive, cmd); - else - drive->dma = 0; - } + if (drive->dma) + drive->dma = !ide_dma_prepare(drive, cmd); } else { pc = drive->pc; @@ -675,13 +670,8 @@ ide_startstop_t ide_issue_pc(ide_drive_t *drive, struct ide_cmd *cmd) ide_dma_off(drive); } - if ((pc->flags & PC_FLAG_DMA_OK) && - (drive->dev_flags & IDE_DFLAG_USING_DMA)) { - if (ide_build_sglist(drive, cmd)) - drive->dma = !dma_ops->dma_setup(drive, cmd); - else - drive->dma = 0; - } + if (pc->flags & PC_FLAG_DMA_OK) + drive->dma = !ide_dma_prepare(drive, cmd); if (!drive->dma) pc->flags &= ~PC_FLAG_DMA_OK; diff --git a/drivers/ide/ide-dma.c b/drivers/ide/ide-dma.c index b430898bbcd6..cf5897f5533f 100644 --- a/drivers/ide/ide-dma.c +++ b/drivers/ide/ide-dma.c @@ -128,7 +128,7 @@ int ide_dma_good_drive(ide_drive_t *drive) * operate in a portable fashion. */ -int ide_build_sglist(ide_drive_t *drive, struct ide_cmd *cmd) +static int ide_build_sglist(ide_drive_t *drive, struct ide_cmd *cmd) { ide_hwif_t *hwif = drive->hwif; struct scatterlist *sg = hwif->sg_table; @@ -563,3 +563,12 @@ int ide_allocate_dma_engine(ide_hwif_t *hwif) return 0; } EXPORT_SYMBOL_GPL(ide_allocate_dma_engine); + +int ide_dma_prepare(ide_drive_t *drive, struct ide_cmd *cmd) +{ + if ((drive->dev_flags & IDE_DFLAG_USING_DMA) == 0 || + ide_build_sglist(drive, cmd) == 0 || + drive->hwif->dma_ops->dma_setup(drive, cmd)) + return 1; + return 0; +} diff --git a/drivers/ide/ide-taskfile.c b/drivers/ide/ide-taskfile.c index a3b7a50562b2..dba68db629bf 100644 --- a/drivers/ide/ide-taskfile.c +++ b/drivers/ide/ide-taskfile.c @@ -100,9 +100,7 @@ ide_startstop_t do_rw_taskfile(ide_drive_t *drive, struct ide_cmd *orig_cmd) ide_execute_command(drive, cmd, handler, WAIT_WORSTCASE); return ide_started; case ATA_PROT_DMA: - if ((drive->dev_flags & IDE_DFLAG_USING_DMA) == 0 || - ide_build_sglist(drive, cmd) == 0 || - dma_ops->dma_setup(drive, cmd)) + if (ide_dma_prepare(drive, cmd)) return ide_stopped; hwif->expiry = dma_ops->dma_timer_expiry; ide_execute_command(drive, cmd, ide_dma_intr, 2 * WAIT_CMD); diff --git a/include/linux/ide.h b/include/linux/ide.h index b6c4942fde11..78892e2a432c 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -1443,7 +1443,8 @@ ide_startstop_t ide_dma_intr(ide_drive_t *); int ide_allocate_dma_engine(ide_hwif_t *); void ide_release_dma_engine(ide_hwif_t *); -int ide_build_sglist(ide_drive_t *, struct ide_cmd *); +int ide_dma_prepare(ide_drive_t *, struct ide_cmd *); + void ide_destroy_dmatable(ide_drive_t *); #ifdef CONFIG_BLK_DEV_IDEDMA_SFF @@ -1477,8 +1478,8 @@ static inline void ide_check_dma_crc(ide_drive_t *drive) { ; } static inline ide_startstop_t ide_dma_intr(ide_drive_t *drive) { return ide_stopped; } static inline ide_startstop_t ide_dma_timeout_retry(ide_drive_t *drive, int error) { return ide_stopped; } static inline void ide_release_dma_engine(ide_hwif_t *hwif) { ; } -static inline int ide_build_sglist(ide_drive_t *drive, - struct ide_cmd *cmd) { return 0; } +static inline int ide_dma_prepare(ide_drive_t *drive, + struct ide_cmd *cmd) { return 1; } static inline void ide_destroy_dmatable(ide_drive_t *drive) { ; } #endif /* CONFIG_BLK_DEV_IDEDMA */ From a6d67ffa7dfe9515d8f2051a76b14c82b748475a Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Tue, 31 Mar 2009 20:15:20 +0200 Subject: [PATCH 260/478] ns87415: use custom ->dma_{start,end} to handle ns87415_prepare_drive() Use custom ->dma_{start,end} methods to handle ns87415_prepare_drive() there instead of in ->dma_setup method. Acked-by: Sergei Shtylyov Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/ns87415.c | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/drivers/ide/ns87415.c b/drivers/ide/ns87415.c index 9ad71a74f93f..6e0f372d2f09 100644 --- a/drivers/ide/ns87415.c +++ b/drivers/ide/ns87415.c @@ -196,6 +196,12 @@ static void ns87415_selectproc (ide_drive_t *drive) !!(drive->dev_flags & IDE_DFLAG_USING_DMA)); } +static void ns87415_dma_start(ide_drive_t *drive) +{ + ns87415_prepare_drive(drive, 1); + ide_dma_start(drive); +} + static int ns87415_dma_end(ide_drive_t *drive) { ide_hwif_t *hwif = drive->hwif; @@ -210,21 +216,13 @@ static int ns87415_dma_end(ide_drive_t *drive) /* from ERRATA: clear the INTR & ERROR bits */ dma_cmd = inb(hwif->dma_base + ATA_DMA_CMD); outb(dma_cmd | 6, hwif->dma_base + ATA_DMA_CMD); + + ns87415_prepare_drive(drive, 0); + /* verify good DMA status */ return (dma_stat & 7) != 4; } -static int ns87415_dma_setup(ide_drive_t *drive, struct ide_cmd *cmd) -{ - /* select DMA xfer */ - ns87415_prepare_drive(drive, 1); - if (ide_dma_setup(drive, cmd) == 0) - return 0; - /* DMA failed: select PIO xfer */ - ns87415_prepare_drive(drive, 0); - return 1; -} - static void __devinit init_hwif_ns87415 (ide_hwif_t *hwif) { struct pci_dev *dev = to_pci_dev(hwif->dev); @@ -298,8 +296,8 @@ static const struct ide_port_ops ns87415_port_ops = { static const struct ide_dma_ops ns87415_dma_ops = { .dma_host_set = ide_dma_host_set, - .dma_setup = ns87415_dma_setup, - .dma_start = ide_dma_start, + .dma_setup = ide_dma_setup, + .dma_start = ns87415_dma_start, .dma_end = ns87415_dma_end, .dma_test_irq = ide_dma_test_irq, .dma_lost_irq = ide_dma_lost_irq, From 7526efaafdc835b8d6b22aa1a302e14651373908 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Tue, 31 Mar 2009 20:15:21 +0200 Subject: [PATCH 261/478] trm290: use custom ->dma_{start,end} to handle trm290_prepare_drive() Use custom ->dma_{start,end} methods to handle trm290_prepare_drive() there instead of in ->dma_setup method. There should be no functional changes caused by this patch (DMA support is disabled currently in trm290.c). Acked-by: Sergei Shtylyov Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/trm290.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/ide/trm290.c b/drivers/ide/trm290.c index 8dd3d8226870..b91bb709af40 100644 --- a/drivers/ide/trm290.c +++ b/drivers/ide/trm290.c @@ -184,7 +184,6 @@ static int trm290_dma_setup(ide_drive_t *drive, struct ide_cmd *cmd) if (cmd->tf_flags & IDE_TFLAG_WRITE) { #ifdef TRM290_NO_DMA_WRITES /* always use PIO for writes */ - trm290_prepare_drive(drive, 0); /* select PIO xfer */ return 1; #endif rw = 1; @@ -195,11 +194,8 @@ static int trm290_dma_setup(ide_drive_t *drive, struct ide_cmd *cmd) if (count == 0) { ide_map_sg(drive, cmd); /* try PIO instead of DMA */ - trm290_prepare_drive(drive, 0); /* select PIO xfer */ return 1; } - /* select DMA xfer */ - trm290_prepare_drive(drive, 1); outl(hwif->dmatable_dma | rw, hwif->dma_base); drive->waiting_for_dma = 1; /* start DMA */ @@ -209,6 +205,7 @@ static int trm290_dma_setup(ide_drive_t *drive, struct ide_cmd *cmd) static void trm290_dma_start(ide_drive_t *drive) { + trm290_prepare_drive(drive, 1); } static int trm290_dma_end(ide_drive_t *drive) @@ -219,6 +216,8 @@ static int trm290_dma_end(ide_drive_t *drive) status = inw(drive->hwif->dma_base + 2); + trm290_prepare_drive(drive, 0); + return status != 0x00ff; } From 8a4a5738ba499083cf4c5668895efe220b1946d3 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Tue, 31 Mar 2009 20:15:21 +0200 Subject: [PATCH 262/478] ide: add ->dma_check method * Add (an optional) ->dma_check method for checking if DMA can be used for a given command and fail DMA setup in ide_dma_prepare() if necessary. * Convert alim15x3 and trm290 host drivers to use ->dma_check. * Rename ali15x3_dma_setup() to ali_dma_check() while at it. There should be no functional changes caused by this patch. Acked-by: Sergei Shtylyov Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/alim15x3.c | 9 +++++---- drivers/ide/ide-dma.c | 5 ++++- drivers/ide/trm290.c | 17 ++++++++++------- include/linux/ide.h | 1 + 4 files changed, 20 insertions(+), 12 deletions(-) diff --git a/drivers/ide/alim15x3.c b/drivers/ide/alim15x3.c index d3faf0b97f42..537da1cde16d 100644 --- a/drivers/ide/alim15x3.c +++ b/drivers/ide/alim15x3.c @@ -189,20 +189,20 @@ static void ali_set_dma_mode(ide_drive_t *drive, const u8 speed) } /** - * ali15x3_dma_setup - begin a DMA phase + * ali_dma_check - DMA check * @drive: target device * @cmd: command * * Returns 1 if the DMA cannot be performed, zero on success. */ -static int ali15x3_dma_setup(ide_drive_t *drive, struct ide_cmd *cmd) +static int ali_dma_check(ide_drive_t *drive, struct ide_cmd *cmd) { if (m5229_revision < 0xC2 && drive->media != ide_disk) { if (cmd->tf_flags & IDE_TFLAG_WRITE) return 1; /* try PIO instead of DMA */ } - return ide_dma_setup(drive, cmd); + return 0; } /** @@ -503,11 +503,12 @@ static const struct ide_port_ops ali_port_ops = { static const struct ide_dma_ops ali_dma_ops = { .dma_host_set = ide_dma_host_set, - .dma_setup = ali15x3_dma_setup, + .dma_setup = ide_dma_setup, .dma_start = ide_dma_start, .dma_end = ide_dma_end, .dma_test_irq = ide_dma_test_irq, .dma_lost_irq = ide_dma_lost_irq, + .dma_check = ali_dma_check, .dma_timer_expiry = ide_dma_sff_timer_expiry, .dma_sff_read_status = ide_dma_sff_read_status, }; diff --git a/drivers/ide/ide-dma.c b/drivers/ide/ide-dma.c index cf5897f5533f..c0505e2dfc2e 100644 --- a/drivers/ide/ide-dma.c +++ b/drivers/ide/ide-dma.c @@ -566,9 +566,12 @@ EXPORT_SYMBOL_GPL(ide_allocate_dma_engine); int ide_dma_prepare(ide_drive_t *drive, struct ide_cmd *cmd) { + const struct ide_dma_ops *dma_ops = drive->hwif->dma_ops; + if ((drive->dev_flags & IDE_DFLAG_USING_DMA) == 0 || + (dma_ops->dma_check && dma_ops->dma_check(drive, cmd)) || ide_build_sglist(drive, cmd) == 0 || - drive->hwif->dma_ops->dma_setup(drive, cmd)) + dma_ops->dma_setup(drive, cmd)) return 1; return 0; } diff --git a/drivers/ide/trm290.c b/drivers/ide/trm290.c index b91bb709af40..1076efd050dc 100644 --- a/drivers/ide/trm290.c +++ b/drivers/ide/trm290.c @@ -176,19 +176,21 @@ static void trm290_selectproc (ide_drive_t *drive) trm290_prepare_drive(drive, !!(drive->dev_flags & IDE_DFLAG_USING_DMA)); } -static int trm290_dma_setup(ide_drive_t *drive, struct ide_cmd *cmd) +static int trm290_dma_check(ide_drive_t *drive, struct ide_cmd *cmd) { - ide_hwif_t *hwif = drive->hwif; - unsigned int count, rw; - if (cmd->tf_flags & IDE_TFLAG_WRITE) { #ifdef TRM290_NO_DMA_WRITES /* always use PIO for writes */ return 1; #endif - rw = 1; - } else - rw = 2; + } + return 0; +} + +static int trm290_dma_setup(ide_drive_t *drive, struct ide_cmd *cmd) +{ + ide_hwif_t *hwif = drive->hwif; + unsigned int count, rw = (cmd->tf_flags & IDE_TFLAG_WRITE) ? 1 : 2; count = ide_build_dmatable(drive, cmd); if (count == 0) { @@ -312,6 +314,7 @@ static struct ide_dma_ops trm290_dma_ops = { .dma_end = trm290_dma_end, .dma_test_irq = trm290_dma_test_irq, .dma_lost_irq = ide_dma_lost_irq, + .dma_check = trm290_dma_check, }; static const struct ide_port_info trm290_chipset __devinitdata = { diff --git a/include/linux/ide.h b/include/linux/ide.h index 78892e2a432c..b350667b83ad 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -717,6 +717,7 @@ struct ide_dma_ops { int (*dma_test_irq)(struct ide_drive_s *); void (*dma_lost_irq)(struct ide_drive_s *); /* below ones are optional */ + int (*dma_check)(struct ide_drive_s *, struct ide_cmd *); int (*dma_timer_expiry)(struct ide_drive_s *); void (*dma_clear)(struct ide_drive_s *); /* From 11998b316173f814698f9037ce9179d634d1f423 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Tue, 31 Mar 2009 20:15:21 +0200 Subject: [PATCH 263/478] ide: move ide_map_sg() call out of ->dma_setup method (take 2) Move ide_map_sg() call from ->dma_setup implementations and ide_destroy_dmatable() one from *_build_dmatable() to ide_dma_prepare(). There should be no functional changes caused by this patch. Sergei: Removed 'use_pio_instead' labels and replaced 'goto' with 'return 0' -- that required no changes to the follow-up patches... Signed-off-by: Bartlomiej Zolnierkiewicz Signed-off-by: Sergei Shtylyov Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/au1xxx-ide.c | 9 ++------- drivers/ide/ide-dma-sff.c | 2 -- drivers/ide/ide-dma.c | 8 ++++++-- drivers/ide/pmac.c | 11 +++-------- drivers/ide/scc_pata.c | 4 +--- drivers/ide/sgiioc4.c | 9 ++------- drivers/ide/trm290.c | 5 ++--- drivers/ide/tx4939ide.c | 6 +----- 8 files changed, 17 insertions(+), 37 deletions(-) diff --git a/drivers/ide/au1xxx-ide.c b/drivers/ide/au1xxx-ide.c index ba2a211758a9..239643806b97 100644 --- a/drivers/ide/au1xxx-ide.c +++ b/drivers/ide/au1xxx-ide.c @@ -236,7 +236,7 @@ static int auide_build_dmatable(ide_drive_t *drive, struct ide_cmd *cmd) if (++count >= PRD_ENTRIES) { printk(KERN_WARNING "%s: DMA table too small\n", drive->name); - goto use_pio_instead; + return 0; } /* Lets enable intr for the last descriptor only */ @@ -272,9 +272,6 @@ static int auide_build_dmatable(ide_drive_t *drive, struct ide_cmd *cmd) if (count) return 1; - use_pio_instead: - ide_destroy_dmatable(drive); - return 0; /* revert to PIO for this request */ } @@ -290,10 +287,8 @@ static void auide_dma_start(ide_drive_t *drive ) static int auide_dma_setup(ide_drive_t *drive, struct ide_cmd *cmd) { - if (auide_build_dmatable(drive, cmd) == 0) { - ide_map_sg(drive, cmd); + if (auide_build_dmatable(drive, cmd) == 0) return 1; - } drive->waiting_for_dma = 1; return 0; diff --git a/drivers/ide/ide-dma-sff.c b/drivers/ide/ide-dma-sff.c index f8adbb5eb339..e10054b827a0 100644 --- a/drivers/ide/ide-dma-sff.c +++ b/drivers/ide/ide-dma-sff.c @@ -166,8 +166,6 @@ use_pio_instead: printk(KERN_ERR "%s: %s\n", drive->name, count ? "DMA table too small" : "empty DMA table?"); - ide_destroy_dmatable(drive); - return 0; /* revert to PIO for this request */ } EXPORT_SYMBOL_GPL(ide_build_dmatable); diff --git a/drivers/ide/ide-dma.c b/drivers/ide/ide-dma.c index c0505e2dfc2e..d61f9a8cc18a 100644 --- a/drivers/ide/ide-dma.c +++ b/drivers/ide/ide-dma.c @@ -570,8 +570,12 @@ int ide_dma_prepare(ide_drive_t *drive, struct ide_cmd *cmd) if ((drive->dev_flags & IDE_DFLAG_USING_DMA) == 0 || (dma_ops->dma_check && dma_ops->dma_check(drive, cmd)) || - ide_build_sglist(drive, cmd) == 0 || - dma_ops->dma_setup(drive, cmd)) + ide_build_sglist(drive, cmd) == 0) return 1; + if (dma_ops->dma_setup(drive, cmd)) { + ide_destroy_dmatable(drive); + ide_map_sg(drive, cmd); + return 1; + } return 0; } diff --git a/drivers/ide/pmac.c b/drivers/ide/pmac.c index 5643a8b957bf..1df7b7636453 100644 --- a/drivers/ide/pmac.c +++ b/drivers/ide/pmac.c @@ -1455,7 +1455,7 @@ static int pmac_ide_build_dmatable(ide_drive_t *drive, struct ide_cmd *cmd) "switching to PIO on Ohare chipset\n", drive->name); pmif->broken_dma_warn = 1; } - goto use_pio_instead; + return 0; } while (cur_len) { unsigned int tc = (cur_len < 0xfe00)? cur_len: 0xfe00; @@ -1463,7 +1463,7 @@ static int pmac_ide_build_dmatable(ide_drive_t *drive, struct ide_cmd *cmd) if (count++ >= MAX_DCMDS) { printk(KERN_WARNING "%s: DMA table too small\n", drive->name); - goto use_pio_instead; + return 0; } st_le16(&table->command, wr? OUTPUT_MORE: INPUT_MORE); st_le16(&table->req_count, tc); @@ -1492,9 +1492,6 @@ static int pmac_ide_build_dmatable(ide_drive_t *drive, struct ide_cmd *cmd) printk(KERN_DEBUG "%s: empty DMA table?\n", drive->name); -use_pio_instead: - ide_destroy_dmatable(drive); - return 0; /* revert to PIO for this request */ } @@ -1510,10 +1507,8 @@ static int pmac_ide_dma_setup(ide_drive_t *drive, struct ide_cmd *cmd) u8 unit = drive->dn & 1, ata4 = (pmif->kind == controller_kl_ata4); u8 write = !!(cmd->tf_flags & IDE_TFLAG_WRITE); - if (pmac_ide_build_dmatable(drive, cmd) == 0) { - ide_map_sg(drive, cmd); + if (pmac_ide_build_dmatable(drive, cmd) == 0) return 1; - } /* Apple adds 60ns to wrDataSetup on reads */ if (ata4 && (pmif->timings[unit] & TR_66_UDMA_EN)) { diff --git a/drivers/ide/scc_pata.c b/drivers/ide/scc_pata.c index 693536ebe331..2cdf51d8917a 100644 --- a/drivers/ide/scc_pata.c +++ b/drivers/ide/scc_pata.c @@ -321,10 +321,8 @@ static int scc_dma_setup(ide_drive_t *drive, struct ide_cmd *cmd) u8 dma_stat; /* fall back to pio! */ - if (ide_build_dmatable(drive, cmd) == 0) { - ide_map_sg(drive, cmd); + if (ide_build_dmatable(drive, cmd) == 0) return 1; - } /* PRD table */ out_be32((void __iomem *)(hwif->dma_base + 8), hwif->dmatable_dma); diff --git a/drivers/ide/sgiioc4.c b/drivers/ide/sgiioc4.c index 457a762a1f29..d90e8c5af0a2 100644 --- a/drivers/ide/sgiioc4.c +++ b/drivers/ide/sgiioc4.c @@ -442,7 +442,7 @@ static int sgiioc4_build_dmatable(ide_drive_t *drive, struct ide_cmd *cmd) printk(KERN_WARNING "%s: DMA table too small\n", drive->name); - goto use_pio_instead; + return 0; } else { u32 bcount = 0x10000 - (cur_addr & 0xffff); @@ -477,9 +477,6 @@ static int sgiioc4_build_dmatable(ide_drive_t *drive, struct ide_cmd *cmd) return count; } -use_pio_instead: - ide_destroy_dmatable(drive); - return 0; /* revert to PIO for this request */ } @@ -488,11 +485,9 @@ static int sgiioc4_dma_setup(ide_drive_t *drive, struct ide_cmd *cmd) int ddir; u8 write = !!(cmd->tf_flags & IDE_TFLAG_WRITE); - if (sgiioc4_build_dmatable(drive, cmd) == 0) { + if (sgiioc4_build_dmatable(drive, cmd) == 0) /* try PIO instead of DMA */ - ide_map_sg(drive, cmd); return 1; - } if (write) /* Writes TO the IOC4 FROM Main Memory */ diff --git a/drivers/ide/trm290.c b/drivers/ide/trm290.c index 1076efd050dc..0be27cae27d5 100644 --- a/drivers/ide/trm290.c +++ b/drivers/ide/trm290.c @@ -193,11 +193,10 @@ static int trm290_dma_setup(ide_drive_t *drive, struct ide_cmd *cmd) unsigned int count, rw = (cmd->tf_flags & IDE_TFLAG_WRITE) ? 1 : 2; count = ide_build_dmatable(drive, cmd); - if (count == 0) { - ide_map_sg(drive, cmd); + if (count == 0) /* try PIO instead of DMA */ return 1; - } + outl(hwif->dmatable_dma | rw, hwif->dma_base); drive->waiting_for_dma = 1; /* start DMA */ diff --git a/drivers/ide/tx4939ide.c b/drivers/ide/tx4939ide.c index f62ced855cf3..7267c46c07cd 100644 --- a/drivers/ide/tx4939ide.c +++ b/drivers/ide/tx4939ide.c @@ -279,8 +279,6 @@ use_pio_instead: printk(KERN_ERR "%s: %s\n", drive->name, count ? "DMA table too small" : "empty DMA table?"); - ide_destroy_dmatable(drive); - return 0; /* revert to PIO for this request */ } #else @@ -294,10 +292,8 @@ static int tx4939ide_dma_setup(ide_drive_t *drive, struct ide_cmd *cmd) u8 rw = (cmd->tf_flags & IDE_TFLAG_WRITE) ? 0 : ATA_DMA_WR; /* fall back to PIO! */ - if (tx4939ide_build_dmatable(drive, cmd) == 0) { - ide_map_sg(drive, cmd); + if (tx4939ide_build_dmatable(drive, cmd) == 0) return 1; - } /* PRD table */ tx4939ide_writel(hwif->dmatable_dma, base, TX4939IDE_PRD_Ptr); From 88b4132e101e60e8fa67996ae3072ab6b71e8500 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Tue, 31 Mar 2009 20:15:22 +0200 Subject: [PATCH 264/478] ide: set/clear drive->waiting_for_dma flag in the core code Set/clear drive->waiting_for_dma flag in the core code instead of in ->dma_setup and ->dma_end methods. There should be no functional changes caused by this patch. Acked-by: Sergei Shtylyov Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/au1xxx-ide.c | 6 ------ drivers/ide/cmd64x.c | 1 - drivers/ide/icside.c | 4 ---- drivers/ide/ide-atapi.c | 4 +++- drivers/ide/ide-cd.c | 1 + drivers/ide/ide-dma-sff.c | 3 --- drivers/ide/ide-dma.c | 4 ++++ drivers/ide/ns87415.c | 1 - drivers/ide/pmac.c | 3 --- drivers/ide/sc1200.c | 2 -- drivers/ide/scc_pata.c | 3 +-- drivers/ide/sgiioc4.c | 3 --- drivers/ide/trm290.c | 8 ++------ drivers/ide/tx4939ide.c | 4 ---- 14 files changed, 11 insertions(+), 36 deletions(-) diff --git a/drivers/ide/au1xxx-ide.c b/drivers/ide/au1xxx-ide.c index 239643806b97..549bbb2755a8 100644 --- a/drivers/ide/au1xxx-ide.c +++ b/drivers/ide/au1xxx-ide.c @@ -290,7 +290,6 @@ static int auide_dma_setup(ide_drive_t *drive, struct ide_cmd *cmd) if (auide_build_dmatable(drive, cmd) == 0) return 1; - drive->waiting_for_dma = 1; return 0; } @@ -315,16 +314,11 @@ static void auide_dma_host_set(ide_drive_t *drive, int on) static void auide_ddma_tx_callback(int irq, void *param) { - _auide_hwif *ahwif = (_auide_hwif*)param; - ahwif->drive->waiting_for_dma = 0; } static void auide_ddma_rx_callback(int irq, void *param) { - _auide_hwif *ahwif = (_auide_hwif*)param; - ahwif->drive->waiting_for_dma = 0; } - #endif /* end CONFIG_BLK_DEV_IDE_AU1XXX_MDMA2_DBDMA */ static void auide_init_dbdma_dev(dbdev_tab_t *dev, u32 dev_id, u32 tsize, u32 devwidth, u32 flags) diff --git a/drivers/ide/cmd64x.c b/drivers/ide/cmd64x.c index f2edf280ef8b..80b777e4247b 100644 --- a/drivers/ide/cmd64x.c +++ b/drivers/ide/cmd64x.c @@ -318,7 +318,6 @@ static int cmd646_1_dma_end(ide_drive_t *drive) ide_hwif_t *hwif = drive->hwif; u8 dma_stat = 0, dma_cmd = 0; - drive->waiting_for_dma = 0; /* get DMA status */ dma_stat = inb(hwif->dma_base + ATA_DMA_STATUS); /* read DMA command state */ diff --git a/drivers/ide/icside.c b/drivers/ide/icside.c index 9bf57d7c8e57..4e16ce68b063 100644 --- a/drivers/ide/icside.c +++ b/drivers/ide/icside.c @@ -287,8 +287,6 @@ static int icside_dma_end(ide_drive_t *drive) ide_hwif_t *hwif = drive->hwif; struct expansion_card *ec = ECARD_DEV(hwif->dev); - drive->waiting_for_dma = 0; - disable_dma(ec->dma); return get_dma_residue(ec->dma) != 0; @@ -343,8 +341,6 @@ static int icside_dma_setup(ide_drive_t *drive, struct ide_cmd *cmd) set_dma_sg(ec->dma, hwif->sg_table, cmd->sg_nents); set_dma_mode(ec->dma, dma_mode); - drive->waiting_for_dma = 1; - return 0; } diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c index 89d2339bdef3..db6e617790bd 100644 --- a/drivers/ide/ide-atapi.c +++ b/drivers/ide/ide-atapi.c @@ -342,8 +342,10 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive) stat = tp_ops->read_status(hwif); if (pc->flags & PC_FLAG_DMA_IN_PROGRESS) { - int rc = hwif->dma_ops->dma_end(drive); + int rc; + drive->waiting_for_dma = 0; + rc = hwif->dma_ops->dma_end(drive); ide_destroy_dmatable(drive); if (rc || (drive->media == ide_tape && (stat & ATA_ERR))) { diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c index 4a0d66ee9547..f5c7bb739f45 100644 --- a/drivers/ide/ide-cd.c +++ b/drivers/ide/ide-cd.c @@ -638,6 +638,7 @@ static ide_startstop_t cdrom_newpc_intr(ide_drive_t *drive) dma = drive->dma; if (dma) { drive->dma = 0; + drive->waiting_for_dma = 0; dma_error = hwif->dma_ops->dma_end(drive); ide_destroy_dmatable(drive); if (dma_error) { diff --git a/drivers/ide/ide-dma-sff.c b/drivers/ide/ide-dma-sff.c index e10054b827a0..f8c7d0cd2ff8 100644 --- a/drivers/ide/ide-dma-sff.c +++ b/drivers/ide/ide-dma-sff.c @@ -216,7 +216,6 @@ int ide_dma_setup(ide_drive_t *drive, struct ide_cmd *cmd) /* clear INTR & ERROR flags */ ide_dma_sff_write_status(hwif, dma_stat | ATA_DMA_ERR | ATA_DMA_INTR); - drive->waiting_for_dma = 1; return 0; } EXPORT_SYMBOL_GPL(ide_dma_setup); @@ -290,8 +289,6 @@ int ide_dma_end(ide_drive_t *drive) ide_hwif_t *hwif = drive->hwif; u8 dma_stat = 0, dma_cmd = 0, mask; - drive->waiting_for_dma = 0; - /* stop DMA */ if (hwif->host_flags & IDE_HFLAG_MMIO) { dma_cmd = readb((void __iomem *)(hwif->dma_base + ATA_DMA_CMD)); diff --git a/drivers/ide/ide-dma.c b/drivers/ide/ide-dma.c index d61f9a8cc18a..4d3102887d9c 100644 --- a/drivers/ide/ide-dma.c +++ b/drivers/ide/ide-dma.c @@ -91,6 +91,7 @@ ide_startstop_t ide_dma_intr(ide_drive_t *drive) ide_hwif_t *hwif = drive->hwif; u8 stat = 0, dma_stat = 0; + drive->waiting_for_dma = 0; dma_stat = hwif->dma_ops->dma_end(drive); ide_destroy_dmatable(drive); stat = hwif->tp_ops->read_status(hwif); @@ -479,6 +480,7 @@ ide_startstop_t ide_dma_timeout_retry(ide_drive_t *drive, int error) if (error < 0) { printk(KERN_WARNING "%s: DMA timeout error\n", drive->name); + drive->waiting_for_dma = 0; (void)dma_ops->dma_end(drive); ide_destroy_dmatable(drive); ret = ide_error(drive, "dma timeout error", @@ -491,6 +493,7 @@ ide_startstop_t ide_dma_timeout_retry(ide_drive_t *drive, int error) if (dma_ops->dma_test_irq(drive) == 0) { ide_dump_status(drive, "DMA timeout", hwif->tp_ops->read_status(hwif)); + drive->waiting_for_dma = 0; (void)dma_ops->dma_end(drive); ide_destroy_dmatable(drive); } @@ -577,5 +580,6 @@ int ide_dma_prepare(ide_drive_t *drive, struct ide_cmd *cmd) ide_map_sg(drive, cmd); return 1; } + drive->waiting_for_dma = 1; return 0; } diff --git a/drivers/ide/ns87415.c b/drivers/ide/ns87415.c index 6e0f372d2f09..cbc1d1110385 100644 --- a/drivers/ide/ns87415.c +++ b/drivers/ide/ns87415.c @@ -207,7 +207,6 @@ static int ns87415_dma_end(ide_drive_t *drive) ide_hwif_t *hwif = drive->hwif; u8 dma_stat = 0, dma_cmd = 0; - drive->waiting_for_dma = 0; dma_stat = hwif->dma_ops->dma_sff_read_status(hwif); /* get DMA command mode */ dma_cmd = inb(hwif->dma_base + ATA_DMA_CMD); diff --git a/drivers/ide/pmac.c b/drivers/ide/pmac.c index 1df7b7636453..879c3d8d9f36 100644 --- a/drivers/ide/pmac.c +++ b/drivers/ide/pmac.c @@ -1517,8 +1517,6 @@ static int pmac_ide_dma_setup(ide_drive_t *drive, struct ide_cmd *cmd) (void)readl(PMAC_IDE_REG(IDE_TIMING_CONFIG)); } - drive->waiting_for_dma = 1; - return 0; } @@ -1553,7 +1551,6 @@ pmac_ide_dma_end (ide_drive_t *drive) volatile struct dbdma_regs __iomem *dma = pmif->dma_regs; u32 dstat; - drive->waiting_for_dma = 0; dstat = readl(&dma->status); writel(((RUN|WAKE|DEAD) << 16), &dma->control); diff --git a/drivers/ide/sc1200.c b/drivers/ide/sc1200.c index d9c47034bedd..13e3988f00f0 100644 --- a/drivers/ide/sc1200.c +++ b/drivers/ide/sc1200.c @@ -183,8 +183,6 @@ static int sc1200_dma_end(ide_drive_t *drive) outb(dma_stat|0x1b, dma_base+2); /* clear the INTR & ERROR bits */ outb(inb(dma_base)&~1, dma_base); /* !! DO THIS HERE !! stop DMA */ - drive->waiting_for_dma = 0; - return (dma_stat & 7) != 4; /* verify good DMA status */ } diff --git a/drivers/ide/scc_pata.c b/drivers/ide/scc_pata.c index 2cdf51d8917a..542419ffa9b7 100644 --- a/drivers/ide/scc_pata.c +++ b/drivers/ide/scc_pata.c @@ -335,7 +335,7 @@ static int scc_dma_setup(ide_drive_t *drive, struct ide_cmd *cmd) /* clear INTR & ERROR flags */ out_be32((void __iomem *)(hwif->dma_base + 4), dma_stat | 6); - drive->waiting_for_dma = 1; + return 0; } @@ -354,7 +354,6 @@ static int __scc_dma_end(ide_drive_t *drive) ide_hwif_t *hwif = drive->hwif; u8 dma_stat, dma_cmd; - drive->waiting_for_dma = 0; /* get DMA command mode */ dma_cmd = scc_ide_inb(hwif->dma_base); /* stop DMA */ diff --git a/drivers/ide/sgiioc4.c b/drivers/ide/sgiioc4.c index d90e8c5af0a2..cb2657c4c976 100644 --- a/drivers/ide/sgiioc4.c +++ b/drivers/ide/sgiioc4.c @@ -258,8 +258,6 @@ static int sgiioc4_dma_end(ide_drive_t *drive) } } - drive->waiting_for_dma = 0; - return dma_stat; } @@ -412,7 +410,6 @@ sgiioc4_configure_for_dma(int dma_direction, ide_drive_t * drive) writel(ending_dma_addr, (void __iomem *)(dma_base + IOC4_DMA_END_ADDR * 4)); writel(dma_direction, (void __iomem *)ioc4_dma_addr); - drive->waiting_for_dma = 1; } /* IOC4 Scatter Gather list Format */ diff --git a/drivers/ide/trm290.c b/drivers/ide/trm290.c index 0be27cae27d5..c0528f27fcae 100644 --- a/drivers/ide/trm290.c +++ b/drivers/ide/trm290.c @@ -198,9 +198,9 @@ static int trm290_dma_setup(ide_drive_t *drive, struct ide_cmd *cmd) return 1; outl(hwif->dmatable_dma | rw, hwif->dma_base); - drive->waiting_for_dma = 1; /* start DMA */ outw(count * 2 - 1, hwif->dma_base + 2); + return 0; } @@ -211,11 +211,7 @@ static void trm290_dma_start(ide_drive_t *drive) static int trm290_dma_end(ide_drive_t *drive) { - u16 status; - - drive->waiting_for_dma = 0; - - status = inw(drive->hwif->dma_base + 2); + u16 status = inw(drive->hwif->dma_base + 2); trm290_prepare_drive(drive, 0); diff --git a/drivers/ide/tx4939ide.c b/drivers/ide/tx4939ide.c index 7267c46c07cd..faf0d97b9ce8 100644 --- a/drivers/ide/tx4939ide.c +++ b/drivers/ide/tx4939ide.c @@ -304,8 +304,6 @@ static int tx4939ide_dma_setup(ide_drive_t *drive, struct ide_cmd *cmd) /* clear INTR & ERROR flags */ tx4939ide_clear_dma_status(base); - drive->waiting_for_dma = 1; - tx4939ide_writew(SECTOR_SIZE / 2, base, drive->dn ? TX4939IDE_Xfer_Cnt_2 : TX4939IDE_Xfer_Cnt_1); @@ -321,8 +319,6 @@ static int tx4939ide_dma_end(ide_drive_t *drive) void __iomem *base = TX4939IDE_BASE(hwif); u16 ctl = tx4939ide_readw(base, TX4939IDE_Int_Ctl); - drive->waiting_for_dma = 0; - /* get DMA command mode */ dma_cmd = tx4939ide_readb(base, TX4939IDE_DMA_Cmd); /* stop DMA */ From f094d4d83bccee9277ddb6aadccf35747889426b Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Tue, 31 Mar 2009 20:15:24 +0200 Subject: [PATCH 265/478] ide: sanitize ide_build_sglist() and ide_destroy_dmatable() * Move ide_map_sg() calls out from ide_build_sglist() to ide_dma_prepare(). * Pass command to ide_destroy_dmatable(). * Rename ide_build_sglist() to ide_dma_map_sg() and ide_destroy_dmatable() to ide_dma_unmap_sg(). There should be no functional changes caused by this patch. Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/ide-atapi.c | 3 ++- drivers/ide/ide-cd.c | 2 +- drivers/ide/ide-dma.c | 50 ++++++++++++++++++++--------------------- drivers/ide/sgiioc4.c | 7 +++--- include/linux/ide.h | 6 ++--- 5 files changed, 35 insertions(+), 33 deletions(-) diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c index db6e617790bd..3f3fc7c7b2fe 100644 --- a/drivers/ide/ide-atapi.c +++ b/drivers/ide/ide-atapi.c @@ -326,6 +326,7 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive) { struct ide_atapi_pc *pc = drive->pc; ide_hwif_t *hwif = drive->hwif; + struct ide_cmd *cmd = &hwif->cmd; struct request *rq = hwif->rq; const struct ide_tp_ops *tp_ops = hwif->tp_ops; xfer_func_t *xferfunc; @@ -346,7 +347,7 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive) drive->waiting_for_dma = 0; rc = hwif->dma_ops->dma_end(drive); - ide_destroy_dmatable(drive); + ide_dma_unmap_sg(drive, cmd); if (rc || (drive->media == ide_tape && (stat & ATA_ERR))) { if (drive->media == ide_floppy) diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c index f5c7bb739f45..35729a47f797 100644 --- a/drivers/ide/ide-cd.c +++ b/drivers/ide/ide-cd.c @@ -640,7 +640,7 @@ static ide_startstop_t cdrom_newpc_intr(ide_drive_t *drive) drive->dma = 0; drive->waiting_for_dma = 0; dma_error = hwif->dma_ops->dma_end(drive); - ide_destroy_dmatable(drive); + ide_dma_unmap_sg(drive, cmd); if (dma_error) { printk(KERN_ERR PFX "%s: DMA %s error\n", drive->name, write ? "write" : "read"); diff --git a/drivers/ide/ide-dma.c b/drivers/ide/ide-dma.c index 4d3102887d9c..a5612eadc306 100644 --- a/drivers/ide/ide-dma.c +++ b/drivers/ide/ide-dma.c @@ -89,17 +89,16 @@ static const struct drive_list_entry drive_blacklist[] = { ide_startstop_t ide_dma_intr(ide_drive_t *drive) { ide_hwif_t *hwif = drive->hwif; + struct ide_cmd *cmd = &hwif->cmd; u8 stat = 0, dma_stat = 0; drive->waiting_for_dma = 0; dma_stat = hwif->dma_ops->dma_end(drive); - ide_destroy_dmatable(drive); + ide_dma_unmap_sg(drive, cmd); stat = hwif->tp_ops->read_status(hwif); if (OK_STAT(stat, DRIVE_READY, drive->bad_wstat | ATA_DRQ)) { if (!dma_stat) { - struct ide_cmd *cmd = &hwif->cmd; - if ((cmd->tf_flags & IDE_TFLAG_FS) == 0) ide_finish_cmd(drive, cmd, stat); else @@ -119,8 +118,8 @@ int ide_dma_good_drive(ide_drive_t *drive) } /** - * ide_build_sglist - map IDE scatter gather for DMA I/O - * @drive: the drive to build the DMA table for + * ide_dma_map_sg - map IDE scatter gather for DMA I/O + * @drive: the drive to map the DMA table for * @cmd: command * * Perform the DMA mapping magic necessary to access the source or @@ -129,23 +128,19 @@ int ide_dma_good_drive(ide_drive_t *drive) * operate in a portable fashion. */ -static int ide_build_sglist(ide_drive_t *drive, struct ide_cmd *cmd) +static int ide_dma_map_sg(ide_drive_t *drive, struct ide_cmd *cmd) { ide_hwif_t *hwif = drive->hwif; struct scatterlist *sg = hwif->sg_table; int i; - ide_map_sg(drive, cmd); - if (cmd->tf_flags & IDE_TFLAG_WRITE) cmd->sg_dma_direction = DMA_TO_DEVICE; else cmd->sg_dma_direction = DMA_FROM_DEVICE; i = dma_map_sg(hwif->dev, sg, cmd->sg_nents, cmd->sg_dma_direction); - if (i == 0) - ide_map_sg(drive, cmd); - else { + if (i) { cmd->orig_sg_nents = cmd->sg_nents; cmd->sg_nents = i; } @@ -154,7 +149,7 @@ static int ide_build_sglist(ide_drive_t *drive, struct ide_cmd *cmd) } /** - * ide_destroy_dmatable - clean up DMA mapping + * ide_dma_unmap_sg - clean up DMA mapping * @drive: The drive to unmap * * Teardown mappings after DMA has completed. This must be called @@ -164,15 +159,14 @@ static int ide_build_sglist(ide_drive_t *drive, struct ide_cmd *cmd) * time. */ -void ide_destroy_dmatable(ide_drive_t *drive) +void ide_dma_unmap_sg(ide_drive_t *drive, struct ide_cmd *cmd) { ide_hwif_t *hwif = drive->hwif; - struct ide_cmd *cmd = &hwif->cmd; dma_unmap_sg(hwif->dev, hwif->sg_table, cmd->orig_sg_nents, cmd->sg_dma_direction); } -EXPORT_SYMBOL_GPL(ide_destroy_dmatable); +EXPORT_SYMBOL_GPL(ide_dma_unmap_sg); /** * ide_dma_off_quietly - Generic DMA kill @@ -471,6 +465,7 @@ ide_startstop_t ide_dma_timeout_retry(ide_drive_t *drive, int error) { ide_hwif_t *hwif = drive->hwif; const struct ide_dma_ops *dma_ops = hwif->dma_ops; + struct ide_cmd *cmd = &hwif->cmd; struct request *rq; ide_startstop_t ret = ide_stopped; @@ -482,7 +477,7 @@ ide_startstop_t ide_dma_timeout_retry(ide_drive_t *drive, int error) printk(KERN_WARNING "%s: DMA timeout error\n", drive->name); drive->waiting_for_dma = 0; (void)dma_ops->dma_end(drive); - ide_destroy_dmatable(drive); + ide_dma_unmap_sg(drive, cmd); ret = ide_error(drive, "dma timeout error", hwif->tp_ops->read_status(hwif)); } else { @@ -495,7 +490,7 @@ ide_startstop_t ide_dma_timeout_retry(ide_drive_t *drive, int error) hwif->tp_ops->read_status(hwif)); drive->waiting_for_dma = 0; (void)dma_ops->dma_end(drive); - ide_destroy_dmatable(drive); + ide_dma_unmap_sg(drive, cmd); } } @@ -572,14 +567,19 @@ int ide_dma_prepare(ide_drive_t *drive, struct ide_cmd *cmd) const struct ide_dma_ops *dma_ops = drive->hwif->dma_ops; if ((drive->dev_flags & IDE_DFLAG_USING_DMA) == 0 || - (dma_ops->dma_check && dma_ops->dma_check(drive, cmd)) || - ide_build_sglist(drive, cmd) == 0) - return 1; - if (dma_ops->dma_setup(drive, cmd)) { - ide_destroy_dmatable(drive); - ide_map_sg(drive, cmd); - return 1; - } + (dma_ops->dma_check && dma_ops->dma_check(drive, cmd))) + goto out; + ide_map_sg(drive, cmd); + if (ide_dma_map_sg(drive, cmd) == 0) + goto out_map; + if (dma_ops->dma_setup(drive, cmd)) + goto out_dma_unmap; drive->waiting_for_dma = 1; return 0; +out_dma_unmap: + ide_dma_unmap_sg(drive, cmd); +out_map: + ide_map_sg(drive, cmd); +out: + return 1; } diff --git a/drivers/ide/sgiioc4.c b/drivers/ide/sgiioc4.c index cb2657c4c976..6ef5a567d377 100644 --- a/drivers/ide/sgiioc4.c +++ b/drivers/ide/sgiioc4.c @@ -277,11 +277,12 @@ static void sgiioc4_dma_host_set(ide_drive_t *drive, int on) sgiioc4_clearirq(drive); } -static void -sgiioc4_resetproc(ide_drive_t * drive) +static void sgiioc4_resetproc(ide_drive_t *drive) { + struct ide_cmd *cmd = &drive->hwif->cmd; + sgiioc4_dma_end(drive); - ide_destroy_dmatable(drive); + ide_dma_unmap_sg(drive, cmd); sgiioc4_clearirq(drive); } diff --git a/include/linux/ide.h b/include/linux/ide.h index b350667b83ad..03c520917b7a 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -1445,8 +1445,7 @@ int ide_allocate_dma_engine(ide_hwif_t *); void ide_release_dma_engine(ide_hwif_t *); int ide_dma_prepare(ide_drive_t *, struct ide_cmd *); - -void ide_destroy_dmatable(ide_drive_t *); +void ide_dma_unmap_sg(ide_drive_t *, struct ide_cmd *); #ifdef CONFIG_BLK_DEV_IDEDMA_SFF int config_drive_for_dma(ide_drive_t *); @@ -1481,7 +1480,8 @@ static inline ide_startstop_t ide_dma_timeout_retry(ide_drive_t *drive, int erro static inline void ide_release_dma_engine(ide_hwif_t *hwif) { ; } static inline int ide_dma_prepare(ide_drive_t *drive, struct ide_cmd *cmd) { return 1; } -static inline void ide_destroy_dmatable(ide_drive_t *drive) { ; } +static inline void ide_dma_unmap_sg(ide_drive_t *drive, + struct ide_cmd *cmd) { ; } #endif /* CONFIG_BLK_DEV_IDEDMA */ #ifdef CONFIG_BLK_DEV_IDEACPI From 52913ab2c6f760c2af9f9396765ce8fa1a2baf17 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Tue, 31 Mar 2009 20:15:24 +0200 Subject: [PATCH 266/478] ide-generic: remove no longer needed sysfs interface Nowadays we have "ide_generic.probe_mask=" module parameter and ide_platform host driver so sysfs interface for adding IDE interfaces is no longer needed. Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/ide-generic.c | 68 +-------------------------------------- 1 file changed, 1 insertion(+), 67 deletions(-) diff --git a/drivers/ide/ide-generic.c b/drivers/ide/ide-generic.c index 9d03e8211536..3a93e4c41bf7 100644 --- a/drivers/ide/ide-generic.c +++ b/drivers/ide/ide-generic.c @@ -1,20 +1,12 @@ /* * generic/default IDE host driver * - * Copyright (C) 2004, 2008 Bartlomiej Zolnierkiewicz + * Copyright (C) 2004, 2008-2009 Bartlomiej Zolnierkiewicz * This code was split off from ide.c. See it for original copyrights. * * May be copied or modified under the terms of the GNU General Public License. */ -/* - * For special cases new interfaces may be added using sysfs, i.e. - * - * echo -n "0x168:0x36e:10" > /sys/class/ide_generic/add - * - * will add an interface using I/O ports 0x168-0x16f/0x36e and IRQ 10. - */ - #include #include #include @@ -36,60 +28,6 @@ static const struct ide_port_info ide_generic_port_info = { .host_flags = IDE_HFLAG_NO_DMA, }; -static ssize_t store_add(struct class *cls, const char *buf, size_t n) -{ - unsigned int base, ctl; - int irq, rc; - hw_regs_t hw, *hws[] = { &hw, NULL, NULL, NULL }; - - if (sscanf(buf, "%x:%x:%d", &base, &ctl, &irq) != 3) - return -EINVAL; - - memset(&hw, 0, sizeof(hw)); - ide_std_init_ports(&hw, base, ctl); - hw.irq = irq; - hw.chipset = ide_generic; - - rc = ide_host_add(&ide_generic_port_info, hws, NULL); - if (rc) - return rc; - - return n; -}; - -static struct class_attribute ide_generic_class_attrs[] = { - __ATTR(add, S_IWUSR, NULL, store_add), - __ATTR_NULL -}; - -static void ide_generic_class_release(struct class *cls) -{ - kfree(cls); -} - -static int __init ide_generic_sysfs_init(void) -{ - struct class *cls; - int rc; - - cls = kzalloc(sizeof(*cls), GFP_KERNEL); - if (!cls) - return -ENOMEM; - - cls->name = DRV_NAME; - cls->owner = THIS_MODULE; - cls->class_release = ide_generic_class_release; - cls->class_attrs = ide_generic_class_attrs; - - rc = class_register(cls); - if (rc) { - kfree(cls); - return rc; - } - - return 0; -} - #if defined(CONFIG_PLAT_M32700UT) || defined(CONFIG_PLAT_MAPPI2) \ || defined(CONFIG_PLAT_OPSPUT) static const u16 legacy_bases[] = { 0x1f0 }; @@ -196,10 +134,6 @@ static int __init ide_generic_init(void) } } - if (ide_generic_sysfs_init()) - printk(KERN_ERR DRV_NAME ": failed to create ide_generic " - "class\n"); - return rc; } From 4465461ece2b9249d6c0cf57bc0002100823e361 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Tue, 31 Mar 2009 20:15:24 +0200 Subject: [PATCH 267/478] ide: merge ide_arm and ide_generic host drivers There is no need for a separate ide_arm host driver nowadays so merge it into ide_generic one. While at it: - return -EBUSY from ide_generic_init() if I/O resources are busy - scale down ide_generic_check_pci_legacy_iobases() for CONFIG_PCI=n Cc: Russell King Cc: Alexander Schulz Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/Kconfig | 8 ++---- drivers/ide/Makefile | 2 -- drivers/ide/ide-generic.c | 18 +++++++++---- drivers/ide/ide_arm.c | 53 --------------------------------------- 4 files changed, 15 insertions(+), 66 deletions(-) delete mode 100644 drivers/ide/ide_arm.c diff --git a/drivers/ide/Kconfig b/drivers/ide/Kconfig index 640c99207242..896424445f65 100644 --- a/drivers/ide/Kconfig +++ b/drivers/ide/Kconfig @@ -222,7 +222,8 @@ comment "IDE chipset support/bugfixes" config IDE_GENERIC tristate "generic/default IDE chipset support" - depends on ALPHA || X86 || IA64 || M32R || MIPS + depends on ALPHA || X86 || IA64 || M32R || MIPS || ARCH_RPC || ARCH_SHARK + default ARM && (ARCH_RPC || ARCH_SHARK) help This is the generic IDE driver. This driver attaches to the fixed legacy ports (e.g. on PCs 0x1f0/0x170, 0x1e8/0x168 and @@ -731,11 +732,6 @@ config BLK_DEV_IDE_AT91 depends on ARM && ARCH_AT91 && !ARCH_AT91RM9200 && !ARCH_AT91X40 select IDE_TIMINGS -config IDE_ARM - tristate "ARM IDE support" - depends on ARM && (ARCH_RPC || ARCH_SHARK) - default y - config BLK_DEV_IDE_ICSIDE tristate "ICS IDE interface support" depends on ARM && ARCH_ACORN diff --git a/drivers/ide/Makefile b/drivers/ide/Makefile index 9b4bbe1cdc1a..81df925f0e8b 100644 --- a/drivers/ide/Makefile +++ b/drivers/ide/Makefile @@ -21,8 +21,6 @@ ide-core-$(CONFIG_IDE_LEGACY) += ide-legacy.o obj-$(CONFIG_IDE) += ide-core.o -obj-$(CONFIG_IDE_ARM) += ide_arm.o - obj-$(CONFIG_BLK_DEV_ALI14XX) += ali14xx.o obj-$(CONFIG_BLK_DEV_UMC8672) += umc8672.o obj-$(CONFIG_BLK_DEV_DTC2278) += dtc2278.o diff --git a/drivers/ide/ide-generic.c b/drivers/ide/ide-generic.c index 3a93e4c41bf7..7812ca0be13b 100644 --- a/drivers/ide/ide-generic.c +++ b/drivers/ide/ide-generic.c @@ -13,7 +13,10 @@ #include #include -/* FIXME: convert m32r to use ide_platform host driver */ +/* FIXME: convert arm and m32r to use ide_platform host driver */ +#ifdef CONFIG_ARM +#include +#endif #ifdef CONFIG_M32R #include #endif @@ -28,8 +31,11 @@ static const struct ide_port_info ide_generic_port_info = { .host_flags = IDE_HFLAG_NO_DMA, }; -#if defined(CONFIG_PLAT_M32700UT) || defined(CONFIG_PLAT_MAPPI2) \ - || defined(CONFIG_PLAT_OPSPUT) +#ifdef CONFIG_ARM +static const u16 legacy_bases[] = { 0x1f0 }; +static const int legacy_irqs[] = { IRQ_HARDDISK }; +#elif defined(CONFIG_PLAT_M32700UT) || defined(CONFIG_PLAT_MAPPI2) || \ + defined(CONFIG_PLAT_OPSPUT) static const u16 legacy_bases[] = { 0x1f0 }; static const int legacy_irqs[] = { PLD_IRQ_CFIREQ }; #elif defined(CONFIG_PLAT_MAPPI3) @@ -45,11 +51,11 @@ static const int legacy_irqs[] = { 14, 15, 11, 10, 8, 12 }; static void ide_generic_check_pci_legacy_iobases(int *primary, int *secondary) { +#ifdef CONFIG_PCI struct pci_dev *p = NULL; u16 val; for_each_pci_dev(p) { - if (pci_resource_start(p, 0) == 0x1f0) *primary = 1; if (pci_resource_start(p, 2) == 0x170) @@ -64,7 +70,6 @@ static void ide_generic_check_pci_legacy_iobases(int *primary, int *secondary) /* Intel MPIIX - PIO ATA on non PCI side of bridge */ if (p->vendor == PCI_VENDOR_ID_INTEL && p->device == PCI_DEVICE_ID_INTEL_82371MX) { - pci_read_config_word(p, 0x6C, &val); if (val & 0x8000) { /* ATA port enabled */ @@ -75,6 +80,7 @@ static void ide_generic_check_pci_legacy_iobases(int *primary, int *secondary) } } } +#endif } static int __init ide_generic_init(void) @@ -106,6 +112,7 @@ static int __init ide_generic_init(void) printk(KERN_ERR "%s: I/O resource 0x%lX-0x%lX " "not free.\n", DRV_NAME, io_addr, io_addr + 7); + rc = -EBUSY; continue; } @@ -114,6 +121,7 @@ static int __init ide_generic_init(void) "not free.\n", DRV_NAME, io_addr + 0x206); release_region(io_addr, 8); + rc = -EBUSY; continue; } diff --git a/drivers/ide/ide_arm.c b/drivers/ide/ide_arm.c deleted file mode 100644 index cf6385446ece..000000000000 --- a/drivers/ide/ide_arm.c +++ /dev/null @@ -1,53 +0,0 @@ -/* - * ARM default IDE host driver - * - * Copyright (C) 2004 Bartlomiej Zolnierkiewicz - * Based on code by: Russell King, Ian Molton and Alexander Schulz. - * - * May be copied or modified under the terms of the GNU General Public License. - */ - -#include -#include -#include - -#include - -#define DRV_NAME "ide_arm" - -#define IDE_ARM_IO 0x1f0 -#define IDE_ARM_IRQ IRQ_HARDDISK - -static const struct ide_port_info ide_arm_port_info = { - .host_flags = IDE_HFLAG_NO_DMA, -}; - -static int __init ide_arm_init(void) -{ - unsigned long base = IDE_ARM_IO, ctl = IDE_ARM_IO + 0x206; - hw_regs_t hw, *hws[] = { &hw, NULL, NULL, NULL }; - - if (!request_region(base, 8, DRV_NAME)) { - printk(KERN_ERR "%s: I/O resource 0x%lX-0x%lX not free.\n", - DRV_NAME, base, base + 7); - return -EBUSY; - } - - if (!request_region(ctl, 1, DRV_NAME)) { - printk(KERN_ERR "%s: I/O resource 0x%lX not free.\n", - DRV_NAME, ctl); - release_region(base, 8); - return -EBUSY; - } - - memset(&hw, 0, sizeof(hw)); - ide_std_init_ports(&hw, base, ctl); - hw.irq = IDE_ARM_IRQ; - hw.chipset = ide_generic; - - return ide_host_add(&ide_arm_port_info, hws, NULL); -} - -module_init(ide_arm_init); - -MODULE_LICENSE("GPL"); From b5479167f4206e0d821a51ae149d921cd7a58e54 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Tue, 31 Mar 2009 20:15:25 +0200 Subject: [PATCH 268/478] ide: fix locking in drive_release_dev() * Request queue cleanup should happen before freeing drive->id and marking device as non-present. Fix it. * Remove superfluous hwif->lock acquiring/releasing. Cc: Stanislaw Gruszka Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/ide-probe.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/drivers/ide/ide-probe.c b/drivers/ide/ide-probe.c index 548864510ba9..7c1f1bf81836 100644 --- a/drivers/ide/ide-probe.c +++ b/drivers/ide/ide-probe.c @@ -942,20 +942,16 @@ EXPORT_SYMBOL_GPL(ide_init_disk); static void drive_release_dev (struct device *dev) { ide_drive_t *drive = container_of(dev, ide_drive_t, gendev); - ide_hwif_t *hwif = drive->hwif; ide_proc_unregister_device(drive); - spin_lock_irq(&hwif->lock); + blk_cleanup_queue(drive->queue); + drive->queue = NULL; + kfree(drive->id); drive->id = NULL; + drive->dev_flags &= ~IDE_DFLAG_PRESENT; - /* Messed up locking ... */ - spin_unlock_irq(&hwif->lock); - blk_cleanup_queue(drive->queue); - spin_lock_irq(&hwif->lock); - drive->queue = NULL; - spin_unlock_irq(&hwif->lock); complete(&drive->gendev_rel_comp); } From 41fa9f863baacd32dd049daf8050d55a0c9e6f1a Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Tue, 31 Mar 2009 20:15:25 +0200 Subject: [PATCH 269/478] ide: decrease size of ->pc_buf field in struct ide_atapi_pc struct ide_atapi_pc is often allocated on the stack and size of ->pc_buf size is 256 bytes. However since only ide_floppy_create_read_capacity_cmd() and idetape_create_inquiry_cmd() require such size allocate buffers for these pc-s explicitely and decrease ->pc_buf size to 64 bytes. Cc: Borislav Petkov Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/ide-floppy.c | 5 ++++- drivers/ide/ide-floppy_ioctl.c | 5 ++++- drivers/ide/ide-tape.c | 4 ++++ include/linux/ide.h | 2 +- 4 files changed, 13 insertions(+), 3 deletions(-) diff --git a/drivers/ide/ide-floppy.c b/drivers/ide/ide-floppy.c index 7ae662334835..0faae3098295 100644 --- a/drivers/ide/ide-floppy.c +++ b/drivers/ide/ide-floppy.c @@ -385,7 +385,7 @@ static int ide_floppy_get_capacity(ide_drive_t *drive) struct gendisk *disk = floppy->disk; struct ide_atapi_pc pc; u8 *cap_desc; - u8 header_len, desc_cnt; + u8 pc_buf[256], header_len, desc_cnt; int i, rc = 1, blocks, length; drive->bios_cyl = 0; @@ -395,6 +395,9 @@ static int ide_floppy_get_capacity(ide_drive_t *drive) drive->capacity64 = 0; ide_floppy_create_read_capacity_cmd(&pc); + pc.buf = &pc_buf[0]; + pc.buf_size = sizeof(pc_buf); + if (ide_queue_pc_tail(drive, disk, &pc)) { printk(KERN_ERR PFX "Can't get floppy parameters\n"); return 1; diff --git a/drivers/ide/ide-floppy_ioctl.c b/drivers/ide/ide-floppy_ioctl.c index 8f8be8546038..cd8a42027ede 100644 --- a/drivers/ide/ide-floppy_ioctl.c +++ b/drivers/ide/ide-floppy_ioctl.c @@ -36,9 +36,9 @@ static int ide_floppy_get_format_capacities(ide_drive_t *drive, int __user *arg) { struct ide_disk_obj *floppy = drive->driver_data; - u8 header_len, desc_cnt; int i, blocks, length, u_array_size, u_index; int __user *argp; + u8 pc_buf[256], header_len, desc_cnt; if (get_user(u_array_size, arg)) return -EFAULT; @@ -47,6 +47,9 @@ static int ide_floppy_get_format_capacities(ide_drive_t *drive, return -EINVAL; ide_floppy_create_read_capacity_cmd(pc); + pc->buf = &pc_buf[0]; + pc->buf_size = sizeof(pc_buf); + if (ide_queue_pc_tail(drive, floppy->disk, pc)) { printk(KERN_ERR "ide-floppy: Can't get floppy parameters\n"); return -EIO; diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c index 64dfa7458f8d..cafc67d9e2e8 100644 --- a/drivers/ide/ide-tape.c +++ b/drivers/ide/ide-tape.c @@ -2014,9 +2014,13 @@ static void idetape_get_inquiry_results(ide_drive_t *drive) { idetape_tape_t *tape = drive->driver_data; struct ide_atapi_pc pc; + u8 pc_buf[256]; char fw_rev[4], vendor_id[8], product_id[16]; idetape_create_inquiry_cmd(&pc); + pc.buf = &pc_buf[0]; + pc.buf_size = sizeof(pc_buf); + if (ide_queue_pc_tail(drive, tape->disk, &pc)) { printk(KERN_ERR "ide-tape: %s: can't get INQUIRY results\n", tape->name); diff --git a/include/linux/ide.h b/include/linux/ide.h index 03c520917b7a..0f48fbd46028 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -377,7 +377,7 @@ enum { * With each packet command, we allocate a buffer of IDE_PC_BUFFER_SIZE bytes. * This is used for several packet commands (not for READ/WRITE commands). */ -#define IDE_PC_BUFFER_SIZE 256 +#define IDE_PC_BUFFER_SIZE 64 #define ATAPI_WAIT_PC (60 * HZ) struct ide_atapi_pc { From 9f5af4d667a6d4ebd66019b4b26b445ddbae6d6c Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Tue, 31 Mar 2009 20:15:26 +0200 Subject: [PATCH 270/478] ide: remove CONFIG_BLK_DEV_IDEDOUBLER config option Nowadays it is not worth having a separate config option for Amiga IDE Doubler support so always include it (it still needs to be explicitly enabled by module parameter). Cc: Geert Uytterhoeven Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/Kconfig | 25 +++++++++---------------- drivers/ide/gayle.c | 12 +----------- 2 files changed, 10 insertions(+), 27 deletions(-) diff --git a/drivers/ide/Kconfig b/drivers/ide/Kconfig index 896424445f65..4d2f2a6b5bdb 100644 --- a/drivers/ide/Kconfig +++ b/drivers/ide/Kconfig @@ -770,28 +770,21 @@ config BLK_DEV_GAYLE This includes on-board IDE interfaces on some Amiga models (A600, A1200, A4000, and A4000T), and IDE interfaces on the Zorro expansion bus (M-Tech E-Matrix 530 expansion card). - Say Y if you have an Amiga with a Gayle IDE interface and want to use - IDE devices (hard disks, CD-ROM drives, etc.) that are connected to - it. - Note that you also have to enable Zorro bus support if you want to - use Gayle IDE interfaces on the Zorro expansion bus. -config BLK_DEV_IDEDOUBLER - bool "Amiga IDE Doubler support (EXPERIMENTAL)" - depends on BLK_DEV_GAYLE && EXPERIMENTAL - ---help--- - This feature provides support for the so-called `IDE doublers' (made + It also provides support for the so-called `IDE doublers' (made by various manufacturers, e.g. Eyetech) that can be connected to the on-board IDE interface of some Amiga models. Using such an IDE doubler, you can connect up to four instead of two IDE devices to - the Amiga's on-board IDE interface. - - Note that the normal Amiga Gayle IDE driver may not work correctly - if you have an IDE doubler and don't enable this feature! - - Say Y if you have an IDE doubler. The feature is enabled at kernel + the Amiga's on-board IDE interface. The feature is enabled at kernel runtime using the "gayle.doubler" kernel boot parameter. + Say Y if you have an Amiga with a Gayle IDE interface and want to use + IDE devices (hard disks, CD-ROM drives, etc.) that are connected to + it. + + Note that you also have to enable Zorro bus support if you want to + use Gayle IDE interfaces on the Zorro expansion bus. + config BLK_DEV_BUDDHA tristate "Buddha/Catweasel/X-Surf IDE interface support (EXPERIMENTAL)" depends on ZORRO && EXPERIMENTAL diff --git a/drivers/ide/gayle.c b/drivers/ide/gayle.c index dc778251cb05..c7119516c5a7 100644 --- a/drivers/ide/gayle.c +++ b/drivers/ide/gayle.c @@ -53,11 +53,6 @@ #define GAYLE_NEXT_PORT 0x1000 -#ifndef CONFIG_BLK_DEV_IDEDOUBLER -#define GAYLE_NUM_HWIFS 1 -#define GAYLE_NUM_PROBE_HWIFS GAYLE_NUM_HWIFS -#define GAYLE_HAS_CONTROL_REG 1 -#else /* CONFIG_BLK_DEV_IDEDOUBLER */ #define GAYLE_NUM_HWIFS 2 #define GAYLE_NUM_PROBE_HWIFS (ide_doubler ? GAYLE_NUM_HWIFS : \ GAYLE_NUM_HWIFS-1) @@ -66,8 +61,6 @@ static int ide_doubler; module_param_named(doubler, ide_doubler, bool, 0); MODULE_PARM_DESC(doubler, "enable support for IDE doublers"); -#endif /* CONFIG_BLK_DEV_IDEDOUBLER */ - /* * Check and acknowledge the interrupt status @@ -151,10 +144,7 @@ static int __init gayle_init(void) found: printk(KERN_INFO "ide: Gayle IDE controller (A%d style%s)\n", a4000 ? 4000 : 1200, -#ifdef CONFIG_BLK_DEV_IDEDOUBLER - ide_doubler ? ", IDE doubler" : -#endif - ""); + ide_doubler ? ", IDE doubler" : ""); if (a4000) { phys_base = GAYLE_BASE_4000; From d93bc4521c80e9d87767779814e88f6d725453d7 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Tue, 31 Mar 2009 20:15:26 +0200 Subject: [PATCH 271/478] ide-{floppy,tape}: fix padding for PIO transfers * Return number of bytes left to transfer from idetape_{in,out}put_buffers() and number of bytes done from ide_tape_io_buffers(). * Fix padding for PIO transfers in ide_pc_intr() so read/write buffers are always completely processed and then the transfer is padded if necessary. * Remove invalid error messages. * Remove now superfluous padding from ide{_io_buffers,tape_input_buffers}(). While at it: * Set pc->bh to NULL in idetape_input_buffers() after all bh-s are done. * Cache !!(pc->flags & PC_FLAG_WRITING) in local variable in ide_pc_intr(). Cc: Borislav Petkov Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/ide-atapi.c | 57 ++++++++++++++--------------------------- drivers/ide/ide-tape.c | 36 +++++++++++++------------- 2 files changed, 37 insertions(+), 56 deletions(-) diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c index 3f3fc7c7b2fe..40f413b20bd8 100644 --- a/drivers/ide/ide-atapi.c +++ b/drivers/ide/ide-atapi.c @@ -110,13 +110,6 @@ int ide_io_buffers(ide_drive_t *drive, struct ide_atapi_pc *pc, } } - if (bcount) { - printk(KERN_ERR "%s: %d leftover bytes, %s\n", drive->name, - bcount, write ? "padding with zeros" - : "discarding data"); - ide_pad_transfer(drive, write, bcount); - } - return done; } EXPORT_SYMBOL_GPL(ide_io_buffers); @@ -330,9 +323,10 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive) struct request *rq = hwif->rq; const struct ide_tp_ops *tp_ops = hwif->tp_ops; xfer_func_t *xferfunc; - unsigned int timeout, temp; + unsigned int timeout, done; u16 bcount; u8 stat, ireason, dsc = 0; + u8 write = !!(pc->flags & PC_FLAG_WRITING); debug_log("Enter %s - interrupt handler\n", __func__); @@ -441,8 +435,7 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive) return ide_do_reset(drive); } - if (((ireason & ATAPI_IO) == ATAPI_IO) == - !!(pc->flags & PC_FLAG_WRITING)) { + if (((ireason & ATAPI_IO) == ATAPI_IO) == write) { /* Hopefully, we will never get here */ printk(KERN_ERR "%s: We wanted to %s, but the device wants us " "to %s!\n", drive->name, @@ -451,45 +444,33 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive) return ide_do_reset(drive); } - if (!(pc->flags & PC_FLAG_WRITING)) { - /* Reading - Check that we have enough space */ - temp = pc->xferred + bcount; - if (temp > pc->req_xfer) { - if (temp > pc->buf_size) { - printk(KERN_ERR "%s: The device wants to send " - "us more data than expected - " - "discarding data\n", - drive->name); - - ide_pad_transfer(drive, 0, bcount); - goto next_irq; - } - debug_log("The device wants to send us more data than " - "expected - allowing transfer\n"); - } - xferfunc = tp_ops->input_data; - } else - xferfunc = tp_ops->output_data; + xferfunc = write ? tp_ops->output_data : tp_ops->input_data; if ((drive->media == ide_floppy && !pc->buf) || (drive->media == ide_tape && pc->bh)) { - int done = drive->pc_io_buffers(drive, pc, bcount, - !!(pc->flags & PC_FLAG_WRITING)); + done = drive->pc_io_buffers(drive, pc, bcount, write); /* FIXME: don't do partial completions */ if (drive->media == ide_floppy) ide_complete_rq(drive, 0, done ? done : ide_rq_bytes(rq)); - } else - xferfunc(drive, NULL, pc->cur_pos, bcount); + } else { + done = min_t(unsigned int, bcount, pc->req_xfer - pc->xferred); + xferfunc(drive, NULL, pc->cur_pos, done); + } /* Update the current position */ - pc->xferred += bcount; - pc->cur_pos += bcount; + pc->xferred += done; + pc->cur_pos += done; + + bcount -= done; + + if (bcount) + ide_pad_transfer(drive, write, bcount); + + debug_log("[cmd %x] transferred %d bytes, padded %d bytes\n", + rq->cmd[0], done, bcount); - debug_log("[cmd %x] transferred %d bytes on that intr.\n", - rq->cmd[0], bcount); -next_irq: /* And set the interrupt handler again */ ide_set_handler(drive, ide_pc_intr, timeout); return ide_started; diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c index cafc67d9e2e8..cb942a9b580f 100644 --- a/drivers/ide/ide-tape.c +++ b/drivers/ide/ide-tape.c @@ -297,19 +297,15 @@ static struct ide_tape_obj *ide_tape_chrdev_get(unsigned int i) return tape; } -static void idetape_input_buffers(ide_drive_t *drive, struct ide_atapi_pc *pc, +static int idetape_input_buffers(ide_drive_t *drive, struct ide_atapi_pc *pc, unsigned int bcount) { struct idetape_bh *bh = pc->bh; int count; while (bcount) { - if (bh == NULL) { - printk(KERN_ERR "ide-tape: bh == NULL in " - "idetape_input_buffers\n"); - ide_pad_transfer(drive, 0, bcount); - return; - } + if (bh == NULL) + break; count = min( (unsigned int)(bh->b_size - atomic_read(&bh->b_count)), bcount); @@ -323,21 +319,21 @@ static void idetape_input_buffers(ide_drive_t *drive, struct ide_atapi_pc *pc, atomic_set(&bh->b_count, 0); } } + pc->bh = bh; + + return bcount; } -static void idetape_output_buffers(ide_drive_t *drive, struct ide_atapi_pc *pc, +static int idetape_output_buffers(ide_drive_t *drive, struct ide_atapi_pc *pc, unsigned int bcount) { struct idetape_bh *bh = pc->bh; int count; while (bcount) { - if (bh == NULL) { - printk(KERN_ERR "ide-tape: bh == NULL in %s\n", - __func__); - return; - } + if (bh == NULL) + break; count = min((unsigned int)pc->b_count, (unsigned int)bcount); drive->hwif->tp_ops->output_data(drive, NULL, pc->b_data, count); bcount -= count; @@ -352,6 +348,8 @@ static void idetape_output_buffers(ide_drive_t *drive, struct ide_atapi_pc *pc, } } } + + return bcount; } static void idetape_update_buffers(ide_drive_t *drive, struct ide_atapi_pc *pc) @@ -563,12 +561,14 @@ static void ide_tape_handle_dsc(ide_drive_t *drive) static int ide_tape_io_buffers(ide_drive_t *drive, struct ide_atapi_pc *pc, unsigned int bcount, int write) { - if (write) - idetape_output_buffers(drive, pc, bcount); - else - idetape_input_buffers(drive, pc, bcount); + unsigned int bleft; - return bcount; + if (write) + bleft = idetape_output_buffers(drive, pc, bcount); + else + bleft = idetape_input_buffers(drive, pc, bcount); + + return bcount - bleft; } /* From 349d12a1fe57d49287a539909cf14f362634342d Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Tue, 31 Mar 2009 20:15:26 +0200 Subject: [PATCH 272/478] ide-floppy: use ide_pio_bytes() * Fix ide_init_sg_cmd() setup for non-fs requests. * Convert ide_pc_intr() to use ide_pio_bytes() for floppy media. * Remove no longer needed ide_io_buffers() and sg/sg_cnt fields from struct ide_atapi_pc. * Remove partial completions; kill idefloppy_update_buffers(), as a result. * Add some more debugging statements. Signed-off-by: Bartlomiej Zolnierkiewicz Signed-off-by: Borislav Petkov Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/ide-atapi.c | 68 ++++++++++------------------------------ drivers/ide/ide-floppy.c | 24 +++----------- include/linux/ide.h | 5 --- 3 files changed, 21 insertions(+), 76 deletions(-) diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c index 40f413b20bd8..100e6f94b4f0 100644 --- a/drivers/ide/ide-atapi.c +++ b/drivers/ide/ide-atapi.c @@ -71,49 +71,6 @@ int ide_check_atapi_device(ide_drive_t *drive, const char *s) } EXPORT_SYMBOL_GPL(ide_check_atapi_device); -/* PIO data transfer routine using the scatter gather table. */ -int ide_io_buffers(ide_drive_t *drive, struct ide_atapi_pc *pc, - unsigned int bcount, int write) -{ - ide_hwif_t *hwif = drive->hwif; - const struct ide_tp_ops *tp_ops = hwif->tp_ops; - xfer_func_t *xf = write ? tp_ops->output_data : tp_ops->input_data; - struct scatterlist *sg = pc->sg; - char *buf; - int count, done = 0; - - while (bcount) { - count = min(sg->length - pc->b_count, bcount); - - if (PageHighMem(sg_page(sg))) { - unsigned long flags; - - local_irq_save(flags); - buf = kmap_atomic(sg_page(sg), KM_IRQ0) + sg->offset; - xf(drive, NULL, buf + pc->b_count, count); - kunmap_atomic(buf - sg->offset, KM_IRQ0); - local_irq_restore(flags); - } else { - buf = sg_virt(sg); - xf(drive, NULL, buf + pc->b_count, count); - } - - bcount -= count; - pc->b_count += count; - done += count; - - if (pc->b_count == sg->length) { - if (!--pc->sg_cnt) - break; - pc->sg = sg = sg_next(sg); - pc->b_count = 0; - } - } - - return done; -} -EXPORT_SYMBOL_GPL(ide_io_buffers); - void ide_init_pc(struct ide_atapi_pc *pc) { memset(pc, 0, sizeof(*pc)); @@ -353,6 +310,9 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive) pc->xferred = pc->req_xfer; if (drive->pc_update_buffers) drive->pc_update_buffers(drive, pc); + + if (drive->media == ide_floppy) + ide_complete_rq(drive, 0, blk_rq_bytes(rq)); } debug_log("%s: DMA finished\n", drive->name); } @@ -408,12 +368,19 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive) rq->errors = 0; ide_complete_rq(drive, 0, blk_rq_bytes(rq)); } else { + unsigned int done; + if (blk_fs_request(rq) == 0 && uptodate <= 0) { if (rq->errors == 0) rq->errors = -EIO; } - ide_complete_rq(drive, uptodate ? 0 : -EIO, - ide_rq_bytes(rq)); + + if (drive->media == ide_tape) + done = ide_rq_bytes(rq); /* FIXME */ + else + done = blk_rq_bytes(rq); + + ide_complete_rq(drive, uptodate ? 0 : -EIO, done); } return ide_stopped; @@ -446,14 +413,11 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive) xferfunc = write ? tp_ops->output_data : tp_ops->input_data; - if ((drive->media == ide_floppy && !pc->buf) || - (drive->media == ide_tape && pc->bh)) { + if (drive->media == ide_floppy && pc->buf == NULL) { + done = min_t(unsigned int, bcount, cmd->nleft); + ide_pio_bytes(drive, cmd, write, done); + } else if (drive->media == ide_tape && pc->bh) { done = drive->pc_io_buffers(drive, pc, bcount, write); - - /* FIXME: don't do partial completions */ - if (drive->media == ide_floppy) - ide_complete_rq(drive, 0, - done ? done : ide_rq_bytes(rq)); } else { done = min_t(unsigned int, bcount, pc->req_xfer - pc->xferred); xferfunc(drive, NULL, pc->cur_pos, done); diff --git a/drivers/ide/ide-floppy.c b/drivers/ide/ide-floppy.c index 0faae3098295..2b4868d95f8b 100644 --- a/drivers/ide/ide-floppy.c +++ b/drivers/ide/ide-floppy.c @@ -61,16 +61,6 @@ */ #define IDEFLOPPY_PC_DELAY (HZ/20) /* default delay for ZIP 100 (50ms) */ -static void idefloppy_update_buffers(ide_drive_t *drive, - struct ide_atapi_pc *pc) -{ - struct request *rq = pc->rq; - struct bio *bio = rq->bio; - - while ((bio = rq->bio) != NULL) - ide_complete_rq(drive, 0, ide_rq_bytes(rq)); -} - static int ide_floppy_callback(ide_drive_t *drive, int dsc) { struct ide_disk_obj *floppy = drive->driver_data; @@ -213,7 +203,6 @@ static void idefloppy_create_rw_cmd(ide_drive_t *drive, memcpy(rq->cmd, pc->c, 12); pc->rq = rq; - pc->b_count = 0; if (rq->cmd_flags & REQ_RW) pc->flags |= PC_FLAG_WRITING; pc->buf = NULL; @@ -227,7 +216,6 @@ static void idefloppy_blockpc_cmd(struct ide_disk_obj *floppy, ide_init_pc(pc); memcpy(pc->c, rq->cmd, sizeof(pc->c)); pc->rq = rq; - pc->b_count = 0; if (rq->data_len && rq_data_dir(rq) == WRITE) pc->flags |= PC_FLAG_WRITING; pc->buf = rq->data; @@ -244,10 +232,11 @@ static ide_startstop_t ide_floppy_do_request(ide_drive_t *drive, struct request *rq, sector_t block) { struct ide_disk_obj *floppy = drive->driver_data; - ide_hwif_t *hwif = drive->hwif; struct ide_cmd cmd; struct ide_atapi_pc *pc; + ide_debug_log(IDE_DBG_FUNC, "enter, cmd: 0x%x\n", rq->cmd[0]); + if (drive->debug_mask & IDE_DBG_RQ) blk_dump_rq_flags(rq, (rq->rq_disk ? rq->rq_disk->disk_name @@ -294,13 +283,10 @@ static ide_startstop_t ide_floppy_do_request(ide_drive_t *drive, cmd.rq = rq; if (blk_fs_request(rq) || pc->req_xfer) { - ide_init_sg_cmd(&cmd, rq->nr_sectors << 9); + ide_init_sg_cmd(&cmd, pc->req_xfer); ide_map_sg(drive, &cmd); } - pc->sg = hwif->sg_table; - pc->sg_cnt = cmd.sg_nents; - pc->rq = rq; return ide_floppy_issue_pc(drive, &cmd, pc); @@ -388,6 +374,8 @@ static int ide_floppy_get_capacity(ide_drive_t *drive) u8 pc_buf[256], header_len, desc_cnt; int i, rc = 1, blocks, length; + ide_debug_log(IDE_DBG_FUNC, "enter"); + drive->bios_cyl = 0; drive->bios_head = drive->bios_sect = 0; floppy->blocks = 0; @@ -488,8 +476,6 @@ static void ide_floppy_setup(ide_drive_t *drive) u16 *id = drive->id; drive->pc_callback = ide_floppy_callback; - drive->pc_update_buffers = idefloppy_update_buffers; - drive->pc_io_buffers = ide_io_buffers; /* * We used to check revisions here. At this point however I'm giving up. diff --git a/include/linux/ide.h b/include/linux/ide.h index 0f48fbd46028..836c4c6cb7e3 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -415,9 +415,6 @@ struct ide_atapi_pc { struct idetape_bh *bh; char *b_data; - struct scatterlist *sg; - unsigned int sg_cnt; - unsigned long timeout; }; @@ -1177,8 +1174,6 @@ void ide_tf_read(ide_drive_t *, struct ide_cmd *); void ide_input_data(ide_drive_t *, struct ide_cmd *, void *, unsigned int); void ide_output_data(ide_drive_t *, struct ide_cmd *, void *, unsigned int); -int ide_io_buffers(ide_drive_t *, struct ide_atapi_pc *, unsigned int, int); - extern void SELECT_DRIVE(ide_drive_t *); void SELECT_MASK(ide_drive_t *, int); From 985232e388714d4a9e94b4d96ee69b6ff8c9dc31 Mon Sep 17 00:00:00 2001 From: Sergei Shtylyov Date: Tue, 31 Mar 2009 20:15:27 +0200 Subject: [PATCH 273/478] au1xxx-ide: auide_{in|out}sw() should be static Make auide_{insw|outsw}() 'static' and mark them 'inline' as there's only one call site for each: in the driver's {in|out}put_data() methods respectively... Signed-off-by: Sergei Shtylyov Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/au1xxx-ide.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/ide/au1xxx-ide.c b/drivers/ide/au1xxx-ide.c index 549bbb2755a8..1bfb43d0d3a8 100644 --- a/drivers/ide/au1xxx-ide.c +++ b/drivers/ide/au1xxx-ide.c @@ -50,7 +50,7 @@ static _auide_hwif auide_hwif; #if defined(CONFIG_BLK_DEV_IDE_AU1XXX_PIO_DBDMA) -void auide_insw(unsigned long port, void *addr, u32 count) +static inline void auide_insw(unsigned long port, void *addr, u32 count) { _auide_hwif *ahwif = &auide_hwif; chan_tab_t *ctp; @@ -68,7 +68,7 @@ void auide_insw(unsigned long port, void *addr, u32 count) ctp->cur_ptr = au1xxx_ddma_get_nextptr_virt(dp); } -void auide_outsw(unsigned long port, void *addr, u32 count) +static inline void auide_outsw(unsigned long port, void *addr, u32 count) { _auide_hwif *ahwif = &auide_hwif; chan_tab_t *ctp; From 8d64fcd9357798ad0d61f8877de13d5e1b1ab510 Mon Sep 17 00:00:00 2001 From: Sergei Shtylyov Date: Tue, 31 Mar 2009 20:15:27 +0200 Subject: [PATCH 274/478] ide: identify data word 53 bit 1 doesn't cover words 62 and 63 (take 3) The IDE code assumed for years that the bit 1 of the identify data word 53 also covers the validity of the SW/MW DMA information in words 62 and 63, but it has always covered only words 64 thru 70, with words 62 and 63 being defined in the original ATA spec, not in ATA-2... This fix however should only concern *very* old hard disks and rather old CF cards... Signed-off-by: Sergei Shtylyov Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/cs5530.c | 3 +-- drivers/ide/ide-dma-sff.c | 7 +++---- drivers/ide/ide-dma.c | 32 ++++++++++++++------------------ drivers/ide/sc1200.c | 3 +-- 4 files changed, 19 insertions(+), 26 deletions(-) diff --git a/drivers/ide/cs5530.c b/drivers/ide/cs5530.c index 8e8b35a89901..40bf05eddf6e 100644 --- a/drivers/ide/cs5530.c +++ b/drivers/ide/cs5530.c @@ -92,8 +92,7 @@ static u8 cs5530_udma_filter(ide_drive_t *drive) if ((mateid[ATA_ID_FIELD_VALID] & 4) && (mateid[ATA_ID_UDMA_MODES] & 7)) goto out; - if ((mateid[ATA_ID_FIELD_VALID] & 2) && - (mateid[ATA_ID_MWDMA_MODES] & 7)) + if (mateid[ATA_ID_MWDMA_MODES] & 7) mask = 0; } out: diff --git a/drivers/ide/ide-dma-sff.c b/drivers/ide/ide-dma-sff.c index f8c7d0cd2ff8..16fc46edc32d 100644 --- a/drivers/ide/ide-dma-sff.c +++ b/drivers/ide/ide-dma-sff.c @@ -38,10 +38,9 @@ int config_drive_for_dma(ide_drive_t *drive) * Enable DMA on any drive that has mode2 DMA * (multi or single) enabled */ - if (id[ATA_ID_FIELD_VALID] & 2) /* regular DMA */ - if ((id[ATA_ID_MWDMA_MODES] & 0x404) == 0x404 || - (id[ATA_ID_SWDMA_MODES] & 0x404) == 0x404) - return 1; + if ((id[ATA_ID_MWDMA_MODES] & 0x404) == 0x404 || + (id[ATA_ID_SWDMA_MODES] & 0x404) == 0x404) + return 1; /* Consult the list of known "good" drives */ if (ide_dma_good_drive(drive)) diff --git a/drivers/ide/ide-dma.c b/drivers/ide/ide-dma.c index a5612eadc306..f9c91752f275 100644 --- a/drivers/ide/ide-dma.c +++ b/drivers/ide/ide-dma.c @@ -245,12 +245,11 @@ static unsigned int ide_get_mode_mask(ide_drive_t *drive, u8 base, u8 req_mode) case XFER_UDMA_0: if ((id[ATA_ID_FIELD_VALID] & 4) == 0) break; - + mask = id[ATA_ID_UDMA_MODES]; if (port_ops && port_ops->udma_filter) - mask = port_ops->udma_filter(drive); + mask &= port_ops->udma_filter(drive); else - mask = hwif->ultra_mask; - mask &= id[ATA_ID_UDMA_MODES]; + mask &= hwif->ultra_mask; /* * avoid false cable warning from eighty_ninty_three() @@ -261,18 +260,15 @@ static unsigned int ide_get_mode_mask(ide_drive_t *drive, u8 base, u8 req_mode) } break; case XFER_MW_DMA_0: - if ((id[ATA_ID_FIELD_VALID] & 2) == 0) - break; + mask = id[ATA_ID_MWDMA_MODES]; if (port_ops && port_ops->mdma_filter) - mask = port_ops->mdma_filter(drive); + mask &= port_ops->mdma_filter(drive); else - mask = hwif->mwdma_mask; - mask &= id[ATA_ID_MWDMA_MODES]; + mask &= hwif->mwdma_mask; break; case XFER_SW_DMA_0: - if (id[ATA_ID_FIELD_VALID] & 2) { - mask = id[ATA_ID_SWDMA_MODES] & hwif->swdma_mask; - } else if (id[ATA_ID_OLD_DMA_MODES] >> 8) { + mask = id[ATA_ID_SWDMA_MODES]; + if (!(mask & ATA_SWDMA2) && (id[ATA_ID_OLD_DMA_MODES] >> 8)) { u8 mode = id[ATA_ID_OLD_DMA_MODES] >> 8; /* @@ -280,8 +276,9 @@ static unsigned int ide_get_mode_mask(ide_drive_t *drive, u8 base, u8 req_mode) * (the maximum allowed mode is XFER_SW_DMA_2) */ if (mode <= 2) - mask = ((2 << mode) - 1) & hwif->swdma_mask; + mask = (2 << mode) - 1; } + mask &= hwif->swdma_mask; break; default: BUG(); @@ -398,11 +395,10 @@ int ide_id_dma_bug(ide_drive_t *drive) if ((id[ATA_ID_UDMA_MODES] >> 8) && (id[ATA_ID_MWDMA_MODES] >> 8)) goto err_out; - } else if (id[ATA_ID_FIELD_VALID] & 2) { - if ((id[ATA_ID_MWDMA_MODES] >> 8) && - (id[ATA_ID_SWDMA_MODES] >> 8)) - goto err_out; - } + } else if ((id[ATA_ID_MWDMA_MODES] >> 8) && + (id[ATA_ID_SWDMA_MODES] >> 8)) + goto err_out; + return 0; err_out: printk(KERN_ERR "%s: bad DMA info in identify block\n", drive->name); diff --git a/drivers/ide/sc1200.c b/drivers/ide/sc1200.c index 13e3988f00f0..d467478d68da 100644 --- a/drivers/ide/sc1200.c +++ b/drivers/ide/sc1200.c @@ -115,8 +115,7 @@ static u8 sc1200_udma_filter(ide_drive_t *drive) if ((mateid[ATA_ID_FIELD_VALID] & 4) && (mateid[ATA_ID_UDMA_MODES] & 7)) goto out; - if ((mateid[ATA_ID_FIELD_VALID] & 2) && - (mateid[ATA_ID_MWDMA_MODES] & 7)) + if (mateid[ATA_ID_MWDMA_MODES] & 7) mask = 0; } out: From c4199930b119eb9c1ffb102ed57eaac4d4424d08 Mon Sep 17 00:00:00 2001 From: Sergei Shtylyov Date: Tue, 31 Mar 2009 20:15:27 +0200 Subject: [PATCH 275/478] ide-iops: only clear DMA words on setting DMA mode The bytes indicating current DMA mode in the identify data words 62, 63, and 88 should only change on setting a DMA mode, so stop clearing them on setting PIO mode in ide_config_drive_speed(). While at it, correct SW/MW DMA mode masks... Signed-off-by: Sergei Shtylyov Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/ide-iops.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/ide/ide-iops.c b/drivers/ide/ide-iops.c index 5403e4a44be4..0e62698146b6 100644 --- a/drivers/ide/ide-iops.c +++ b/drivers/ide/ide-iops.c @@ -386,9 +386,11 @@ int ide_config_drive_speed(ide_drive_t *drive, u8 speed) return error; } - id[ATA_ID_UDMA_MODES] &= ~0xFF00; - id[ATA_ID_MWDMA_MODES] &= ~0x0F00; - id[ATA_ID_SWDMA_MODES] &= ~0x0F00; + if (speed >= XFER_SW_DMA_0) { + id[ATA_ID_UDMA_MODES] &= ~0xFF00; + id[ATA_ID_MWDMA_MODES] &= ~0x0700; + id[ATA_ID_SWDMA_MODES] &= ~0x0700; + } skip: #ifdef CONFIG_BLK_DEV_IDEDMA From 74638c84821c066d02c158bc843c84499ddc9764 Mon Sep 17 00:00:00 2001 From: Sergei Shtylyov Date: Tue, 31 Mar 2009 20:15:28 +0200 Subject: [PATCH 276/478] ide: add support for CFA specified transfer modes (take 3) Add support for the CompactFlash specific PIO modes 5/6 and MWDMA modes 3/4. Since there were no PIO5 capable hard drives produced and one would also need 66 MHz IDE clock to actually get the difference WRT the address setup timings programmed, I decided to simply replace the old non-standard PIO mode 5 timings with the CFA specified ones. Signed-off-by: Sergei Shtylyov Cc: stf_xl@wp.pl Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/ide-dma.c | 8 ++++++++ drivers/ide/ide-iops.c | 12 +++++++++++- drivers/ide/ide-timings.c | 12 ++++++++++-- drivers/ide/ide-xfer-mode.c | 15 +++++++++------ drivers/ide/sl82c105.c | 3 ++- 5 files changed, 40 insertions(+), 10 deletions(-) diff --git a/drivers/ide/ide-dma.c b/drivers/ide/ide-dma.c index f9c91752f275..a0b8cab1d9a6 100644 --- a/drivers/ide/ide-dma.c +++ b/drivers/ide/ide-dma.c @@ -261,6 +261,14 @@ static unsigned int ide_get_mode_mask(ide_drive_t *drive, u8 base, u8 req_mode) break; case XFER_MW_DMA_0: mask = id[ATA_ID_MWDMA_MODES]; + + /* Also look for the CF specific MWDMA modes... */ + if (ata_id_is_cfa(id) && (id[ATA_ID_CFA_MODES] & 0x38)) { + u8 mode = ((id[ATA_ID_CFA_MODES] & 0x38) >> 3) - 1; + + mask |= ((2 << mode) - 1) << 3; + } + if (port_ops && port_ops->mdma_filter) mask &= port_ops->mdma_filter(drive); else diff --git a/drivers/ide/ide-iops.c b/drivers/ide/ide-iops.c index 0e62698146b6..0caca342802d 100644 --- a/drivers/ide/ide-iops.c +++ b/drivers/ide/ide-iops.c @@ -306,6 +306,7 @@ int ide_driveid_update(ide_drive_t *drive) drive->id[ATA_ID_UDMA_MODES] = id[ATA_ID_UDMA_MODES]; drive->id[ATA_ID_MWDMA_MODES] = id[ATA_ID_MWDMA_MODES]; drive->id[ATA_ID_SWDMA_MODES] = id[ATA_ID_SWDMA_MODES]; + drive->id[ATA_ID_CFA_MODES] = id[ATA_ID_CFA_MODES]; /* anything more ? */ kfree(id); @@ -390,7 +391,10 @@ int ide_config_drive_speed(ide_drive_t *drive, u8 speed) id[ATA_ID_UDMA_MODES] &= ~0xFF00; id[ATA_ID_MWDMA_MODES] &= ~0x0700; id[ATA_ID_SWDMA_MODES] &= ~0x0700; - } + if (ata_id_is_cfa(id)) + id[ATA_ID_CFA_MODES] &= ~0x0E00; + } else if (ata_id_is_cfa(id)) + id[ATA_ID_CFA_MODES] &= ~0x01C0; skip: #ifdef CONFIG_BLK_DEV_IDEDMA @@ -403,12 +407,18 @@ int ide_config_drive_speed(ide_drive_t *drive, u8 speed) if (speed >= XFER_UDMA_0) { i = 1 << (speed - XFER_UDMA_0); id[ATA_ID_UDMA_MODES] |= (i << 8 | i); + } else if (ata_id_is_cfa(id) && speed >= XFER_MW_DMA_3) { + i = speed - XFER_MW_DMA_2; + id[ATA_ID_CFA_MODES] |= i << 9; } else if (speed >= XFER_MW_DMA_0) { i = 1 << (speed - XFER_MW_DMA_0); id[ATA_ID_MWDMA_MODES] |= (i << 8 | i); } else if (speed >= XFER_SW_DMA_0) { i = 1 << (speed - XFER_SW_DMA_0); id[ATA_ID_SWDMA_MODES] |= (i << 8 | i); + } else if (ata_id_is_cfa(id) && speed >= XFER_PIO_5) { + i = speed - XFER_PIO_4; + id[ATA_ID_CFA_MODES] |= i << 6; } if (!drive->init_speed) diff --git a/drivers/ide/ide-timings.c b/drivers/ide/ide-timings.c index 81f527af8fae..001a56365be5 100644 --- a/drivers/ide/ide-timings.c +++ b/drivers/ide/ide-timings.c @@ -43,6 +43,8 @@ static struct ide_timing ide_timing[] = { { XFER_UDMA_1, 0, 0, 0, 0, 0, 0, 0, 80 }, { XFER_UDMA_0, 0, 0, 0, 0, 0, 0, 0, 120 }, + { XFER_MW_DMA_4, 25, 0, 0, 0, 55, 20, 80, 0 }, + { XFER_MW_DMA_3, 25, 0, 0, 0, 65, 25, 100, 0 }, { XFER_MW_DMA_2, 25, 0, 0, 0, 70, 25, 120, 0 }, { XFER_MW_DMA_1, 45, 0, 0, 0, 80, 50, 150, 0 }, { XFER_MW_DMA_0, 60, 0, 0, 0, 215, 215, 480, 0 }, @@ -51,7 +53,8 @@ static struct ide_timing ide_timing[] = { { XFER_SW_DMA_1, 90, 0, 0, 0, 240, 240, 480, 0 }, { XFER_SW_DMA_0, 120, 0, 0, 0, 480, 480, 960, 0 }, - { XFER_PIO_5, 20, 50, 30, 100, 50, 30, 100, 0 }, + { XFER_PIO_6, 10, 55, 20, 80, 55, 20, 80, 0 }, + { XFER_PIO_5, 15, 65, 25, 100, 65, 25, 100, 0 }, { XFER_PIO_4, 25, 70, 25, 120, 70, 25, 120, 0 }, { XFER_PIO_3, 30, 80, 70, 180, 80, 70, 180, 0 }, @@ -90,6 +93,10 @@ u16 ide_pio_cycle_time(ide_drive_t *drive, u8 pio) /* conservative "downgrade" for all pre-ATA2 drives */ if (pio < 3 && cycle < t->cycle) cycle = 0; /* use standard timing */ + + /* Use the standard timing for the CF specific modes too */ + if (pio > 4 && ata_id_is_cfa(id)) + cycle = 0; } return cycle ? cycle : t->cycle; @@ -161,7 +168,8 @@ int ide_timing_compute(ide_drive_t *drive, u8 speed, if (speed <= XFER_PIO_2) p.cycle = p.cyc8b = id[ATA_ID_EIDE_PIO]; - else if (speed <= XFER_PIO_5) + else if ((speed <= XFER_PIO_4) || + (speed == XFER_PIO_5 && !ata_id_is_cfa(id))) p.cycle = p.cyc8b = id[ATA_ID_EIDE_PIO_IORDY]; else if (speed >= XFER_MW_DMA_0 && speed <= XFER_MW_DMA_2) p.cycle = id[ATA_ID_EIDE_DMA_MIN]; diff --git a/drivers/ide/ide-xfer-mode.c b/drivers/ide/ide-xfer-mode.c index 6910f6a257e8..af44be9d546c 100644 --- a/drivers/ide/ide-xfer-mode.c +++ b/drivers/ide/ide-xfer-mode.c @@ -9,11 +9,11 @@ static const char *udma_str[] = { "UDMA/16", "UDMA/25", "UDMA/33", "UDMA/44", "UDMA/66", "UDMA/100", "UDMA/133", "UDMA7" }; static const char *mwdma_str[] = - { "MWDMA0", "MWDMA1", "MWDMA2" }; + { "MWDMA0", "MWDMA1", "MWDMA2", "MWDMA3", "MWDMA4" }; static const char *swdma_str[] = { "SWDMA0", "SWDMA1", "SWDMA2" }; static const char *pio_str[] = - { "PIO0", "PIO1", "PIO2", "PIO3", "PIO4", "PIO5" }; + { "PIO0", "PIO1", "PIO2", "PIO3", "PIO4", "PIO5", "PIO6" }; /** * ide_xfer_verbose - return IDE mode names @@ -30,11 +30,11 @@ const char *ide_xfer_verbose(u8 mode) if (mode >= XFER_UDMA_0 && mode <= XFER_UDMA_7) s = udma_str[i]; - else if (mode >= XFER_MW_DMA_0 && mode <= XFER_MW_DMA_2) + else if (mode >= XFER_MW_DMA_0 && mode <= XFER_MW_DMA_4) s = mwdma_str[i]; else if (mode >= XFER_SW_DMA_0 && mode <= XFER_SW_DMA_2) s = swdma_str[i]; - else if (mode >= XFER_PIO_0 && mode <= XFER_PIO_5) + else if (mode >= XFER_PIO_0 && mode <= XFER_PIO_6) s = pio_str[i & 0x7]; else if (mode == XFER_PIO_SLOW) s = "PIO SLOW"; @@ -79,7 +79,10 @@ u8 ide_get_best_pio_mode(ide_drive_t *drive, u8 mode_wanted, u8 max_mode) } if (id[ATA_ID_FIELD_VALID] & 2) { /* ATA2? */ - if (ata_id_has_iordy(id)) { + if (ata_id_is_cfa(id) && (id[ATA_ID_CFA_MODES] & 7)) + pio_mode = 4 + min_t(int, 2, + id[ATA_ID_CFA_MODES] & 7); + else if (ata_id_has_iordy(id)) { if (id[ATA_ID_PIO_MODES] & 7) { overridden = 0; if (id[ATA_ID_PIO_MODES] & 4) @@ -239,7 +242,7 @@ int ide_set_xfer_rate(ide_drive_t *drive, u8 rate) BUG_ON(rate < XFER_PIO_0); - if (rate >= XFER_PIO_0 && rate <= XFER_PIO_5) + if (rate >= XFER_PIO_0 && rate <= XFER_PIO_6) return ide_set_pio_mode(drive, rate); return ide_set_dma_mode(drive, rate); diff --git a/drivers/ide/sl82c105.c b/drivers/ide/sl82c105.c index d6f8977191c8..b0a460625335 100644 --- a/drivers/ide/sl82c105.c +++ b/drivers/ide/sl82c105.c @@ -61,7 +61,8 @@ static unsigned int get_pio_timings(ide_drive_t *drive, u8 pio) if (cmd_off == 0) cmd_off = 1; - if (pio > 2 || ata_id_has_iordy(drive->id)) + if ((pio > 2 || ata_id_has_iordy(drive->id)) && + !(pio > 4 && ata_id_is_cfa(drive->id))) iordy = 0x40; return (cmd_on - 1) << 8 | (cmd_off - 1) | iordy; From 47ab834854d4639fedf2ed2f21b41297f2abe1a7 Mon Sep 17 00:00:00 2001 From: Sergei Shtylyov Date: Tue, 31 Mar 2009 20:15:29 +0200 Subject: [PATCH 277/478] ide-disk: use ATA_ERR Make use of ATA_ERR instead of hard-coded value in idedisk_set_max_address() and idedisk_read_native_max_address(). Signed-off-by: Sergei Shtylyov Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/ide-disk.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/ide/ide-disk.c b/drivers/ide/ide-disk.c index ca934c8a1289..c998cf8e971a 100644 --- a/drivers/ide/ide-disk.c +++ b/drivers/ide/ide-disk.c @@ -227,7 +227,7 @@ static u64 idedisk_read_native_max_address(ide_drive_t *drive, int lba48) ide_no_data_taskfile(drive, &cmd); /* if OK, compute maximum address value */ - if ((tf->status & 0x01) == 0) + if (!(tf->status & ATA_ERR)) addr = ide_get_lba_addr(tf, lba48) + 1; return addr; @@ -267,7 +267,7 @@ static u64 idedisk_set_max_address(ide_drive_t *drive, u64 addr_req, int lba48) ide_no_data_taskfile(drive, &cmd); /* if OK, compute maximum address value */ - if ((tf->status & 0x01) == 0) + if (!(tf->status & ATA_ERR)) addr_set = ide_get_lba_addr(tf, lba48) + 1; return addr_set; From 4d74c3fcf2b90487eacec511bc8c07177711c81c Mon Sep 17 00:00:00 2001 From: Sergei Shtylyov Date: Tue, 31 Mar 2009 20:15:29 +0200 Subject: [PATCH 278/478] ide: use ATA_HOB Make use of ATA_HOB instead of hard-coded value in the tf_read() method. Signed-off-by: Sergei Shtylyov Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/at91_ide.c | 4 ++-- drivers/ide/ide-h8300.c | 4 ++-- drivers/ide/ide-io-std.c | 4 ++-- drivers/ide/ns87415.c | 4 ++-- drivers/ide/scc_pata.c | 4 ++-- drivers/ide/tx4938ide.c | 4 ++-- drivers/ide/tx4939ide.c | 4 ++-- 7 files changed, 14 insertions(+), 14 deletions(-) diff --git a/drivers/ide/at91_ide.c b/drivers/ide/at91_ide.c index 6bb45a2b24c2..8fc6ae958b0b 100644 --- a/drivers/ide/at91_ide.c +++ b/drivers/ide/at91_ide.c @@ -242,7 +242,7 @@ static void at91_ide_tf_read(ide_drive_t *drive, struct ide_cmd *cmd) } /* be sure we're looking at the low order bits */ - ide_mm_outb(ATA_DEVCTL_OBS & ~0x80, io_ports->ctl_addr); + ide_mm_outb(ATA_DEVCTL_OBS, io_ports->ctl_addr); if (cmd->tf_flags & IDE_TFLAG_IN_FEATURE) tf->feature = ide_mm_inb(io_ports->feature_addr); @@ -258,7 +258,7 @@ static void at91_ide_tf_read(ide_drive_t *drive, struct ide_cmd *cmd) tf->device = ide_mm_inb(io_ports->device_addr); if (cmd->tf_flags & IDE_TFLAG_LBA48) { - ide_mm_outb(ATA_DEVCTL_OBS | 0x80, io_ports->ctl_addr); + ide_mm_outb(ATA_HOB | ATA_DEVCTL_OBS, io_ports->ctl_addr); if (cmd->tf_flags & IDE_TFLAG_IN_HOB_FEATURE) tf->hob_feature = ide_mm_inb(io_ports->feature_addr); diff --git a/drivers/ide/ide-h8300.c b/drivers/ide/ide-h8300.c index ff8339ed59ab..7492f28d1290 100644 --- a/drivers/ide/ide-h8300.c +++ b/drivers/ide/ide-h8300.c @@ -98,7 +98,7 @@ static void h8300_tf_read(ide_drive_t *drive, struct ide_cmd *cmd) } /* be sure we're looking at the low order bits */ - outb(ATA_DEVCTL_OBS & ~0x80, io_ports->ctl_addr); + outb(ATA_DEVCTL_OBS, io_ports->ctl_addr); if (cmd->tf_flags & IDE_TFLAG_IN_FEATURE) tf->feature = inb(io_ports->feature_addr); @@ -114,7 +114,7 @@ static void h8300_tf_read(ide_drive_t *drive, struct ide_cmd *cmd) tf->device = inb(io_ports->device_addr); if (cmd->tf_flags & IDE_TFLAG_LBA48) { - outb(ATA_DEVCTL_OBS | 0x80, io_ports->ctl_addr); + outb(ATA_HOB | ATA_DEVCTL_OBS, io_ports->ctl_addr); if (cmd->tf_flags & IDE_TFLAG_IN_HOB_FEATURE) tf->hob_feature = inb(io_ports->feature_addr); diff --git a/drivers/ide/ide-io-std.c b/drivers/ide/ide-io-std.c index 2d9c6dc3f956..3a867e49a0af 100644 --- a/drivers/ide/ide-io-std.c +++ b/drivers/ide/ide-io-std.c @@ -166,7 +166,7 @@ void ide_tf_read(ide_drive_t *drive, struct ide_cmd *cmd) } /* be sure we're looking at the low order bits */ - tf_outb(ATA_DEVCTL_OBS & ~0x80, io_ports->ctl_addr); + tf_outb(ATA_DEVCTL_OBS, io_ports->ctl_addr); if (cmd->tf_flags & IDE_TFLAG_IN_FEATURE) tf->feature = tf_inb(io_ports->feature_addr); @@ -182,7 +182,7 @@ void ide_tf_read(ide_drive_t *drive, struct ide_cmd *cmd) tf->device = tf_inb(io_ports->device_addr); if (cmd->tf_flags & IDE_TFLAG_LBA48) { - tf_outb(ATA_DEVCTL_OBS | 0x80, io_ports->ctl_addr); + tf_outb(ATA_HOB | ATA_DEVCTL_OBS, io_ports->ctl_addr); if (cmd->tf_flags & IDE_TFLAG_IN_HOB_FEATURE) tf->hob_feature = tf_inb(io_ports->feature_addr); diff --git a/drivers/ide/ns87415.c b/drivers/ide/ns87415.c index cbc1d1110385..13a9e00efa13 100644 --- a/drivers/ide/ns87415.c +++ b/drivers/ide/ns87415.c @@ -74,7 +74,7 @@ static void superio_tf_read(ide_drive_t *drive, struct ide_cmd *cmd) } /* be sure we're looking at the low order bits */ - outb(ATA_DEVCTL_OBS & ~0x80, io_ports->ctl_addr); + outb(ATA_DEVCTL_OBS, io_ports->ctl_addr); if (cmd->tf_flags & IDE_TFLAG_IN_FEATURE) tf->feature = inb(io_ports->feature_addr); @@ -90,7 +90,7 @@ static void superio_tf_read(ide_drive_t *drive, struct ide_cmd *cmd) tf->device = superio_ide_inb(io_ports->device_addr); if (cmd->tf_flags & IDE_TFLAG_LBA48) { - outb(ATA_DEVCTL_OBS | 0x80, io_ports->ctl_addr); + outb(ATA_HOB | ATA_DEVCTL_OBS, io_ports->ctl_addr); if (cmd->tf_flags & IDE_TFLAG_IN_HOB_FEATURE) tf->hob_feature = inb(io_ports->feature_addr); diff --git a/drivers/ide/scc_pata.c b/drivers/ide/scc_pata.c index 542419ffa9b7..6e47eac1cd7f 100644 --- a/drivers/ide/scc_pata.c +++ b/drivers/ide/scc_pata.c @@ -709,7 +709,7 @@ static void scc_tf_read(ide_drive_t *drive, struct ide_cmd *cmd) } /* be sure we're looking at the low order bits */ - scc_ide_outb(ATA_DEVCTL_OBS & ~0x80, io_ports->ctl_addr); + scc_ide_outb(ATA_DEVCTL_OBS, io_ports->ctl_addr); if (cmd->tf_flags & IDE_TFLAG_IN_FEATURE) tf->feature = scc_ide_inb(io_ports->feature_addr); @@ -725,7 +725,7 @@ static void scc_tf_read(ide_drive_t *drive, struct ide_cmd *cmd) tf->device = scc_ide_inb(io_ports->device_addr); if (cmd->tf_flags & IDE_TFLAG_LBA48) { - scc_ide_outb(ATA_DEVCTL_OBS | 0x80, io_ports->ctl_addr); + scc_ide_outb(ATA_HOB | ATA_DEVCTL_OBS, io_ports->ctl_addr); if (cmd->tf_flags & IDE_TFLAG_IN_HOB_FEATURE) tf->hob_feature = scc_ide_inb(io_ports->feature_addr); diff --git a/drivers/ide/tx4938ide.c b/drivers/ide/tx4938ide.c index 657a61890b1c..1c4a78ac1a20 100644 --- a/drivers/ide/tx4938ide.c +++ b/drivers/ide/tx4938ide.c @@ -142,7 +142,7 @@ static void tx4938ide_tf_read(ide_drive_t *drive, struct ide_cmd *cmd) } /* be sure we're looking at the low order bits */ - tx4938ide_outb(ATA_DEVCTL_OBS & ~0x80, io_ports->ctl_addr); + tx4938ide_outb(ATA_DEVCTL_OBS, io_ports->ctl_addr); if (cmd->tf_flags & IDE_TFLAG_IN_FEATURE) tf->feature = tx4938ide_inb(io_ports->feature_addr); @@ -158,7 +158,7 @@ static void tx4938ide_tf_read(ide_drive_t *drive, struct ide_cmd *cmd) tf->device = tx4938ide_inb(io_ports->device_addr); if (cmd->tf_flags & IDE_TFLAG_LBA48) { - tx4938ide_outb(ATA_DEVCTL_OBS | 0x80, io_ports->ctl_addr); + tx4938ide_outb(ATA_HOB | ATA_DEVCTL_OBS, io_ports->ctl_addr); if (cmd->tf_flags & IDE_TFLAG_IN_HOB_FEATURE) tf->hob_feature = diff --git a/drivers/ide/tx4939ide.c b/drivers/ide/tx4939ide.c index faf0d97b9ce8..77aee5b2ce95 100644 --- a/drivers/ide/tx4939ide.c +++ b/drivers/ide/tx4939ide.c @@ -509,7 +509,7 @@ static void tx4939ide_tf_read(ide_drive_t *drive, struct ide_cmd *cmd) } /* be sure we're looking at the low order bits */ - tx4939ide_outb(ATA_DEVCTL_OBS & ~0x80, io_ports->ctl_addr); + tx4939ide_outb(ATA_DEVCTL_OBS, io_ports->ctl_addr); if (cmd->tf_flags & IDE_TFLAG_IN_FEATURE) tf->feature = tx4939ide_inb(io_ports->feature_addr); @@ -525,7 +525,7 @@ static void tx4939ide_tf_read(ide_drive_t *drive, struct ide_cmd *cmd) tf->device = tx4939ide_inb(io_ports->device_addr); if (cmd->tf_flags & IDE_TFLAG_LBA48) { - tx4939ide_outb(ATA_DEVCTL_OBS | 0x80, io_ports->ctl_addr); + tx4939ide_outb(ATA_HOB | ATA_DEVCTL_OBS, io_ports->ctl_addr); if (cmd->tf_flags & IDE_TFLAG_IN_HOB_FEATURE) tf->hob_feature = From ecf3a31d2a08a419bdf919456f1724f5b72bde2c Mon Sep 17 00:00:00 2001 From: Sergei Shtylyov Date: Tue, 31 Mar 2009 20:15:30 +0200 Subject: [PATCH 279/478] ide: turn set_irq() method into write_devctl() method Turn set_irq() method with its software reset hack into write_devctl() method (for just writing a value into the device control register) at last... Signed-off-by: Sergei Shtylyov Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/at91_ide.c | 2 +- drivers/ide/au1xxx-ide.c | 3 +-- drivers/ide/falconide.c | 3 +-- drivers/ide/ide-eh.c | 7 +++---- drivers/ide/ide-h8300.c | 3 +-- drivers/ide/ide-io-std.c | 16 +++------------- drivers/ide/ide-io.c | 4 +++- drivers/ide/ide-iops.c | 4 ++-- drivers/ide/ide-pm.c | 2 +- drivers/ide/ide-probe.c | 6 +++--- drivers/ide/ide-taskfile.c | 2 +- drivers/ide/ns87415.c | 3 +-- drivers/ide/pmac.c | 14 ++------------ drivers/ide/q40ide.c | 3 +-- drivers/ide/scc_pata.c | 14 ++------------ drivers/ide/sgiioc4.c | 3 +-- drivers/ide/tx4938ide.c | 3 +-- drivers/ide/tx4939ide.c | 6 ++---- include/linux/ide.h | 6 ++---- 19 files changed, 32 insertions(+), 72 deletions(-) diff --git a/drivers/ide/at91_ide.c b/drivers/ide/at91_ide.c index 8fc6ae958b0b..e6e96743aa7b 100644 --- a/drivers/ide/at91_ide.c +++ b/drivers/ide/at91_ide.c @@ -295,7 +295,7 @@ static const struct ide_tp_ops at91_ide_tp_ops = { .exec_command = ide_exec_command, .read_status = ide_read_status, .read_altstatus = ide_read_altstatus, - .set_irq = ide_set_irq, + .write_devctl = ide_write_devctl, .tf_load = at91_ide_tf_load, .tf_read = at91_ide_tf_read, diff --git a/drivers/ide/au1xxx-ide.c b/drivers/ide/au1xxx-ide.c index 1bfb43d0d3a8..2ca10d533dad 100644 --- a/drivers/ide/au1xxx-ide.c +++ b/drivers/ide/au1xxx-ide.c @@ -467,8 +467,7 @@ static const struct ide_tp_ops au1xxx_tp_ops = { .exec_command = ide_exec_command, .read_status = ide_read_status, .read_altstatus = ide_read_altstatus, - - .set_irq = ide_set_irq, + .write_devctl = ide_write_devctl, .tf_load = ide_tf_load, .tf_read = ide_tf_read, diff --git a/drivers/ide/falconide.c b/drivers/ide/falconide.c index b368a5effc3a..5063be85dc33 100644 --- a/drivers/ide/falconide.c +++ b/drivers/ide/falconide.c @@ -89,8 +89,7 @@ static const struct ide_tp_ops falconide_tp_ops = { .exec_command = ide_exec_command, .read_status = ide_read_status, .read_altstatus = ide_read_altstatus, - - .set_irq = ide_set_irq, + .write_devctl = ide_write_devctl, .tf_load = ide_tf_load, .tf_read = ide_tf_read, diff --git a/drivers/ide/ide-eh.c b/drivers/ide/ide-eh.c index 11664976eea3..de4b7f1c9c9f 100644 --- a/drivers/ide/ide-eh.c +++ b/drivers/ide/ide-eh.c @@ -401,15 +401,14 @@ static ide_startstop_t do_reset1(ide_drive_t *drive, int do_not_try_atapi) * immediate interrupt due to the edge transition it produces. * This single interrupt gives us a "fast poll" for drives that * recover from reset very quickly, saving us the first 50ms wait time. - * - * TODO: add ->softreset method and stop abusing ->set_irq */ /* set SRST and nIEN */ - tp_ops->set_irq(hwif, 4); + tp_ops->write_devctl(hwif, ATA_SRST | ATA_NIEN | ATA_DEVCTL_OBS); /* more than enough time */ udelay(10); /* clear SRST, leave nIEN (unless device is on the quirk list) */ - tp_ops->set_irq(hwif, drive->quirk_list == 2); + tp_ops->write_devctl(hwif, (drive->quirk_list == 2 ? 0 : ATA_NIEN) | + ATA_DEVCTL_OBS); /* more than enough time */ udelay(10); hwif->poll_timeout = jiffies + WAIT_WORSTCASE; diff --git a/drivers/ide/ide-h8300.c b/drivers/ide/ide-h8300.c index 7492f28d1290..a57ccad61acf 100644 --- a/drivers/ide/ide-h8300.c +++ b/drivers/ide/ide-h8300.c @@ -159,8 +159,7 @@ static const struct ide_tp_ops h8300_tp_ops = { .exec_command = ide_exec_command, .read_status = ide_read_status, .read_altstatus = ide_read_altstatus, - - .set_irq = ide_set_irq, + .write_devctl = ide_write_devctl, .tf_load = h8300_tf_load, .tf_read = h8300_tf_read, diff --git a/drivers/ide/ide-io-std.c b/drivers/ide/ide-io-std.c index 3a867e49a0af..bbeedce6b17d 100644 --- a/drivers/ide/ide-io-std.c +++ b/drivers/ide/ide-io-std.c @@ -64,23 +64,14 @@ u8 ide_read_altstatus(ide_hwif_t *hwif) } EXPORT_SYMBOL_GPL(ide_read_altstatus); -void ide_set_irq(ide_hwif_t *hwif, int on) +void ide_write_devctl(ide_hwif_t *hwif, u8 ctl) { - u8 ctl = ATA_DEVCTL_OBS; - - if (on == 4) { /* hack for SRST */ - ctl |= 4; - on &= ~4; - } - - ctl |= on ? 0 : 2; - if (hwif->host_flags & IDE_HFLAG_MMIO) writeb(ctl, (void __iomem *)hwif->io_ports.ctl_addr); else outb(ctl, hwif->io_ports.ctl_addr); } -EXPORT_SYMBOL_GPL(ide_set_irq); +EXPORT_SYMBOL_GPL(ide_write_devctl); void ide_tf_load(ide_drive_t *drive, struct ide_cmd *cmd) { @@ -312,8 +303,7 @@ const struct ide_tp_ops default_tp_ops = { .exec_command = ide_exec_command, .read_status = ide_read_status, .read_altstatus = ide_read_altstatus, - - .set_irq = ide_set_irq, + .write_devctl = ide_write_devctl, .tf_load = ide_tf_load, .tf_read = ide_tf_read, diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c index 3c52317d8524..5b57905a7d71 100644 --- a/drivers/ide/ide-io.c +++ b/drivers/ide/ide-io.c @@ -494,7 +494,9 @@ repeat: * quirk_list may not like intr setups/cleanups */ if (prev_port && prev_port->cur_dev->quirk_list == 0) - prev_port->tp_ops->set_irq(prev_port, 0); + prev_port->tp_ops->write_devctl(prev_port, + ATA_NIEN | + ATA_DEVCTL_OBS); hwif->host->cur_port = hwif; } diff --git a/drivers/ide/ide-iops.c b/drivers/ide/ide-iops.c index 0caca342802d..ae227dd8466f 100644 --- a/drivers/ide/ide-iops.c +++ b/drivers/ide/ide-iops.c @@ -360,7 +360,7 @@ int ide_config_drive_speed(ide_drive_t *drive, u8 speed) SELECT_DRIVE(drive); SELECT_MASK(drive, 1); udelay(1); - tp_ops->set_irq(hwif, 0); + tp_ops->write_devctl(hwif, ATA_NIEN | ATA_DEVCTL_OBS); memset(&cmd, 0, sizeof(cmd)); cmd.tf_flags = IDE_TFLAG_OUT_FEATURE | IDE_TFLAG_OUT_NSECT; @@ -372,7 +372,7 @@ int ide_config_drive_speed(ide_drive_t *drive, u8 speed) tp_ops->exec_command(hwif, ATA_CMD_SET_FEATURES); if (drive->quirk_list == 2) - tp_ops->set_irq(hwif, 1); + tp_ops->write_devctl(hwif, ATA_DEVCTL_OBS); error = __ide_wait_stat(drive, drive->ready_stat, ATA_BUSY | ATA_DRQ | ATA_ERR, diff --git a/drivers/ide/ide-pm.c b/drivers/ide/ide-pm.c index ebf2d21ebdcb..20553d4c42a2 100644 --- a/drivers/ide/ide-pm.c +++ b/drivers/ide/ide-pm.c @@ -233,7 +233,7 @@ void ide_check_pm_state(ide_drive_t *drive, struct request *rq) if (rc) printk(KERN_WARNING "%s: bus not ready on wakeup\n", drive->name); SELECT_DRIVE(drive); - hwif->tp_ops->set_irq(hwif, 1); + hwif->tp_ops->write_devctl(hwif, ATA_DEVCTL_OBS); rc = ide_wait_not_busy(hwif, 100000); if (rc) printk(KERN_WARNING "%s: drive not ready on wakeup\n", drive->name); diff --git a/drivers/ide/ide-probe.c b/drivers/ide/ide-probe.c index 7c1f1bf81836..d240f76b0da6 100644 --- a/drivers/ide/ide-probe.c +++ b/drivers/ide/ide-probe.c @@ -260,7 +260,7 @@ int ide_dev_read_id(ide_drive_t *drive, u8 cmd, u16 *id) * during the identify phase that the IRQ handler isn't expecting. */ if (io_ports->ctl_addr) - tp_ops->set_irq(hwif, 0); + tp_ops->write_devctl(hwif, ATA_NIEN | ATA_DEVCTL_OBS); /* take a deep breath */ msleep(50); @@ -628,7 +628,7 @@ static int ide_port_wait_ready(ide_hwif_t *hwif) if ((drive->dev_flags & IDE_DFLAG_NOPROBE) == 0 || (drive->dev_flags & IDE_DFLAG_PRESENT)) { SELECT_DRIVE(drive); - hwif->tp_ops->set_irq(hwif, 1); + hwif->tp_ops->write_devctl(hwif, ATA_DEVCTL_OBS); mdelay(2); rc = ide_wait_not_busy(hwif, 35000); if (rc) @@ -845,7 +845,7 @@ static int init_irq (ide_hwif_t *hwif) irq_handler = ide_intr; if (io_ports->ctl_addr) - hwif->tp_ops->set_irq(hwif, 1); + hwif->tp_ops->write_devctl(hwif, ATA_DEVCTL_OBS); if (request_irq(hwif->irq, irq_handler, sa, hwif->name, hwif)) goto out_up; diff --git a/drivers/ide/ide-taskfile.c b/drivers/ide/ide-taskfile.c index dba68db629bf..47f13cd11031 100644 --- a/drivers/ide/ide-taskfile.c +++ b/drivers/ide/ide-taskfile.c @@ -80,7 +80,7 @@ ide_startstop_t do_rw_taskfile(ide_drive_t *drive, struct ide_cmd *orig_cmd) if ((cmd->tf_flags & IDE_TFLAG_DMA_PIO_FALLBACK) == 0) { ide_tf_dump(drive->name, tf); - tp_ops->set_irq(hwif, 1); + tp_ops->write_devctl(hwif, ATA_DEVCTL_OBS); SELECT_MASK(drive, 0); tp_ops->tf_load(drive, cmd); } diff --git a/drivers/ide/ns87415.c b/drivers/ide/ns87415.c index 13a9e00efa13..00ab0be7335a 100644 --- a/drivers/ide/ns87415.c +++ b/drivers/ide/ns87415.c @@ -109,8 +109,7 @@ static const struct ide_tp_ops superio_tp_ops = { .exec_command = ide_exec_command, .read_status = superio_read_status, .read_altstatus = ide_read_altstatus, - - .set_irq = ide_set_irq, + .write_devctl = ide_write_devctl, .tf_load = ide_tf_load, .tf_read = superio_tf_read, diff --git a/drivers/ide/pmac.c b/drivers/ide/pmac.c index 879c3d8d9f36..7aa45ea37eeb 100644 --- a/drivers/ide/pmac.c +++ b/drivers/ide/pmac.c @@ -476,17 +476,8 @@ static void pmac_exec_command(ide_hwif_t *hwif, u8 cmd) + IDE_TIMING_CONFIG)); } -static void pmac_set_irq(ide_hwif_t *hwif, int on) +static void pmac_write_devctl(ide_hwif_t *hwif, u8 ctl) { - u8 ctl = ATA_DEVCTL_OBS; - - if (on == 4) { /* hack for SRST */ - ctl |= 4; - on &= ~4; - } - - ctl |= on ? 0 : 2; - writeb(ctl, (void __iomem *)hwif->io_ports.ctl_addr); (void)readl((void __iomem *)(hwif->io_ports.data_addr + IDE_TIMING_CONFIG)); @@ -954,8 +945,7 @@ static const struct ide_tp_ops pmac_tp_ops = { .exec_command = pmac_exec_command, .read_status = ide_read_status, .read_altstatus = ide_read_altstatus, - - .set_irq = pmac_set_irq, + .write_devctl = pmac_write_devctl, .tf_load = ide_tf_load, .tf_read = ide_tf_read, diff --git a/drivers/ide/q40ide.c b/drivers/ide/q40ide.c index 2a43a2f49633..7fddfd34fcce 100644 --- a/drivers/ide/q40ide.c +++ b/drivers/ide/q40ide.c @@ -99,8 +99,7 @@ static const struct ide_tp_ops q40ide_tp_ops = { .exec_command = ide_exec_command, .read_status = ide_read_status, .read_altstatus = ide_read_altstatus, - - .set_irq = ide_set_irq, + .write_devctl = ide_write_devctl, .tf_load = ide_tf_load, .tf_read = ide_tf_read, diff --git a/drivers/ide/scc_pata.c b/drivers/ide/scc_pata.c index 6e47eac1cd7f..6ba4983d831c 100644 --- a/drivers/ide/scc_pata.c +++ b/drivers/ide/scc_pata.c @@ -148,17 +148,8 @@ static u8 scc_dma_sff_read_status(ide_hwif_t *hwif) return (u8)in_be32((void *)(hwif->dma_base + 4)); } -static void scc_set_irq(ide_hwif_t *hwif, int on) +static void scc_write_devctl(ide_hwif_t *hwif, u8 ctl) { - u8 ctl = ATA_DEVCTL_OBS; - - if (on == 4) { /* hack for SRST */ - ctl |= 4; - on &= ~4; - } - - ctl |= on ? 0 : 2; - out_be32((void *)hwif->io_ports.ctl_addr, ctl); eieio(); in_be32((void *)(hwif->dma_base + 0x01c)); @@ -843,8 +834,7 @@ static const struct ide_tp_ops scc_tp_ops = { .exec_command = scc_exec_command, .read_status = scc_read_status, .read_altstatus = scc_read_altstatus, - - .set_irq = scc_set_irq, + .write_devctl = scc_write_devctl, .tf_load = scc_tf_load, .tf_read = scc_tf_read, diff --git a/drivers/ide/sgiioc4.c b/drivers/ide/sgiioc4.c index 6ef5a567d377..58980fcafc3b 100644 --- a/drivers/ide/sgiioc4.c +++ b/drivers/ide/sgiioc4.c @@ -503,8 +503,7 @@ static const struct ide_tp_ops sgiioc4_tp_ops = { .exec_command = ide_exec_command, .read_status = sgiioc4_read_status, .read_altstatus = ide_read_altstatus, - - .set_irq = ide_set_irq, + .write_devctl = ide_write_devctl, .tf_load = ide_tf_load, .tf_read = ide_tf_read, diff --git a/drivers/ide/tx4938ide.c b/drivers/ide/tx4938ide.c index 1c4a78ac1a20..ec3aa32fbbe0 100644 --- a/drivers/ide/tx4938ide.c +++ b/drivers/ide/tx4938ide.c @@ -204,8 +204,7 @@ static const struct ide_tp_ops tx4938ide_tp_ops = { .exec_command = ide_exec_command, .read_status = ide_read_status, .read_altstatus = ide_read_altstatus, - - .set_irq = ide_set_irq, + .write_devctl = ide_write_devctl, .tf_load = tx4938ide_tf_load, .tf_read = tx4938ide_tf_read, diff --git a/drivers/ide/tx4939ide.c b/drivers/ide/tx4939ide.c index 77aee5b2ce95..43bc0372413a 100644 --- a/drivers/ide/tx4939ide.c +++ b/drivers/ide/tx4939ide.c @@ -571,8 +571,7 @@ static const struct ide_tp_ops tx4939ide_tp_ops = { .exec_command = ide_exec_command, .read_status = ide_read_status, .read_altstatus = ide_read_altstatus, - - .set_irq = ide_set_irq, + .write_devctl = ide_write_devctl, .tf_load = tx4939ide_tf_load, .tf_read = tx4939ide_tf_read, @@ -595,8 +594,7 @@ static const struct ide_tp_ops tx4939ide_tp_ops = { .exec_command = ide_exec_command, .read_status = ide_read_status, .read_altstatus = ide_read_altstatus, - - .set_irq = ide_set_irq, + .write_devctl = ide_write_devctl, .tf_load = tx4939ide_tf_load, .tf_read = ide_tf_read, diff --git a/include/linux/ide.h b/include/linux/ide.h index 836c4c6cb7e3..ccb70abe991b 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -655,8 +655,7 @@ struct ide_tp_ops { void (*exec_command)(struct hwif_s *, u8); u8 (*read_status)(struct hwif_s *); u8 (*read_altstatus)(struct hwif_s *); - - void (*set_irq)(struct hwif_s *, int); + void (*write_devctl)(struct hwif_s *, u8); void (*tf_load)(ide_drive_t *, struct ide_cmd *); void (*tf_read)(ide_drive_t *, struct ide_cmd *); @@ -1165,8 +1164,7 @@ void ide_tf_dump(const char *, struct ide_taskfile *); void ide_exec_command(ide_hwif_t *, u8); u8 ide_read_status(ide_hwif_t *); u8 ide_read_altstatus(ide_hwif_t *); - -void ide_set_irq(ide_hwif_t *, int); +void ide_write_devctl(ide_hwif_t *, u8); void ide_tf_load(ide_drive_t *, struct ide_cmd *); void ide_tf_read(ide_drive_t *, struct ide_cmd *); From 6762511934e6e7287ce3c8baac0d52ef64e3787b Mon Sep 17 00:00:00 2001 From: Sergei Shtylyov Date: Tue, 31 Mar 2009 20:15:30 +0200 Subject: [PATCH 280/478] ide: rename IDE_TFLAG_IN_[HOB_]FEATURE The feature register has never been readable -- when its location is read, one gets the error register value; hence rename IDE_TFLAG_IN_[HOB_]FEATURE into IDE_TFLAG_IN_[HOB_]ERROR and introduce the 'hob_error' field into the 'struct ide_taskfile' (despite the error register not really depending on the HOB bit). Signed-off-by: Sergei Shtylyov Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/at91_ide.c | 16 ++++++++-------- drivers/ide/ide-h8300.c | 16 ++++++++-------- drivers/ide/ide-io-std.c | 16 ++++++++-------- drivers/ide/ide-iops.c | 2 +- drivers/ide/ns87415.c | 16 ++++++++-------- drivers/ide/scc_pata.c | 16 ++++++++-------- drivers/ide/tx4938ide.c | 17 ++++++++--------- drivers/ide/tx4939ide.c | 17 ++++++++--------- include/linux/ide.h | 12 ++++++++---- 9 files changed, 65 insertions(+), 63 deletions(-) diff --git a/drivers/ide/at91_ide.c b/drivers/ide/at91_ide.c index e6e96743aa7b..9dce793d93b4 100644 --- a/drivers/ide/at91_ide.c +++ b/drivers/ide/at91_ide.c @@ -244,8 +244,8 @@ static void at91_ide_tf_read(ide_drive_t *drive, struct ide_cmd *cmd) /* be sure we're looking at the low order bits */ ide_mm_outb(ATA_DEVCTL_OBS, io_ports->ctl_addr); - if (cmd->tf_flags & IDE_TFLAG_IN_FEATURE) - tf->feature = ide_mm_inb(io_ports->feature_addr); + if (cmd->tf_flags & IDE_TFLAG_IN_ERROR) + tf->error = ide_mm_inb(io_ports->feature_addr); if (cmd->tf_flags & IDE_TFLAG_IN_NSECT) tf->nsect = ide_mm_inb(io_ports->nsect_addr); if (cmd->tf_flags & IDE_TFLAG_IN_LBAL) @@ -260,16 +260,16 @@ static void at91_ide_tf_read(ide_drive_t *drive, struct ide_cmd *cmd) if (cmd->tf_flags & IDE_TFLAG_LBA48) { ide_mm_outb(ATA_HOB | ATA_DEVCTL_OBS, io_ports->ctl_addr); - if (cmd->tf_flags & IDE_TFLAG_IN_HOB_FEATURE) - tf->hob_feature = ide_mm_inb(io_ports->feature_addr); + if (cmd->tf_flags & IDE_TFLAG_IN_HOB_ERROR) + tf->hob_error = ide_mm_inb(io_ports->feature_addr); if (cmd->tf_flags & IDE_TFLAG_IN_HOB_NSECT) - tf->hob_nsect = ide_mm_inb(io_ports->nsect_addr); + tf->hob_nsect = ide_mm_inb(io_ports->nsect_addr); if (cmd->tf_flags & IDE_TFLAG_IN_HOB_LBAL) - tf->hob_lbal = ide_mm_inb(io_ports->lbal_addr); + tf->hob_lbal = ide_mm_inb(io_ports->lbal_addr); if (cmd->tf_flags & IDE_TFLAG_IN_HOB_LBAM) - tf->hob_lbam = ide_mm_inb(io_ports->lbam_addr); + tf->hob_lbam = ide_mm_inb(io_ports->lbam_addr); if (cmd->tf_flags & IDE_TFLAG_IN_HOB_LBAH) - tf->hob_lbah = ide_mm_inb(io_ports->lbah_addr); + tf->hob_lbah = ide_mm_inb(io_ports->lbah_addr); } } diff --git a/drivers/ide/ide-h8300.c b/drivers/ide/ide-h8300.c index a57ccad61acf..1d45cd5b6a1c 100644 --- a/drivers/ide/ide-h8300.c +++ b/drivers/ide/ide-h8300.c @@ -100,8 +100,8 @@ static void h8300_tf_read(ide_drive_t *drive, struct ide_cmd *cmd) /* be sure we're looking at the low order bits */ outb(ATA_DEVCTL_OBS, io_ports->ctl_addr); - if (cmd->tf_flags & IDE_TFLAG_IN_FEATURE) - tf->feature = inb(io_ports->feature_addr); + if (cmd->tf_flags & IDE_TFLAG_IN_ERROR) + tf->error = inb(io_ports->feature_addr); if (cmd->tf_flags & IDE_TFLAG_IN_NSECT) tf->nsect = inb(io_ports->nsect_addr); if (cmd->tf_flags & IDE_TFLAG_IN_LBAL) @@ -116,16 +116,16 @@ static void h8300_tf_read(ide_drive_t *drive, struct ide_cmd *cmd) if (cmd->tf_flags & IDE_TFLAG_LBA48) { outb(ATA_HOB | ATA_DEVCTL_OBS, io_ports->ctl_addr); - if (cmd->tf_flags & IDE_TFLAG_IN_HOB_FEATURE) - tf->hob_feature = inb(io_ports->feature_addr); + if (cmd->tf_flags & IDE_TFLAG_IN_HOB_ERROR) + tf->hob_error = inb(io_ports->feature_addr); if (cmd->tf_flags & IDE_TFLAG_IN_HOB_NSECT) - tf->hob_nsect = inb(io_ports->nsect_addr); + tf->hob_nsect = inb(io_ports->nsect_addr); if (cmd->tf_flags & IDE_TFLAG_IN_HOB_LBAL) - tf->hob_lbal = inb(io_ports->lbal_addr); + tf->hob_lbal = inb(io_ports->lbal_addr); if (cmd->tf_flags & IDE_TFLAG_IN_HOB_LBAM) - tf->hob_lbam = inb(io_ports->lbam_addr); + tf->hob_lbam = inb(io_ports->lbam_addr); if (cmd->tf_flags & IDE_TFLAG_IN_HOB_LBAH) - tf->hob_lbah = inb(io_ports->lbah_addr); + tf->hob_lbah = inb(io_ports->lbah_addr); } } diff --git a/drivers/ide/ide-io-std.c b/drivers/ide/ide-io-std.c index bbeedce6b17d..31f5c5f4c093 100644 --- a/drivers/ide/ide-io-std.c +++ b/drivers/ide/ide-io-std.c @@ -159,8 +159,8 @@ void ide_tf_read(ide_drive_t *drive, struct ide_cmd *cmd) /* be sure we're looking at the low order bits */ tf_outb(ATA_DEVCTL_OBS, io_ports->ctl_addr); - if (cmd->tf_flags & IDE_TFLAG_IN_FEATURE) - tf->feature = tf_inb(io_ports->feature_addr); + if (cmd->tf_flags & IDE_TFLAG_IN_ERROR) + tf->error = tf_inb(io_ports->feature_addr); if (cmd->tf_flags & IDE_TFLAG_IN_NSECT) tf->nsect = tf_inb(io_ports->nsect_addr); if (cmd->tf_flags & IDE_TFLAG_IN_LBAL) @@ -175,16 +175,16 @@ void ide_tf_read(ide_drive_t *drive, struct ide_cmd *cmd) if (cmd->tf_flags & IDE_TFLAG_LBA48) { tf_outb(ATA_HOB | ATA_DEVCTL_OBS, io_ports->ctl_addr); - if (cmd->tf_flags & IDE_TFLAG_IN_HOB_FEATURE) - tf->hob_feature = tf_inb(io_ports->feature_addr); + if (cmd->tf_flags & IDE_TFLAG_IN_HOB_ERROR) + tf->hob_error = tf_inb(io_ports->feature_addr); if (cmd->tf_flags & IDE_TFLAG_IN_HOB_NSECT) - tf->hob_nsect = tf_inb(io_ports->nsect_addr); + tf->hob_nsect = tf_inb(io_ports->nsect_addr); if (cmd->tf_flags & IDE_TFLAG_IN_HOB_LBAL) - tf->hob_lbal = tf_inb(io_ports->lbal_addr); + tf->hob_lbal = tf_inb(io_ports->lbal_addr); if (cmd->tf_flags & IDE_TFLAG_IN_HOB_LBAM) - tf->hob_lbam = tf_inb(io_ports->lbam_addr); + tf->hob_lbam = tf_inb(io_ports->lbam_addr); if (cmd->tf_flags & IDE_TFLAG_IN_HOB_LBAH) - tf->hob_lbah = tf_inb(io_ports->lbah_addr); + tf->hob_lbah = tf_inb(io_ports->lbah_addr); } } EXPORT_SYMBOL_GPL(ide_tf_read); diff --git a/drivers/ide/ide-iops.c b/drivers/ide/ide-iops.c index ae227dd8466f..6f363a26700d 100644 --- a/drivers/ide/ide-iops.c +++ b/drivers/ide/ide-iops.c @@ -55,7 +55,7 @@ u8 ide_read_error(ide_drive_t *drive) struct ide_cmd cmd; memset(&cmd, 0, sizeof(cmd)); - cmd.tf_flags = IDE_TFLAG_IN_FEATURE; + cmd.tf_flags = IDE_TFLAG_IN_ERROR; drive->hwif->tp_ops->tf_read(drive, &cmd); diff --git a/drivers/ide/ns87415.c b/drivers/ide/ns87415.c index 00ab0be7335a..0a6cf74c3265 100644 --- a/drivers/ide/ns87415.c +++ b/drivers/ide/ns87415.c @@ -76,8 +76,8 @@ static void superio_tf_read(ide_drive_t *drive, struct ide_cmd *cmd) /* be sure we're looking at the low order bits */ outb(ATA_DEVCTL_OBS, io_ports->ctl_addr); - if (cmd->tf_flags & IDE_TFLAG_IN_FEATURE) - tf->feature = inb(io_ports->feature_addr); + if (cmd->tf_flags & IDE_TFLAG_IN_ERROR) + tf->error = inb(io_ports->feature_addr); if (cmd->tf_flags & IDE_TFLAG_IN_NSECT) tf->nsect = inb(io_ports->nsect_addr); if (cmd->tf_flags & IDE_TFLAG_IN_LBAL) @@ -92,16 +92,16 @@ static void superio_tf_read(ide_drive_t *drive, struct ide_cmd *cmd) if (cmd->tf_flags & IDE_TFLAG_LBA48) { outb(ATA_HOB | ATA_DEVCTL_OBS, io_ports->ctl_addr); - if (cmd->tf_flags & IDE_TFLAG_IN_HOB_FEATURE) - tf->hob_feature = inb(io_ports->feature_addr); + if (cmd->tf_flags & IDE_TFLAG_IN_HOB_ERROR) + tf->hob_error = inb(io_ports->feature_addr); if (cmd->tf_flags & IDE_TFLAG_IN_HOB_NSECT) - tf->hob_nsect = inb(io_ports->nsect_addr); + tf->hob_nsect = inb(io_ports->nsect_addr); if (cmd->tf_flags & IDE_TFLAG_IN_HOB_LBAL) - tf->hob_lbal = inb(io_ports->lbal_addr); + tf->hob_lbal = inb(io_ports->lbal_addr); if (cmd->tf_flags & IDE_TFLAG_IN_HOB_LBAM) - tf->hob_lbam = inb(io_ports->lbam_addr); + tf->hob_lbam = inb(io_ports->lbam_addr); if (cmd->tf_flags & IDE_TFLAG_IN_HOB_LBAH) - tf->hob_lbah = inb(io_ports->lbah_addr); + tf->hob_lbah = inb(io_ports->lbah_addr); } } diff --git a/drivers/ide/scc_pata.c b/drivers/ide/scc_pata.c index 6ba4983d831c..ea0a9752c6f9 100644 --- a/drivers/ide/scc_pata.c +++ b/drivers/ide/scc_pata.c @@ -702,8 +702,8 @@ static void scc_tf_read(ide_drive_t *drive, struct ide_cmd *cmd) /* be sure we're looking at the low order bits */ scc_ide_outb(ATA_DEVCTL_OBS, io_ports->ctl_addr); - if (cmd->tf_flags & IDE_TFLAG_IN_FEATURE) - tf->feature = scc_ide_inb(io_ports->feature_addr); + if (cmd->tf_flags & IDE_TFLAG_IN_ERROR) + tf->error = scc_ide_inb(io_ports->feature_addr); if (cmd->tf_flags & IDE_TFLAG_IN_NSECT) tf->nsect = scc_ide_inb(io_ports->nsect_addr); if (cmd->tf_flags & IDE_TFLAG_IN_LBAL) @@ -718,16 +718,16 @@ static void scc_tf_read(ide_drive_t *drive, struct ide_cmd *cmd) if (cmd->tf_flags & IDE_TFLAG_LBA48) { scc_ide_outb(ATA_HOB | ATA_DEVCTL_OBS, io_ports->ctl_addr); - if (cmd->tf_flags & IDE_TFLAG_IN_HOB_FEATURE) - tf->hob_feature = scc_ide_inb(io_ports->feature_addr); + if (cmd->tf_flags & IDE_TFLAG_IN_HOB_ERROR) + tf->hob_error = scc_ide_inb(io_ports->feature_addr); if (cmd->tf_flags & IDE_TFLAG_IN_HOB_NSECT) - tf->hob_nsect = scc_ide_inb(io_ports->nsect_addr); + tf->hob_nsect = scc_ide_inb(io_ports->nsect_addr); if (cmd->tf_flags & IDE_TFLAG_IN_HOB_LBAL) - tf->hob_lbal = scc_ide_inb(io_ports->lbal_addr); + tf->hob_lbal = scc_ide_inb(io_ports->lbal_addr); if (cmd->tf_flags & IDE_TFLAG_IN_HOB_LBAM) - tf->hob_lbam = scc_ide_inb(io_ports->lbam_addr); + tf->hob_lbam = scc_ide_inb(io_ports->lbam_addr); if (cmd->tf_flags & IDE_TFLAG_IN_HOB_LBAH) - tf->hob_lbah = scc_ide_inb(io_ports->lbah_addr); + tf->hob_lbah = scc_ide_inb(io_ports->lbah_addr); } } diff --git a/drivers/ide/tx4938ide.c b/drivers/ide/tx4938ide.c index ec3aa32fbbe0..606c37f5267d 100644 --- a/drivers/ide/tx4938ide.c +++ b/drivers/ide/tx4938ide.c @@ -144,8 +144,8 @@ static void tx4938ide_tf_read(ide_drive_t *drive, struct ide_cmd *cmd) /* be sure we're looking at the low order bits */ tx4938ide_outb(ATA_DEVCTL_OBS, io_ports->ctl_addr); - if (cmd->tf_flags & IDE_TFLAG_IN_FEATURE) - tf->feature = tx4938ide_inb(io_ports->feature_addr); + if (cmd->tf_flags & IDE_TFLAG_IN_ERROR) + tf->error = tx4938ide_inb(io_ports->feature_addr); if (cmd->tf_flags & IDE_TFLAG_IN_NSECT) tf->nsect = tx4938ide_inb(io_ports->nsect_addr); if (cmd->tf_flags & IDE_TFLAG_IN_LBAL) @@ -160,17 +160,16 @@ static void tx4938ide_tf_read(ide_drive_t *drive, struct ide_cmd *cmd) if (cmd->tf_flags & IDE_TFLAG_LBA48) { tx4938ide_outb(ATA_HOB | ATA_DEVCTL_OBS, io_ports->ctl_addr); - if (cmd->tf_flags & IDE_TFLAG_IN_HOB_FEATURE) - tf->hob_feature = - tx4938ide_inb(io_ports->feature_addr); + if (cmd->tf_flags & IDE_TFLAG_IN_HOB_ERROR) + tf->hob_error = tx4938ide_inb(io_ports->feature_addr); if (cmd->tf_flags & IDE_TFLAG_IN_HOB_NSECT) - tf->hob_nsect = tx4938ide_inb(io_ports->nsect_addr); + tf->hob_nsect = tx4938ide_inb(io_ports->nsect_addr); if (cmd->tf_flags & IDE_TFLAG_IN_HOB_LBAL) - tf->hob_lbal = tx4938ide_inb(io_ports->lbal_addr); + tf->hob_lbal = tx4938ide_inb(io_ports->lbal_addr); if (cmd->tf_flags & IDE_TFLAG_IN_HOB_LBAM) - tf->hob_lbam = tx4938ide_inb(io_ports->lbam_addr); + tf->hob_lbam = tx4938ide_inb(io_ports->lbam_addr); if (cmd->tf_flags & IDE_TFLAG_IN_HOB_LBAH) - tf->hob_lbah = tx4938ide_inb(io_ports->lbah_addr); + tf->hob_lbah = tx4938ide_inb(io_ports->lbah_addr); } } diff --git a/drivers/ide/tx4939ide.c b/drivers/ide/tx4939ide.c index 43bc0372413a..f1e9da71110c 100644 --- a/drivers/ide/tx4939ide.c +++ b/drivers/ide/tx4939ide.c @@ -511,8 +511,8 @@ static void tx4939ide_tf_read(ide_drive_t *drive, struct ide_cmd *cmd) /* be sure we're looking at the low order bits */ tx4939ide_outb(ATA_DEVCTL_OBS, io_ports->ctl_addr); - if (cmd->tf_flags & IDE_TFLAG_IN_FEATURE) - tf->feature = tx4939ide_inb(io_ports->feature_addr); + if (cmd->tf_flags & IDE_TFLAG_IN_ERROR) + tf->error = tx4939ide_inb(io_ports->feature_addr); if (cmd->tf_flags & IDE_TFLAG_IN_NSECT) tf->nsect = tx4939ide_inb(io_ports->nsect_addr); if (cmd->tf_flags & IDE_TFLAG_IN_LBAL) @@ -527,17 +527,16 @@ static void tx4939ide_tf_read(ide_drive_t *drive, struct ide_cmd *cmd) if (cmd->tf_flags & IDE_TFLAG_LBA48) { tx4939ide_outb(ATA_HOB | ATA_DEVCTL_OBS, io_ports->ctl_addr); - if (cmd->tf_flags & IDE_TFLAG_IN_HOB_FEATURE) - tf->hob_feature = - tx4939ide_inb(io_ports->feature_addr); + if (cmd->tf_flags & IDE_TFLAG_IN_HOB_ERROR) + tf->hob_error = tx4939ide_inb(io_ports->feature_addr); if (cmd->tf_flags & IDE_TFLAG_IN_HOB_NSECT) - tf->hob_nsect = tx4939ide_inb(io_ports->nsect_addr); + tf->hob_nsect = tx4939ide_inb(io_ports->nsect_addr); if (cmd->tf_flags & IDE_TFLAG_IN_HOB_LBAL) - tf->hob_lbal = tx4939ide_inb(io_ports->lbal_addr); + tf->hob_lbal = tx4939ide_inb(io_ports->lbal_addr); if (cmd->tf_flags & IDE_TFLAG_IN_HOB_LBAM) - tf->hob_lbam = tx4939ide_inb(io_ports->lbam_addr); + tf->hob_lbam = tx4939ide_inb(io_ports->lbam_addr); if (cmd->tf_flags & IDE_TFLAG_IN_HOB_LBAH) - tf->hob_lbah = tx4939ide_inb(io_ports->lbah_addr); + tf->hob_lbah = tx4939ide_inb(io_ports->lbah_addr); } } diff --git a/include/linux/ide.h b/include/linux/ide.h index ccb70abe991b..e919c865f0c7 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -265,7 +265,7 @@ enum { IDE_TFLAG_WRITE = (1 << 12), IDE_TFLAG_CUSTOM_HANDLER = (1 << 13), IDE_TFLAG_DMA_PIO_FALLBACK = (1 << 14), - IDE_TFLAG_IN_HOB_FEATURE = (1 << 15), + IDE_TFLAG_IN_HOB_ERROR = (1 << 15), IDE_TFLAG_IN_HOB_NSECT = (1 << 16), IDE_TFLAG_IN_HOB_LBAL = (1 << 17), IDE_TFLAG_IN_HOB_LBAM = (1 << 18), @@ -273,10 +273,10 @@ enum { IDE_TFLAG_IN_HOB_LBA = IDE_TFLAG_IN_HOB_LBAL | IDE_TFLAG_IN_HOB_LBAM | IDE_TFLAG_IN_HOB_LBAH, - IDE_TFLAG_IN_HOB = IDE_TFLAG_IN_HOB_FEATURE | + IDE_TFLAG_IN_HOB = IDE_TFLAG_IN_HOB_ERROR | IDE_TFLAG_IN_HOB_NSECT | IDE_TFLAG_IN_HOB_LBA, - IDE_TFLAG_IN_FEATURE = (1 << 20), + IDE_TFLAG_IN_ERROR = (1 << 20), IDE_TFLAG_IN_NSECT = (1 << 21), IDE_TFLAG_IN_LBAL = (1 << 22), IDE_TFLAG_IN_LBAM = (1 << 23), @@ -310,8 +310,12 @@ enum { struct ide_taskfile { u8 hob_data; /* 0: high data byte (for TASKFILE IOCTL) */ + /* 1-5: additional data to support LBA48 */ + union { + u8 hob_error; /* read: error */ + u8 hob_feature; /* write: feature */ + }; - u8 hob_feature; /* 1-5: additional data to support LBA48 */ u8 hob_nsect; u8 hob_lbal; u8 hob_lbam; From deae17fd5d147ae65e277905343b7ea578574d12 Mon Sep 17 00:00:00 2001 From: Sergei Shtylyov Date: Tue, 31 Mar 2009 20:15:31 +0200 Subject: [PATCH 281/478] ide-io-std: shorten ide_{in|out}put_data() ide_{in|out|put_data() can be somewhat shortened by merging the paths doing 16-bit I/O... Signed-off-by: Sergei Shtylyov Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/ide-io-std.c | 60 +++++++++++++++++++--------------------- 1 file changed, 28 insertions(+), 32 deletions(-) diff --git a/drivers/ide/ide-io-std.c b/drivers/ide/ide-io-std.c index 31f5c5f4c093..96a537da8925 100644 --- a/drivers/ide/ide-io-std.c +++ b/drivers/ide/ide-io-std.c @@ -216,11 +216,10 @@ void ide_input_data(ide_drive_t *drive, struct ide_cmd *cmd, void *buf, ide_hwif_t *hwif = drive->hwif; struct ide_io_ports *io_ports = &hwif->io_ports; unsigned long data_addr = io_ports->data_addr; + unsigned int words = (len + 1) >> 1; u8 io_32bit = drive->io_32bit; u8 mmio = (hwif->host_flags & IDE_HFLAG_MMIO) ? 1 : 0; - len++; - if (io_32bit) { unsigned long uninitialized_var(flags); @@ -229,27 +228,26 @@ void ide_input_data(ide_drive_t *drive, struct ide_cmd *cmd, void *buf, ata_vlb_sync(io_ports->nsect_addr); } + words >>= 1; if (mmio) - __ide_mm_insl((void __iomem *)data_addr, buf, len / 4); + __ide_mm_insl((void __iomem *)data_addr, buf, words); else - insl(data_addr, buf, len / 4); + insl(data_addr, buf, words); if ((io_32bit & 2) && !mmio) local_irq_restore(flags); - if ((len & 3) >= 2) { - if (mmio) - __ide_mm_insw((void __iomem *)data_addr, - (u8 *)buf + (len & ~3), 1); - else - insw(data_addr, (u8 *)buf + (len & ~3), 1); - } - } else { - if (mmio) - __ide_mm_insw((void __iomem *)data_addr, buf, len / 2); - else - insw(data_addr, buf, len / 2); + if (((len + 1) & 3) < 2) + return; + + buf += len & ~3; + words = 1; } + + if (mmio) + __ide_mm_insw((void __iomem *)data_addr, buf, words); + else + insw(data_addr, buf, words); } EXPORT_SYMBOL_GPL(ide_input_data); @@ -262,11 +260,10 @@ void ide_output_data(ide_drive_t *drive, struct ide_cmd *cmd, void *buf, ide_hwif_t *hwif = drive->hwif; struct ide_io_ports *io_ports = &hwif->io_ports; unsigned long data_addr = io_ports->data_addr; + unsigned int words = (len + 1) >> 1; u8 io_32bit = drive->io_32bit; u8 mmio = (hwif->host_flags & IDE_HFLAG_MMIO) ? 1 : 0; - len++; - if (io_32bit) { unsigned long uninitialized_var(flags); @@ -275,27 +272,26 @@ void ide_output_data(ide_drive_t *drive, struct ide_cmd *cmd, void *buf, ata_vlb_sync(io_ports->nsect_addr); } + words >>= 1; if (mmio) - __ide_mm_outsl((void __iomem *)data_addr, buf, len / 4); + __ide_mm_outsl((void __iomem *)data_addr, buf, words); else - outsl(data_addr, buf, len / 4); + outsl(data_addr, buf, words); if ((io_32bit & 2) && !mmio) local_irq_restore(flags); - if ((len & 3) >= 2) { - if (mmio) - __ide_mm_outsw((void __iomem *)data_addr, - (u8 *)buf + (len & ~3), 1); - else - outsw(data_addr, (u8 *)buf + (len & ~3), 1); - } - } else { - if (mmio) - __ide_mm_outsw((void __iomem *)data_addr, buf, len / 2); - else - outsw(data_addr, buf, len / 2); + if (((len + 1) & 3) < 2) + return; + + buf += len & ~3; + words = 1; } + + if (mmio) + __ide_mm_outsw((void __iomem *)data_addr, buf, words); + else + outsw(data_addr, buf, words); } EXPORT_SYMBOL_GPL(ide_output_data); From bac08cee93f9cb37b40ecfa8eaf1f6d8daf3909b Mon Sep 17 00:00:00 2001 From: Sergei Shtylyov Date: Tue, 31 Mar 2009 20:15:31 +0200 Subject: [PATCH 282/478] ide: call {in|out}put_data() methods from tf_{read|load}() methods (take 2) Handle IDE_FTFLAG_{IN|OUT}_DATA flags in tf_{read|load}() methods by calling {in|out}put_data() methods to transfer 2 bytes -- this will allow us to move that handling out of those methods altogether... Signed-off-by: Sergei Shtylyov Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/at91_ide.c | 13 +++++++------ drivers/ide/ide-h8300.c | 15 ++++++++++----- drivers/ide/ide-io-std.c | 18 ++++++------------ drivers/ide/ns87415.c | 8 +++++--- drivers/ide/scc_pata.c | 16 ++++++++++------ drivers/ide/tx4938ide.c | 15 +++++++-------- drivers/ide/tx4939ide.c | 15 +++++++-------- 7 files changed, 52 insertions(+), 48 deletions(-) diff --git a/drivers/ide/at91_ide.c b/drivers/ide/at91_ide.c index 9dce793d93b4..b7be66d600f8 100644 --- a/drivers/ide/at91_ide.c +++ b/drivers/ide/at91_ide.c @@ -196,9 +196,9 @@ static void at91_ide_tf_load(ide_drive_t *drive, struct ide_cmd *cmd) HIHI = 0xFF; if (cmd->ftf_flags & IDE_FTFLAG_OUT_DATA) { - u16 data = (tf->hob_data << 8) | tf->data; + u8 data[2] = { tf->data, tf->hob_data }; - at91_ide_output_data(drive, NULL, &data, 2); + at91_ide_output_data(drive, cmd, data, 2); } if (cmd->tf_flags & IDE_TFLAG_OUT_HOB_FEATURE) @@ -234,11 +234,12 @@ static void at91_ide_tf_read(ide_drive_t *drive, struct ide_cmd *cmd) struct ide_taskfile *tf = &cmd->tf; if (cmd->ftf_flags & IDE_FTFLAG_IN_DATA) { - u16 data; + u8 data[2]; - at91_ide_input_data(drive, NULL, &data, 2); - tf->data = data & 0xff; - tf->hob_data = (data >> 8) & 0xff; + at91_ide_input_data(drive, cmd, data, 2); + + tf->data = data[0]; + tf->hob_data = data[1]; } /* be sure we're looking at the low order bits */ diff --git a/drivers/ide/ide-h8300.c b/drivers/ide/ide-h8300.c index 1d45cd5b6a1c..6cb1785d32e6 100644 --- a/drivers/ide/ide-h8300.c +++ b/drivers/ide/ide-h8300.c @@ -54,8 +54,11 @@ static void h8300_tf_load(ide_drive_t *drive, struct ide_cmd *cmd) if (cmd->ftf_flags & IDE_FTFLAG_FLAGGED) HIHI = 0xFF; - if (cmd->ftf_flags & IDE_FTFLAG_OUT_DATA) - mm_outw((tf->hob_data << 8) | tf->data, io_ports->data_addr); + if (cmd->ftf_flags & IDE_FTFLAG_OUT_DATA) { + u8 data[2] = { tf->data, tf->hob_data }; + + h8300_output_data(drive, cmd, data, 2); + } if (cmd->tf_flags & IDE_TFLAG_OUT_HOB_FEATURE) outb(tf->hob_feature, io_ports->feature_addr); @@ -91,10 +94,12 @@ static void h8300_tf_read(ide_drive_t *drive, struct ide_cmd *cmd) struct ide_taskfile *tf = &cmd->tf; if (cmd->ftf_flags & IDE_FTFLAG_IN_DATA) { - u16 data = mm_inw(io_ports->data_addr); + u8 data[2]; - tf->data = data & 0xff; - tf->hob_data = (data >> 8) & 0xff; + h8300_input_data(drive, cmd, data, 2); + + tf->data = data[0]; + tf->hob_data = data[1]; } /* be sure we're looking at the low order bits */ diff --git a/drivers/ide/ide-io-std.c b/drivers/ide/ide-io-std.c index 96a537da8925..f06940df255e 100644 --- a/drivers/ide/ide-io-std.c +++ b/drivers/ide/ide-io-std.c @@ -91,12 +91,9 @@ void ide_tf_load(ide_drive_t *drive, struct ide_cmd *cmd) HIHI = 0xFF; if (cmd->ftf_flags & IDE_FTFLAG_OUT_DATA) { - u16 data = (tf->hob_data << 8) | tf->data; + u8 data[2] = { tf->data, tf->hob_data }; - if (mmio) - writew(data, (void __iomem *)io_ports->data_addr); - else - outw(data, io_ports->data_addr); + ide_output_data(drive, cmd, data, 2); } if (cmd->tf_flags & IDE_TFLAG_OUT_HOB_FEATURE) @@ -145,15 +142,12 @@ void ide_tf_read(ide_drive_t *drive, struct ide_cmd *cmd) } if (cmd->ftf_flags & IDE_FTFLAG_IN_DATA) { - u16 data; + u8 data[2]; - if (mmio) - data = readw((void __iomem *)io_ports->data_addr); - else - data = inw(io_ports->data_addr); + ide_input_data(drive, cmd, data, 2); - tf->data = data & 0xff; - tf->hob_data = (data >> 8) & 0xff; + tf->data = data[0]; + tf->hob_data = data[1]; } /* be sure we're looking at the low order bits */ diff --git a/drivers/ide/ns87415.c b/drivers/ide/ns87415.c index 0a6cf74c3265..7d64bc079fa8 100644 --- a/drivers/ide/ns87415.c +++ b/drivers/ide/ns87415.c @@ -67,10 +67,12 @@ static void superio_tf_read(ide_drive_t *drive, struct ide_cmd *cmd) struct ide_taskfile *tf = &cmd->tf; if (cmd->ftf_flags & IDE_FTFLAG_IN_DATA) { - u16 data = inw(io_ports->data_addr); + u8 data[2]; - tf->data = data & 0xff; - tf->hob_data = (data >> 8) & 0xff; + ide_input_data(drive, cmd, data, 2); + + tf->data = data[0]; + tf->hob_data = data[1]; } /* be sure we're looking at the low order bits */ diff --git a/drivers/ide/scc_pata.c b/drivers/ide/scc_pata.c index ea0a9752c6f9..9c47cbf8c506 100644 --- a/drivers/ide/scc_pata.c +++ b/drivers/ide/scc_pata.c @@ -656,9 +656,11 @@ static void scc_tf_load(ide_drive_t *drive, struct ide_cmd *cmd) if (cmd->ftf_flags & IDE_FTFLAG_FLAGGED) HIHI = 0xFF; - if (cmd->ftf_flags & IDE_FTFLAG_OUT_DATA) - out_be32((void *)io_ports->data_addr, - (tf->hob_data << 8) | tf->data); + if (cmd->ftf_flags & IDE_FTFLAG_OUT_DATA) { + u8 data[2] = { tf->data, tf->hob_data }; + + scc_output_data(drive, NULL, data, 2); + } if (cmd->tf_flags & IDE_TFLAG_OUT_HOB_FEATURE) scc_ide_outb(tf->hob_feature, io_ports->feature_addr); @@ -693,10 +695,12 @@ static void scc_tf_read(ide_drive_t *drive, struct ide_cmd *cmd) struct ide_taskfile *tf = &cmd->tf; if (cmd->ftf_flags & IDE_FTFLAG_IN_DATA) { - u16 data = (u16)in_be32((void *)io_ports->data_addr); + u8 data[2]; - tf->data = data & 0xff; - tf->hob_data = (data >> 8) & 0xff; + scc_input_data(drive, cmd, data, 2); + + tf->data = data[0]; + tf->hob_data = data[1]; } /* be sure we're looking at the low order bits */ diff --git a/drivers/ide/tx4938ide.c b/drivers/ide/tx4938ide.c index 606c37f5267d..c075464308a0 100644 --- a/drivers/ide/tx4938ide.c +++ b/drivers/ide/tx4938ide.c @@ -93,10 +93,9 @@ static void tx4938ide_tf_load(ide_drive_t *drive, struct ide_cmd *cmd) HIHI = 0xFF; if (cmd->ftf_flags & IDE_FTFLAG_OUT_DATA) { - u16 data = (tf->hob_data << 8) | tf->data; + u8 data[2] = { tf->data, tf->hob_data }; - /* no endian swap */ - __raw_writew(data, (void __iomem *)io_ports->data_addr); + hwif->tp_ops->output_data(drive, cmd, data, 2); } if (cmd->tf_flags & IDE_TFLAG_OUT_HOB_FEATURE) @@ -133,12 +132,12 @@ static void tx4938ide_tf_read(ide_drive_t *drive, struct ide_cmd *cmd) struct ide_taskfile *tf = &cmd->tf; if (cmd->ftf_flags & IDE_FTFLAG_IN_DATA) { - u16 data; + u8 data[2]; - /* no endian swap */ - data = __raw_readw((void __iomem *)io_ports->data_addr); - tf->data = data & 0xff; - tf->hob_data = (data >> 8) & 0xff; + hwif->tp_ops->input_data(drive, cmd, data, 2); + + tf->data = data[0]; + tf->hob_data = data[1]; } /* be sure we're looking at the low order bits */ diff --git a/drivers/ide/tx4939ide.c b/drivers/ide/tx4939ide.c index f1e9da71110c..c350d0c7ba4a 100644 --- a/drivers/ide/tx4939ide.c +++ b/drivers/ide/tx4939ide.c @@ -458,10 +458,9 @@ static void tx4939ide_tf_load(ide_drive_t *drive, struct ide_cmd *cmd) HIHI = 0xFF; if (cmd->ftf_flags & IDE_FTFLAG_OUT_DATA) { - u16 data = (tf->hob_data << 8) | tf->data; + u8 data[2] = { tf->data, tf->hob_data }; - /* no endian swap */ - __raw_writew(data, (void __iomem *)io_ports->data_addr); + hwif->tp_ops->output_data(drive, cmd, data, 2); } if (cmd->tf_flags & IDE_TFLAG_OUT_HOB_FEATURE) @@ -500,12 +499,12 @@ static void tx4939ide_tf_read(ide_drive_t *drive, struct ide_cmd *cmd) struct ide_taskfile *tf = &cmd->tf; if (cmd->ftf_flags & IDE_FTFLAG_IN_DATA) { - u16 data; + u8 data[2]; - /* no endian swap */ - data = __raw_readw((void __iomem *)io_ports->data_addr); - tf->data = data & 0xff; - tf->hob_data = (data >> 8) & 0xff; + hwif->tp_ops->input_data(drive, cmd, data, 2); + + tf->data = data[0]; + tf->hob_data = data[1]; } /* be sure we're looking at the low order bits */ From 35218d1ca808ed19b8c6f079ce91872b3deb2219 Mon Sep 17 00:00:00 2001 From: Sergei Shtylyov Date: Tue, 31 Mar 2009 20:15:31 +0200 Subject: [PATCH 283/478] ide: move data register access out of tf_{read|load}() methods (take 2) Move IDE_FTFLAG_{IN|OUT}_DATA flag handling out of tf_{read|load}() methods into the only two functions where these flags actually need to be handled: do_rw_taskfile() and ide_complete_cmd()... Signed-off-by: Sergei Shtylyov Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/at91_ide.c | 15 --------------- drivers/ide/ide-h8300.c | 15 --------------- drivers/ide/ide-io-std.c | 15 --------------- drivers/ide/ide-io.c | 12 +++++++++++- drivers/ide/ide-taskfile.c | 6 ++++++ drivers/ide/ns87415.c | 9 --------- drivers/ide/scc_pata.c | 15 --------------- drivers/ide/tx4938ide.c | 15 --------------- drivers/ide/tx4939ide.c | 15 --------------- 9 files changed, 17 insertions(+), 100 deletions(-) diff --git a/drivers/ide/at91_ide.c b/drivers/ide/at91_ide.c index b7be66d600f8..04b39ff02d76 100644 --- a/drivers/ide/at91_ide.c +++ b/drivers/ide/at91_ide.c @@ -195,12 +195,6 @@ static void at91_ide_tf_load(ide_drive_t *drive, struct ide_cmd *cmd) if (cmd->ftf_flags & IDE_FTFLAG_FLAGGED) HIHI = 0xFF; - if (cmd->ftf_flags & IDE_FTFLAG_OUT_DATA) { - u8 data[2] = { tf->data, tf->hob_data }; - - at91_ide_output_data(drive, cmd, data, 2); - } - if (cmd->tf_flags & IDE_TFLAG_OUT_HOB_FEATURE) ide_mm_outb(tf->hob_feature, io_ports->feature_addr); if (cmd->tf_flags & IDE_TFLAG_OUT_HOB_NSECT) @@ -233,15 +227,6 @@ static void at91_ide_tf_read(ide_drive_t *drive, struct ide_cmd *cmd) struct ide_io_ports *io_ports = &hwif->io_ports; struct ide_taskfile *tf = &cmd->tf; - if (cmd->ftf_flags & IDE_FTFLAG_IN_DATA) { - u8 data[2]; - - at91_ide_input_data(drive, cmd, data, 2); - - tf->data = data[0]; - tf->hob_data = data[1]; - } - /* be sure we're looking at the low order bits */ ide_mm_outb(ATA_DEVCTL_OBS, io_ports->ctl_addr); diff --git a/drivers/ide/ide-h8300.c b/drivers/ide/ide-h8300.c index 6cb1785d32e6..8541a9abd7ac 100644 --- a/drivers/ide/ide-h8300.c +++ b/drivers/ide/ide-h8300.c @@ -54,12 +54,6 @@ static void h8300_tf_load(ide_drive_t *drive, struct ide_cmd *cmd) if (cmd->ftf_flags & IDE_FTFLAG_FLAGGED) HIHI = 0xFF; - if (cmd->ftf_flags & IDE_FTFLAG_OUT_DATA) { - u8 data[2] = { tf->data, tf->hob_data }; - - h8300_output_data(drive, cmd, data, 2); - } - if (cmd->tf_flags & IDE_TFLAG_OUT_HOB_FEATURE) outb(tf->hob_feature, io_ports->feature_addr); if (cmd->tf_flags & IDE_TFLAG_OUT_HOB_NSECT) @@ -93,15 +87,6 @@ static void h8300_tf_read(ide_drive_t *drive, struct ide_cmd *cmd) struct ide_io_ports *io_ports = &hwif->io_ports; struct ide_taskfile *tf = &cmd->tf; - if (cmd->ftf_flags & IDE_FTFLAG_IN_DATA) { - u8 data[2]; - - h8300_input_data(drive, cmd, data, 2); - - tf->data = data[0]; - tf->hob_data = data[1]; - } - /* be sure we're looking at the low order bits */ outb(ATA_DEVCTL_OBS, io_ports->ctl_addr); diff --git a/drivers/ide/ide-io-std.c b/drivers/ide/ide-io-std.c index f06940df255e..7f77bb7db488 100644 --- a/drivers/ide/ide-io-std.c +++ b/drivers/ide/ide-io-std.c @@ -90,12 +90,6 @@ void ide_tf_load(ide_drive_t *drive, struct ide_cmd *cmd) if (cmd->ftf_flags & IDE_FTFLAG_FLAGGED) HIHI = 0xFF; - if (cmd->ftf_flags & IDE_FTFLAG_OUT_DATA) { - u8 data[2] = { tf->data, tf->hob_data }; - - ide_output_data(drive, cmd, data, 2); - } - if (cmd->tf_flags & IDE_TFLAG_OUT_HOB_FEATURE) tf_outb(tf->hob_feature, io_ports->feature_addr); if (cmd->tf_flags & IDE_TFLAG_OUT_HOB_NSECT) @@ -141,15 +135,6 @@ void ide_tf_read(ide_drive_t *drive, struct ide_cmd *cmd) tf_inb = ide_inb; } - if (cmd->ftf_flags & IDE_FTFLAG_IN_DATA) { - u8 data[2]; - - ide_input_data(drive, cmd, data, 2); - - tf->data = data[0]; - tf->hob_data = data[1]; - } - /* be sure we're looking at the low order bits */ tf_outb(ATA_DEVCTL_OBS, io_ports->ctl_addr); diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c index 5b57905a7d71..5589dce88674 100644 --- a/drivers/ide/ide-io.c +++ b/drivers/ide/ide-io.c @@ -73,6 +73,7 @@ EXPORT_SYMBOL_GPL(ide_end_rq); void ide_complete_cmd(ide_drive_t *drive, struct ide_cmd *cmd, u8 stat, u8 err) { + const struct ide_tp_ops *tp_ops = drive->hwif->tp_ops; struct ide_taskfile *tf = &cmd->tf; struct request *rq = cmd->rq; u8 tf_cmd = tf->command; @@ -80,7 +81,16 @@ void ide_complete_cmd(ide_drive_t *drive, struct ide_cmd *cmd, u8 stat, u8 err) tf->error = err; tf->status = stat; - drive->hwif->tp_ops->tf_read(drive, cmd); + if (cmd->ftf_flags & IDE_FTFLAG_IN_DATA) { + u8 data[2]; + + tp_ops->input_data(drive, cmd, data, 2); + + tf->data = data[0]; + tf->hob_data = data[1]; + } + + tp_ops->tf_read(drive, cmd); if ((cmd->tf_flags & IDE_TFLAG_CUSTOM_HANDLER) && tf_cmd == ATA_CMD_IDLEIMMEDIATE) { diff --git a/drivers/ide/ide-taskfile.c b/drivers/ide/ide-taskfile.c index 47f13cd11031..243421ce40d0 100644 --- a/drivers/ide/ide-taskfile.c +++ b/drivers/ide/ide-taskfile.c @@ -82,6 +82,12 @@ ide_startstop_t do_rw_taskfile(ide_drive_t *drive, struct ide_cmd *orig_cmd) ide_tf_dump(drive->name, tf); tp_ops->write_devctl(hwif, ATA_DEVCTL_OBS); SELECT_MASK(drive, 0); + + if (cmd->ftf_flags & IDE_FTFLAG_OUT_DATA) { + u8 data[2] = { tf->data, tf->hob_data }; + + tp_ops->output_data(drive, cmd, data, 2); + } tp_ops->tf_load(drive, cmd); } diff --git a/drivers/ide/ns87415.c b/drivers/ide/ns87415.c index 7d64bc079fa8..9f6dff83b141 100644 --- a/drivers/ide/ns87415.c +++ b/drivers/ide/ns87415.c @@ -66,15 +66,6 @@ static void superio_tf_read(ide_drive_t *drive, struct ide_cmd *cmd) struct ide_io_ports *io_ports = &drive->hwif->io_ports; struct ide_taskfile *tf = &cmd->tf; - if (cmd->ftf_flags & IDE_FTFLAG_IN_DATA) { - u8 data[2]; - - ide_input_data(drive, cmd, data, 2); - - tf->data = data[0]; - tf->hob_data = data[1]; - } - /* be sure we're looking at the low order bits */ outb(ATA_DEVCTL_OBS, io_ports->ctl_addr); diff --git a/drivers/ide/scc_pata.c b/drivers/ide/scc_pata.c index 9c47cbf8c506..97f8e0ef21b1 100644 --- a/drivers/ide/scc_pata.c +++ b/drivers/ide/scc_pata.c @@ -656,12 +656,6 @@ static void scc_tf_load(ide_drive_t *drive, struct ide_cmd *cmd) if (cmd->ftf_flags & IDE_FTFLAG_FLAGGED) HIHI = 0xFF; - if (cmd->ftf_flags & IDE_FTFLAG_OUT_DATA) { - u8 data[2] = { tf->data, tf->hob_data }; - - scc_output_data(drive, NULL, data, 2); - } - if (cmd->tf_flags & IDE_TFLAG_OUT_HOB_FEATURE) scc_ide_outb(tf->hob_feature, io_ports->feature_addr); if (cmd->tf_flags & IDE_TFLAG_OUT_HOB_NSECT) @@ -694,15 +688,6 @@ static void scc_tf_read(ide_drive_t *drive, struct ide_cmd *cmd) struct ide_io_ports *io_ports = &drive->hwif->io_ports; struct ide_taskfile *tf = &cmd->tf; - if (cmd->ftf_flags & IDE_FTFLAG_IN_DATA) { - u8 data[2]; - - scc_input_data(drive, cmd, data, 2); - - tf->data = data[0]; - tf->hob_data = data[1]; - } - /* be sure we're looking at the low order bits */ scc_ide_outb(ATA_DEVCTL_OBS, io_ports->ctl_addr); diff --git a/drivers/ide/tx4938ide.c b/drivers/ide/tx4938ide.c index c075464308a0..be391b615963 100644 --- a/drivers/ide/tx4938ide.c +++ b/drivers/ide/tx4938ide.c @@ -92,12 +92,6 @@ static void tx4938ide_tf_load(ide_drive_t *drive, struct ide_cmd *cmd) if (cmd->ftf_flags & IDE_FTFLAG_FLAGGED) HIHI = 0xFF; - if (cmd->ftf_flags & IDE_FTFLAG_OUT_DATA) { - u8 data[2] = { tf->data, tf->hob_data }; - - hwif->tp_ops->output_data(drive, cmd, data, 2); - } - if (cmd->tf_flags & IDE_TFLAG_OUT_HOB_FEATURE) tx4938ide_outb(tf->hob_feature, io_ports->feature_addr); if (cmd->tf_flags & IDE_TFLAG_OUT_HOB_NSECT) @@ -131,15 +125,6 @@ static void tx4938ide_tf_read(ide_drive_t *drive, struct ide_cmd *cmd) struct ide_io_ports *io_ports = &hwif->io_ports; struct ide_taskfile *tf = &cmd->tf; - if (cmd->ftf_flags & IDE_FTFLAG_IN_DATA) { - u8 data[2]; - - hwif->tp_ops->input_data(drive, cmd, data, 2); - - tf->data = data[0]; - tf->hob_data = data[1]; - } - /* be sure we're looking at the low order bits */ tx4938ide_outb(ATA_DEVCTL_OBS, io_ports->ctl_addr); diff --git a/drivers/ide/tx4939ide.c b/drivers/ide/tx4939ide.c index c350d0c7ba4a..5a614d1c94f1 100644 --- a/drivers/ide/tx4939ide.c +++ b/drivers/ide/tx4939ide.c @@ -457,12 +457,6 @@ static void tx4939ide_tf_load(ide_drive_t *drive, struct ide_cmd *cmd) if (cmd->ftf_flags & IDE_FTFLAG_FLAGGED) HIHI = 0xFF; - if (cmd->ftf_flags & IDE_FTFLAG_OUT_DATA) { - u8 data[2] = { tf->data, tf->hob_data }; - - hwif->tp_ops->output_data(drive, cmd, data, 2); - } - if (cmd->tf_flags & IDE_TFLAG_OUT_HOB_FEATURE) tx4939ide_outb(tf->hob_feature, io_ports->feature_addr); if (cmd->tf_flags & IDE_TFLAG_OUT_HOB_NSECT) @@ -498,15 +492,6 @@ static void tx4939ide_tf_read(ide_drive_t *drive, struct ide_cmd *cmd) struct ide_io_ports *io_ports = &hwif->io_ports; struct ide_taskfile *tf = &cmd->tf; - if (cmd->ftf_flags & IDE_FTFLAG_IN_DATA) { - u8 data[2]; - - hwif->tp_ops->input_data(drive, cmd, data, 2); - - tf->data = data[0]; - tf->hob_data = data[1]; - } - /* be sure we're looking at the low order bits */ tx4939ide_outb(ATA_DEVCTL_OBS, io_ports->ctl_addr); From 0f861e8c47ede537a8ad280c61d5d00d541f04db Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Tue, 31 Mar 2009 20:15:31 +0200 Subject: [PATCH 284/478] MAINTAINERS: move old ide-{floppy,tape} entries to CREDITS (take 2) Ben Hutchings noticed that MAINTAINERS somehow still contain old ide-{floppy,tape} entries. Fix it by moving them to CREDITS (kudos to Gadi and Paul for all the early hard work on ide-{floppy,tape}). v2: Rename IDE/ATAPI CDROM DRIVER entry to IDE/ATAPI DRIVERS one. Cc: Gadi Oxman Cc: Paul Bristow Acked-by: Borislav Petkov Signed-off-by: Bartlomiej Zolnierkiewicz --- CREDITS | 9 +++++++++ MAINTAINERS | 15 +-------------- 2 files changed, 10 insertions(+), 14 deletions(-) diff --git a/CREDITS b/CREDITS index e8b7d36611e5..9a93e3e26d70 100644 --- a/CREDITS +++ b/CREDITS @@ -495,6 +495,11 @@ S: Kopmansg 2 S: 411 13 Goteborg S: Sweden +N: Paul Bristow +E: paul@paulbristow.net +W: http://paulbristow.net/linux/idefloppy.html +D: Maintainer of IDE/ATAPI floppy driver + N: Dominik Brodowski E: linux@brodo.de W: http://www.brodo.de/ @@ -2642,6 +2647,10 @@ S: C/ Mieses 20, 9-B S: Valladolid 47009 S: Spain +N: Gadi Oxman +E: gadio@netvision.net.il +D: Original author and maintainer of IDE/ATAPI floppy/tape drivers + N: Greg Page E: gpage@sovereign.org D: IPX development and support diff --git a/MAINTAINERS b/MAINTAINERS index c5f4e9d27b64..d20c786baf2e 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2201,25 +2201,12 @@ L: linux-ide@vger.kernel.org T: quilt kernel.org/pub/linux/kernel/people/bart/pata-2.6/ S: Maintained -IDE/ATAPI CDROM DRIVER +IDE/ATAPI DRIVERS P: Borislav Petkov M: petkovbb@gmail.com L: linux-ide@vger.kernel.org S: Maintained -IDE/ATAPI FLOPPY DRIVERS -P: Paul Bristow -M: Paul Bristow -W: http://paulbristow.net/linux/idefloppy.html -L: linux-kernel@vger.kernel.org -S: Maintained - -IDE/ATAPI TAPE DRIVERS -P: Gadi Oxman -M: Gadi Oxman -L: linux-kernel@vger.kernel.org -S: Maintained - IDLE-I7300 P: Andy Henroid M: andrew.d.henroid@intel.com From abb596b25edac1ec1acc4ef53df190771661c3d2 Mon Sep 17 00:00:00 2001 From: Sergei Shtylyov Date: Tue, 31 Mar 2009 20:15:32 +0200 Subject: [PATCH 285/478] ide: turn selectproc() method into dev_select() method (take 5) Turn selectproc() method into dev_select() method by teaching it to write to the device register and moving it from 'struct ide_port_ops' to 'struct ide_tp_ops'. Signed-off-by: Sergei Shtylyov Cc: benh@kernel.crashing.org Cc: petkovbb@gmail.com [bart: add ->dev_select to at91_ide.c and tx4939.c (__BIG_ENDIAN case)] Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/at91_ide.c | 1 + drivers/ide/au1xxx-ide.c | 1 + drivers/ide/falconide.c | 1 + drivers/ide/ht6560b.c | 20 ++++++++++++-- drivers/ide/ide-h8300.c | 1 + drivers/ide/ide-io-std.c | 13 +++++++++ drivers/ide/ide-iops.c | 12 +-------- drivers/ide/ns87415.c | 25 +++++++++++++---- drivers/ide/pmac.c | 58 +++++++++++++++++++++++++++------------- drivers/ide/q40ide.c | 1 + drivers/ide/qd65xx.c | 21 ++++++++++++--- drivers/ide/scc_pata.c | 1 + drivers/ide/sgiioc4.c | 1 + drivers/ide/trm290.c | 20 +++++++++++--- drivers/ide/tx4938ide.c | 1 + drivers/ide/tx4939ide.c | 4 ++- include/linux/ide.h | 6 ++--- 17 files changed, 139 insertions(+), 48 deletions(-) diff --git a/drivers/ide/at91_ide.c b/drivers/ide/at91_ide.c index 04b39ff02d76..8eda552326e9 100644 --- a/drivers/ide/at91_ide.c +++ b/drivers/ide/at91_ide.c @@ -283,6 +283,7 @@ static const struct ide_tp_ops at91_ide_tp_ops = { .read_altstatus = ide_read_altstatus, .write_devctl = ide_write_devctl, + .dev_select = ide_dev_select, .tf_load = at91_ide_tf_load, .tf_read = at91_ide_tf_read, diff --git a/drivers/ide/au1xxx-ide.c b/drivers/ide/au1xxx-ide.c index 2ca10d533dad..46013644c965 100644 --- a/drivers/ide/au1xxx-ide.c +++ b/drivers/ide/au1xxx-ide.c @@ -469,6 +469,7 @@ static const struct ide_tp_ops au1xxx_tp_ops = { .read_altstatus = ide_read_altstatus, .write_devctl = ide_write_devctl, + .dev_select = ide_dev_select, .tf_load = ide_tf_load, .tf_read = ide_tf_read, diff --git a/drivers/ide/falconide.c b/drivers/ide/falconide.c index 5063be85dc33..afa2af9a362b 100644 --- a/drivers/ide/falconide.c +++ b/drivers/ide/falconide.c @@ -91,6 +91,7 @@ static const struct ide_tp_ops falconide_tp_ops = { .read_altstatus = ide_read_altstatus, .write_devctl = ide_write_devctl, + .dev_select = ide_dev_select, .tf_load = ide_tf_load, .tf_read = ide_tf_read, diff --git a/drivers/ide/ht6560b.c b/drivers/ide/ht6560b.c index c7e5c2246b79..2fb0f2965009 100644 --- a/drivers/ide/ht6560b.c +++ b/drivers/ide/ht6560b.c @@ -103,7 +103,7 @@ /* * This routine is invoked from ide.c to prepare for access to a given drive. */ -static void ht6560b_selectproc (ide_drive_t *drive) +static void ht6560b_dev_select(ide_drive_t *drive) { ide_hwif_t *hwif = drive->hwif; unsigned long flags; @@ -143,6 +143,8 @@ static void ht6560b_selectproc (ide_drive_t *drive) #endif } local_irq_restore(flags); + + outb(drive->select | ATA_DEVICE_OBS, hwif->io_ports.device_addr); } /* @@ -305,15 +307,29 @@ static int probe_ht6560b; module_param_named(probe, probe_ht6560b, bool, 0); MODULE_PARM_DESC(probe, "probe for HT6560B chipset"); +static const struct ide_tp_ops ht6560b_tp_ops = { + .exec_command = ide_exec_command, + .read_status = ide_read_status, + .read_altstatus = ide_read_altstatus, + .write_devctl = ide_write_devctl, + + .dev_select = ht6560b_dev_select, + .tf_load = ide_tf_load, + .tf_read = ide_tf_read, + + .input_data = ide_input_data, + .output_data = ide_output_data, +}; + static const struct ide_port_ops ht6560b_port_ops = { .init_dev = ht6560b_init_dev, .set_pio_mode = ht6560b_set_pio_mode, - .selectproc = ht6560b_selectproc, }; static const struct ide_port_info ht6560b_port_info __initdata = { .name = DRV_NAME, .chipset = ide_ht6560b, + .tp_ops = &ht6560b_tp_ops, .port_ops = &ht6560b_port_ops, .host_flags = IDE_HFLAG_SERIALIZE | /* is this needed? */ IDE_HFLAG_NO_DMA | diff --git a/drivers/ide/ide-h8300.c b/drivers/ide/ide-h8300.c index 8541a9abd7ac..dac9a6d44963 100644 --- a/drivers/ide/ide-h8300.c +++ b/drivers/ide/ide-h8300.c @@ -151,6 +151,7 @@ static const struct ide_tp_ops h8300_tp_ops = { .read_altstatus = ide_read_altstatus, .write_devctl = ide_write_devctl, + .dev_select = ide_dev_select, .tf_load = h8300_tf_load, .tf_read = h8300_tf_read, diff --git a/drivers/ide/ide-io-std.c b/drivers/ide/ide-io-std.c index 7f77bb7db488..9cac281d82c4 100644 --- a/drivers/ide/ide-io-std.c +++ b/drivers/ide/ide-io-std.c @@ -73,6 +73,18 @@ void ide_write_devctl(ide_hwif_t *hwif, u8 ctl) } EXPORT_SYMBOL_GPL(ide_write_devctl); +void ide_dev_select(ide_drive_t *drive) +{ + ide_hwif_t *hwif = drive->hwif; + u8 select = drive->select | ATA_DEVICE_OBS; + + if (hwif->host_flags & IDE_HFLAG_MMIO) + writeb(select, (void __iomem *)hwif->io_ports.device_addr); + else + outb(select, hwif->io_ports.device_addr); +} +EXPORT_SYMBOL_GPL(ide_dev_select); + void ide_tf_load(ide_drive_t *drive, struct ide_cmd *cmd) { ide_hwif_t *hwif = drive->hwif; @@ -280,6 +292,7 @@ const struct ide_tp_ops default_tp_ops = { .read_altstatus = ide_read_altstatus, .write_devctl = ide_write_devctl, + .dev_select = ide_dev_select, .tf_load = ide_tf_load, .tf_read = ide_tf_read, diff --git a/drivers/ide/ide-iops.c b/drivers/ide/ide-iops.c index 6f363a26700d..dfb0ec317fa3 100644 --- a/drivers/ide/ide-iops.c +++ b/drivers/ide/ide-iops.c @@ -29,17 +29,7 @@ void SELECT_DRIVE(ide_drive_t *drive) { - ide_hwif_t *hwif = drive->hwif; - const struct ide_port_ops *port_ops = hwif->port_ops; - struct ide_cmd cmd; - - if (port_ops && port_ops->selectproc) - port_ops->selectproc(drive); - - memset(&cmd, 0, sizeof(cmd)); - cmd.tf_flags = IDE_TFLAG_OUT_DEVICE; - - drive->hwif->tp_ops->tf_load(drive, &cmd); + drive->hwif->tp_ops->dev_select(drive); } void SELECT_MASK(ide_drive_t *drive, int mask) diff --git a/drivers/ide/ns87415.c b/drivers/ide/ns87415.c index 9f6dff83b141..af1b421eb450 100644 --- a/drivers/ide/ns87415.c +++ b/drivers/ide/ns87415.c @@ -98,12 +98,15 @@ static void superio_tf_read(ide_drive_t *drive, struct ide_cmd *cmd) } } +static void ns87415_dev_select(ide_drive_t *drive); + static const struct ide_tp_ops superio_tp_ops = { .exec_command = ide_exec_command, .read_status = superio_read_status, .read_altstatus = ide_read_altstatus, .write_devctl = ide_write_devctl, + .dev_select = ns87415_dev_select, .tf_load = ide_tf_load, .tf_read = superio_tf_read, @@ -182,10 +185,12 @@ static void ns87415_prepare_drive (ide_drive_t *drive, unsigned int use_dma) local_irq_restore(flags); } -static void ns87415_selectproc (ide_drive_t *drive) +static void ns87415_dev_select(ide_drive_t *drive) { ns87415_prepare_drive(drive, !!(drive->dev_flags & IDE_DFLAG_USING_DMA)); + + outb(drive->select | ATA_DEVICE_OBS, drive->hwif->io_ports.device_addr); } static void ns87415_dma_start(ide_drive_t *drive) @@ -229,7 +234,7 @@ static void __devinit init_hwif_ns87415 (ide_hwif_t *hwif) * Also, leave IRQ masked during drive probing, to prevent infinite * interrupts from a potentially floating INTA.. * - * IRQs get unmasked in selectproc when drive is first used. + * IRQs get unmasked in dev_select() when drive is first used. */ (void) pci_read_config_dword(dev, 0x40, &ctrl); (void) pci_read_config_byte(dev, 0x09, &progif); @@ -281,8 +286,18 @@ static void __devinit init_hwif_ns87415 (ide_hwif_t *hwif) outb(0x60, hwif->dma_base + ATA_DMA_STATUS); } -static const struct ide_port_ops ns87415_port_ops = { - .selectproc = ns87415_selectproc, +static const struct ide_tp_ops ns87415_tp_ops = { + .exec_command = ide_exec_command, + .read_status = ide_read_status, + .read_altstatus = ide_read_altstatus, + .write_devctl = ide_write_devctl, + + .dev_select = ns87415_dev_select, + .tf_load = ide_tf_load, + .tf_read = ide_tf_read, + + .input_data = ide_input_data, + .output_data = ide_output_data, }; static const struct ide_dma_ops ns87415_dma_ops = { @@ -299,7 +314,7 @@ static const struct ide_dma_ops ns87415_dma_ops = { static const struct ide_port_info ns87415_chipset __devinitdata = { .name = DRV_NAME, .init_hwif = init_hwif_ns87415, - .port_ops = &ns87415_port_ops, + .tp_ops = &ns87415_tp_ops, .dma_ops = &ns87415_dma_ops, .host_flags = IDE_HFLAG_TRUST_BIOS_FOR_DMA | IDE_HFLAG_NO_ATAPI_DMA, diff --git a/drivers/ide/pmac.c b/drivers/ide/pmac.c index 7aa45ea37eeb..24ce1f805cd7 100644 --- a/drivers/ide/pmac.c +++ b/drivers/ide/pmac.c @@ -404,8 +404,6 @@ kauai_lookup_timing(struct kauai_timing* table, int cycle_time) #define IDE_WAKEUP_DELAY (1*HZ) static int pmac_ide_init_dma(ide_hwif_t *, const struct ide_port_info *); -static void pmac_ide_selectproc(ide_drive_t *drive); -static void pmac_ide_kauai_selectproc(ide_drive_t *drive); #define PMAC_IDE_REG(x) \ ((void __iomem *)((drive)->hwif->io_ports.data_addr + (x))) @@ -415,8 +413,7 @@ static void pmac_ide_kauai_selectproc(ide_drive_t *drive); * timing register when selecting that unit. This version is for * ASICs with a single timing register */ -static void -pmac_ide_selectproc(ide_drive_t *drive) +static void pmac_ide_apply_timings(ide_drive_t *drive) { ide_hwif_t *hwif = drive->hwif; pmac_ide_hwif_t *pmif = @@ -434,8 +431,7 @@ pmac_ide_selectproc(ide_drive_t *drive) * timing register when selecting that unit. This version is for * ASICs with a dual timing register (Kauai) */ -static void -pmac_ide_kauai_selectproc(ide_drive_t *drive) +static void pmac_ide_kauai_apply_timings(ide_drive_t *drive) { ide_hwif_t *hwif = drive->hwif; pmac_ide_hwif_t *pmif = @@ -464,9 +460,25 @@ pmac_ide_do_update_timings(ide_drive_t *drive) if (pmif->kind == controller_sh_ata6 || pmif->kind == controller_un_ata6 || pmif->kind == controller_k2_ata6) - pmac_ide_kauai_selectproc(drive); + pmac_ide_kauai_apply_timings(drive); else - pmac_ide_selectproc(drive); + pmac_ide_apply_timings(drive); +} + +static void pmac_dev_select(ide_drive_t *drive) +{ + pmac_ide_apply_timings(drive); + + writeb(drive->select | ATA_DEVICE_OBS, + (void __iomem *)drive->hwif->io_ports.device_addr); +} + +static void pmac_kauai_dev_select(ide_drive_t *drive) +{ + pmac_ide_kauai_apply_timings(drive); + + writeb(drive->select | ATA_DEVICE_OBS, + (void __iomem *)drive->hwif->io_ports.device_addr); } static void pmac_exec_command(ide_hwif_t *hwif, u8 cmd) @@ -947,6 +959,7 @@ static const struct ide_tp_ops pmac_tp_ops = { .read_altstatus = ide_read_altstatus, .write_devctl = pmac_write_devctl, + .dev_select = pmac_dev_select, .tf_load = ide_tf_load, .tf_read = ide_tf_read, @@ -954,19 +967,24 @@ static const struct ide_tp_ops pmac_tp_ops = { .output_data = ide_output_data, }; -static const struct ide_port_ops pmac_ide_ata6_port_ops = { - .init_dev = pmac_ide_init_dev, - .set_pio_mode = pmac_ide_set_pio_mode, - .set_dma_mode = pmac_ide_set_dma_mode, - .selectproc = pmac_ide_kauai_selectproc, - .cable_detect = pmac_ide_cable_detect, +static const struct ide_tp_ops pmac_ata6_tp_ops = { + .exec_command = pmac_exec_command, + .read_status = ide_read_status, + .read_altstatus = ide_read_altstatus, + .write_devctl = pmac_write_devctl, + + .dev_select = pmac_kauai_dev_select, + .tf_load = ide_tf_load, + .tf_read = ide_tf_read, + + .input_data = ide_input_data, + .output_data = ide_output_data, }; static const struct ide_port_ops pmac_ide_ata4_port_ops = { .init_dev = pmac_ide_init_dev, .set_pio_mode = pmac_ide_set_pio_mode, .set_dma_mode = pmac_ide_set_dma_mode, - .selectproc = pmac_ide_selectproc, .cable_detect = pmac_ide_cable_detect, }; @@ -974,7 +992,6 @@ static const struct ide_port_ops pmac_ide_port_ops = { .init_dev = pmac_ide_init_dev, .set_pio_mode = pmac_ide_set_pio_mode, .set_dma_mode = pmac_ide_set_dma_mode, - .selectproc = pmac_ide_selectproc, }; static const struct ide_dma_ops pmac_dma_ops; @@ -1011,15 +1028,18 @@ static int __devinit pmac_ide_setup_device(pmac_ide_hwif_t *pmif, hw_regs_t *hw) pmif->broken_dma = pmif->broken_dma_warn = 0; if (of_device_is_compatible(np, "shasta-ata")) { pmif->kind = controller_sh_ata6; - d.port_ops = &pmac_ide_ata6_port_ops; + d.tp_ops = &pmac_ata6_tp_ops; + d.port_ops = &pmac_ide_ata4_port_ops; d.udma_mask = ATA_UDMA6; } else if (of_device_is_compatible(np, "kauai-ata")) { pmif->kind = controller_un_ata6; - d.port_ops = &pmac_ide_ata6_port_ops; + d.tp_ops = &pmac_ata6_tp_ops; + d.port_ops = &pmac_ide_ata4_port_ops; d.udma_mask = ATA_UDMA5; } else if (of_device_is_compatible(np, "K2-UATA")) { pmif->kind = controller_k2_ata6; - d.port_ops = &pmac_ide_ata6_port_ops; + d.tp_ops = &pmac_ata6_tp_ops; + d.port_ops = &pmac_ide_ata4_port_ops; d.udma_mask = ATA_UDMA5; } else if (of_device_is_compatible(np, "keylargo-ata")) { if (strcmp(np->name, "ata-4") == 0) { diff --git a/drivers/ide/q40ide.c b/drivers/ide/q40ide.c index 7fddfd34fcce..d007e7f66598 100644 --- a/drivers/ide/q40ide.c +++ b/drivers/ide/q40ide.c @@ -101,6 +101,7 @@ static const struct ide_tp_ops q40ide_tp_ops = { .read_altstatus = ide_read_altstatus, .write_devctl = ide_write_devctl, + .dev_select = ide_dev_select, .tf_load = ide_tf_load, .tf_read = ide_tf_read, diff --git a/drivers/ide/qd65xx.c b/drivers/ide/qd65xx.c index 08c4fa35e9b1..c9a134986891 100644 --- a/drivers/ide/qd65xx.c +++ b/drivers/ide/qd65xx.c @@ -90,13 +90,15 @@ static int timings[4]={-1,-1,-1,-1}; /* stores current timing for each timer */ * This routine is invoked to prepare for access to a given drive. */ -static void qd65xx_select(ide_drive_t *drive) +static void qd65xx_dev_select(ide_drive_t *drive) { u8 index = (( (QD_TIMREG(drive)) & 0x80 ) >> 7) | (QD_TIMREG(drive) & 0x02); if (timings[index] != QD_TIMING(drive)) outb(timings[index] = QD_TIMING(drive), QD_TIMREG(drive)); + + outb(drive->select | ATA_DEVICE_OBS, drive->hwif->io_ports.device_addr); } /* @@ -309,20 +311,33 @@ static void __init qd6580_init_dev(ide_drive_t *drive) drive->drive_data = (drive->dn & 1) ? t2 : t1; } +static const struct ide_tp_ops qd65xx_tp_ops = { + .exec_command = ide_exec_command, + .read_status = ide_read_status, + .read_altstatus = ide_read_altstatus, + .write_devctl = ide_write_devctl, + + .dev_select = qd65xx_dev_select, + .tf_load = ide_tf_load, + .tf_read = ide_tf_read, + + .input_data = ide_input_data, + .output_data = ide_output_data, +}; + static const struct ide_port_ops qd6500_port_ops = { .init_dev = qd6500_init_dev, .set_pio_mode = qd6500_set_pio_mode, - .selectproc = qd65xx_select, }; static const struct ide_port_ops qd6580_port_ops = { .init_dev = qd6580_init_dev, .set_pio_mode = qd6580_set_pio_mode, - .selectproc = qd65xx_select, }; static const struct ide_port_info qd65xx_port_info __initdata = { .name = DRV_NAME, + .tp_ops = &qd65xx_tp_ops, .chipset = ide_qd65xx, .host_flags = IDE_HFLAG_IO_32BIT | IDE_HFLAG_NO_DMA, diff --git a/drivers/ide/scc_pata.c b/drivers/ide/scc_pata.c index 97f8e0ef21b1..6d8dbd9c10bc 100644 --- a/drivers/ide/scc_pata.c +++ b/drivers/ide/scc_pata.c @@ -825,6 +825,7 @@ static const struct ide_tp_ops scc_tp_ops = { .read_altstatus = scc_read_altstatus, .write_devctl = scc_write_devctl, + .dev_select = ide_dev_select, .tf_load = scc_tf_load, .tf_read = scc_tf_read, diff --git a/drivers/ide/sgiioc4.c b/drivers/ide/sgiioc4.c index 58980fcafc3b..e5d2a48a84de 100644 --- a/drivers/ide/sgiioc4.c +++ b/drivers/ide/sgiioc4.c @@ -505,6 +505,7 @@ static const struct ide_tp_ops sgiioc4_tp_ops = { .read_altstatus = ide_read_altstatus, .write_devctl = ide_write_devctl, + .dev_select = ide_dev_select, .tf_load = ide_tf_load, .tf_read = ide_tf_read, diff --git a/drivers/ide/trm290.c b/drivers/ide/trm290.c index c0528f27fcae..4b42ca091534 100644 --- a/drivers/ide/trm290.c +++ b/drivers/ide/trm290.c @@ -171,9 +171,11 @@ static void trm290_prepare_drive (ide_drive_t *drive, unsigned int use_dma) local_irq_restore(flags); } -static void trm290_selectproc (ide_drive_t *drive) +static void trm290_dev_select(ide_drive_t *drive) { trm290_prepare_drive(drive, !!(drive->dev_flags & IDE_DFLAG_USING_DMA)); + + outb(drive->select | ATA_DEVICE_OBS, drive->hwif->io_ports.device_addr); } static int trm290_dma_check(ide_drive_t *drive, struct ide_cmd *cmd) @@ -298,8 +300,18 @@ static void __devinit init_hwif_trm290(ide_hwif_t *hwif) #endif } -static const struct ide_port_ops trm290_port_ops = { - .selectproc = trm290_selectproc, +static const struct ide_tp_ops trm290_tp_ops = { + .exec_command = ide_exec_command, + .read_status = ide_read_status, + .read_altstatus = ide_read_altstatus, + .write_devctl = ide_write_devctl, + + .dev_select = trm290_dev_select, + .tf_load = ide_tf_load, + .tf_read = ide_tf_read, + + .input_data = ide_input_data, + .output_data = ide_output_data, }; static struct ide_dma_ops trm290_dma_ops = { @@ -315,7 +327,7 @@ static struct ide_dma_ops trm290_dma_ops = { static const struct ide_port_info trm290_chipset __devinitdata = { .name = DRV_NAME, .init_hwif = init_hwif_trm290, - .port_ops = &trm290_port_ops, + .tp_ops = &trm290_tp_ops, .dma_ops = &trm290_dma_ops, .host_flags = IDE_HFLAG_TRM290 | IDE_HFLAG_NO_ATAPI_DMA | diff --git a/drivers/ide/tx4938ide.c b/drivers/ide/tx4938ide.c index be391b615963..4cb79c4c2604 100644 --- a/drivers/ide/tx4938ide.c +++ b/drivers/ide/tx4938ide.c @@ -189,6 +189,7 @@ static const struct ide_tp_ops tx4938ide_tp_ops = { .read_altstatus = ide_read_altstatus, .write_devctl = ide_write_devctl, + .dev_select = ide_dev_select, .tf_load = tx4938ide_tf_load, .tf_read = tx4938ide_tf_read, diff --git a/drivers/ide/tx4939ide.c b/drivers/ide/tx4939ide.c index 5a614d1c94f1..0040a9a3e26e 100644 --- a/drivers/ide/tx4939ide.c +++ b/drivers/ide/tx4939ide.c @@ -429,7 +429,7 @@ static void tx4939ide_tf_load_fixup(ide_drive_t *drive) * Fix ATA100 CORE System Control Register. (The write to the * Device/Head register may write wrong data to the System * Control Register) - * While Sys_Ctl is written here, selectproc is not needed. + * While Sys_Ctl is written here, dev_select() is not needed. */ tx4939ide_writew(sysctl, base, TX4939IDE_Sys_Ctl); } @@ -556,6 +556,7 @@ static const struct ide_tp_ops tx4939ide_tp_ops = { .read_altstatus = ide_read_altstatus, .write_devctl = ide_write_devctl, + .dev_select = ide_dev_select, .tf_load = tx4939ide_tf_load, .tf_read = tx4939ide_tf_read, @@ -579,6 +580,7 @@ static const struct ide_tp_ops tx4939ide_tp_ops = { .read_altstatus = ide_read_altstatus, .write_devctl = ide_write_devctl, + .dev_select = ide_dev_select, .tf_load = tx4939ide_tf_load, .tf_read = ide_tf_read, diff --git a/include/linux/ide.h b/include/linux/ide.h index e919c865f0c7..c69181c61fd8 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -603,7 +603,7 @@ struct ide_drive_s { unsigned int bios_cyl; /* BIOS/fdisk/LILO number of cyls */ unsigned int cyl; /* "real" number of cyls */ - unsigned int drive_data; /* used by set_pio_mode/selectproc */ + unsigned int drive_data; /* used by set_pio_mode/dev_select() */ unsigned int failures; /* current failure count */ unsigned int max_failures; /* maximum allowed failure count */ u64 probed_capacity;/* initial reported media capacity (ide-cd only currently) */ @@ -661,6 +661,7 @@ struct ide_tp_ops { u8 (*read_altstatus)(struct hwif_s *); void (*write_devctl)(struct hwif_s *, u8); + void (*dev_select)(ide_drive_t *); void (*tf_load)(ide_drive_t *, struct ide_cmd *); void (*tf_read)(ide_drive_t *, struct ide_cmd *); @@ -678,7 +679,6 @@ extern const struct ide_tp_ops default_tp_ops; * @init_dev: host specific initialization of a device * @set_pio_mode: routine to program host for PIO mode * @set_dma_mode: routine to program host for DMA mode - * @selectproc: tweaks hardware to select drive * @reset_poll: chipset polling based on hba specifics * @pre_reset: chipset specific changes to default for device-hba resets * @resetproc: routine to reset controller after a disk reset @@ -695,7 +695,6 @@ struct ide_port_ops { void (*init_dev)(ide_drive_t *); void (*set_pio_mode)(ide_drive_t *, const u8); void (*set_dma_mode)(ide_drive_t *, const u8); - void (*selectproc)(ide_drive_t *); int (*reset_poll)(ide_drive_t *); void (*pre_reset)(ide_drive_t *); void (*resetproc)(ide_drive_t *); @@ -1170,6 +1169,7 @@ u8 ide_read_status(ide_hwif_t *); u8 ide_read_altstatus(ide_hwif_t *); void ide_write_devctl(ide_hwif_t *, u8); +void ide_dev_select(ide_drive_t *); void ide_tf_load(ide_drive_t *, struct ide_cmd *); void ide_tf_read(ide_drive_t *, struct ide_cmd *); From fdd88f0af616db59a6a36bdf0185181d2b779f53 Mon Sep 17 00:00:00 2001 From: Sergei Shtylyov Date: Tue, 31 Mar 2009 20:15:33 +0200 Subject: [PATCH 286/478] ide: inline SELECT_DRIVE() Since SELECT_DRIVE() has boiled down to a mere dev_select() method call, it now makes sense to just inline it... Signed-off-by: Sergei Shtylyov Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/ide-eh.c | 7 ++++--- drivers/ide/ide-io.c | 2 +- drivers/ide/ide-iops.c | 7 +------ drivers/ide/ide-pm.c | 5 +++-- drivers/ide/ide-probe.c | 15 ++++++++------- drivers/ide/ns87415.c | 2 +- include/linux/ide.h | 1 - 7 files changed, 18 insertions(+), 21 deletions(-) diff --git a/drivers/ide/ide-eh.c b/drivers/ide/ide-eh.c index de4b7f1c9c9f..5d5fb961b5ce 100644 --- a/drivers/ide/ide-eh.c +++ b/drivers/ide/ide-eh.c @@ -165,11 +165,12 @@ static ide_startstop_t do_reset1(ide_drive_t *, int); static ide_startstop_t atapi_reset_pollfunc(ide_drive_t *drive) { ide_hwif_t *hwif = drive->hwif; + const struct ide_tp_ops *tp_ops = hwif->tp_ops; u8 stat; - SELECT_DRIVE(drive); + tp_ops->dev_select(drive); udelay(10); - stat = hwif->tp_ops->read_status(hwif); + stat = tp_ops->read_status(hwif); if (OK_STAT(stat, 0, ATA_BUSY)) printk(KERN_INFO "%s: ATAPI reset complete\n", drive->name); @@ -348,7 +349,7 @@ static ide_startstop_t do_reset1(ide_drive_t *drive, int do_not_try_atapi) /* For an ATAPI device, first try an ATAPI SRST. */ if (drive->media != ide_disk && !do_not_try_atapi) { pre_reset(drive); - SELECT_DRIVE(drive); + tp_ops->dev_select(drive); udelay(20); tp_ops->exec_command(hwif, ATA_CMD_DEV_RESET); ndelay(400); diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c index 5589dce88674..1deb6d29b186 100644 --- a/drivers/ide/ide-io.c +++ b/drivers/ide/ide-io.c @@ -348,7 +348,7 @@ static ide_startstop_t start_request (ide_drive_t *drive, struct request *rq) if (blk_pm_request(rq)) ide_check_pm_state(drive, rq); - SELECT_DRIVE(drive); + drive->hwif->tp_ops->dev_select(drive); if (ide_wait_stat(&startstop, drive, drive->ready_stat, ATA_BUSY | ATA_DRQ, WAIT_READY)) { printk(KERN_ERR "%s: drive not ready for command\n", drive->name); diff --git a/drivers/ide/ide-iops.c b/drivers/ide/ide-iops.c index dfb0ec317fa3..27bb70ddd459 100644 --- a/drivers/ide/ide-iops.c +++ b/drivers/ide/ide-iops.c @@ -27,11 +27,6 @@ #include #include -void SELECT_DRIVE(ide_drive_t *drive) -{ - drive->hwif->tp_ops->dev_select(drive); -} - void SELECT_MASK(ide_drive_t *drive, int mask) { const struct ide_port_ops *port_ops = drive->hwif->port_ops; @@ -347,7 +342,7 @@ int ide_config_drive_speed(ide_drive_t *drive, u8 speed) disable_irq_nosync(hwif->irq); udelay(1); - SELECT_DRIVE(drive); + tp_ops->dev_select(drive); SELECT_MASK(drive, 1); udelay(1); tp_ops->write_devctl(hwif, ATA_NIEN | ATA_DEVCTL_OBS); diff --git a/drivers/ide/ide-pm.c b/drivers/ide/ide-pm.c index 20553d4c42a2..bb7858ebb7d1 100644 --- a/drivers/ide/ide-pm.c +++ b/drivers/ide/ide-pm.c @@ -223,6 +223,7 @@ void ide_check_pm_state(ide_drive_t *drive, struct request *rq) * point. */ ide_hwif_t *hwif = drive->hwif; + const struct ide_tp_ops *tp_ops = hwif->tp_ops; struct request_queue *q = drive->queue; unsigned long flags; int rc; @@ -232,8 +233,8 @@ void ide_check_pm_state(ide_drive_t *drive, struct request *rq) rc = ide_wait_not_busy(hwif, 35000); if (rc) printk(KERN_WARNING "%s: bus not ready on wakeup\n", drive->name); - SELECT_DRIVE(drive); - hwif->tp_ops->write_devctl(hwif, ATA_DEVCTL_OBS); + tp_ops->dev_select(drive); + tp_ops->write_devctl(hwif, ATA_DEVCTL_OBS); rc = ide_wait_not_busy(hwif, 100000); if (rc) printk(KERN_WARNING "%s: drive not ready on wakeup\n", drive->name); diff --git a/drivers/ide/ide-probe.c b/drivers/ide/ide-probe.c index d240f76b0da6..d8c1c3e735bb 100644 --- a/drivers/ide/ide-probe.c +++ b/drivers/ide/ide-probe.c @@ -390,13 +390,13 @@ static int do_probe (ide_drive_t *drive, u8 cmd) * (e.g. crw9624 as drive0 with disk as slave) */ msleep(50); - SELECT_DRIVE(drive); + tp_ops->dev_select(drive); msleep(50); if (ide_read_device(drive) != drive->select && present == 0) { if (drive->dn & 1) { /* exit with drive0 selected */ - SELECT_DRIVE(hwif->devices[0]); + tp_ops->dev_select(hwif->devices[0]); /* allow ATA_BUSY to assert & clear */ msleep(50); } @@ -422,7 +422,7 @@ static int do_probe (ide_drive_t *drive, u8 cmd) printk(KERN_ERR "%s: no response (status = 0x%02x), " "resetting drive\n", drive->name, stat); msleep(50); - SELECT_DRIVE(drive); + tp_ops->dev_select(drive); msleep(50); tp_ops->exec_command(hwif, ATA_CMD_DEV_RESET); (void)ide_busy_sleep(hwif, WAIT_WORSTCASE, 0); @@ -441,7 +441,7 @@ static int do_probe (ide_drive_t *drive, u8 cmd) } if (drive->dn & 1) { /* exit with drive0 selected */ - SELECT_DRIVE(hwif->devices[0]); + tp_ops->dev_select(hwif->devices[0]); msleep(50); /* ensure drive irq is clear */ (void)tp_ops->read_status(hwif); @@ -605,6 +605,7 @@ out: static int ide_port_wait_ready(ide_hwif_t *hwif) { + const struct ide_tp_ops *tp_ops = hwif->tp_ops; ide_drive_t *drive; int i, rc; @@ -627,8 +628,8 @@ static int ide_port_wait_ready(ide_hwif_t *hwif) /* Ignore disks that we will not probe for later. */ if ((drive->dev_flags & IDE_DFLAG_NOPROBE) == 0 || (drive->dev_flags & IDE_DFLAG_PRESENT)) { - SELECT_DRIVE(drive); - hwif->tp_ops->write_devctl(hwif, ATA_DEVCTL_OBS); + tp_ops->dev_select(drive); + tp_ops->write_devctl(hwif, ATA_DEVCTL_OBS); mdelay(2); rc = ide_wait_not_busy(hwif, 35000); if (rc) @@ -640,7 +641,7 @@ static int ide_port_wait_ready(ide_hwif_t *hwif) out: /* Exit function with master reselected (let's be sane) */ if (i) - SELECT_DRIVE(hwif->devices[0]); + tp_ops->dev_select(hwif->devices[0]); return rc; } diff --git a/drivers/ide/ns87415.c b/drivers/ide/ns87415.c index af1b421eb450..71a39fb3856f 100644 --- a/drivers/ide/ns87415.c +++ b/drivers/ide/ns87415.c @@ -262,7 +262,7 @@ static void __devinit init_hwif_ns87415 (ide_hwif_t *hwif) #ifdef __sparc_v9__ /* * XXX: Reset the device, if we don't it will not respond to - * SELECT_DRIVE() properly during first ide_probe_port(). + * dev_select() properly during first ide_probe_port(). */ timeout = 10000; outb(12, hwif->io_ports.ctl_addr); diff --git a/include/linux/ide.h b/include/linux/ide.h index c69181c61fd8..a5d26f66ef78 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -1176,7 +1176,6 @@ void ide_tf_read(ide_drive_t *, struct ide_cmd *); void ide_input_data(ide_drive_t *, struct ide_cmd *, void *, unsigned int); void ide_output_data(ide_drive_t *, struct ide_cmd *, void *, unsigned int); -extern void SELECT_DRIVE(ide_drive_t *); void SELECT_MASK(ide_drive_t *, int); u8 ide_read_error(ide_drive_t *); From a9d5a97fa3828e7cbc577805eba3d0a0d35dd5a0 Mon Sep 17 00:00:00 2001 From: TOMARI Hisanobu Date: Tue, 31 Mar 2009 20:15:34 +0200 Subject: [PATCH 287/478] ide-pmac: IDE cable detection on Apple PowerBook As IDE cable used on Apple PowerBook/iBook laptops are always of "Short 40" type when the firmware says it's 80 conductor one, the cable detection should return ATA_CBL_PATA40_SHORT on those machines. This enables to automatically use UDMA5 even with drives that doesn't correctly detect those cables on Apple laptops. Signed-off-by: TOMARI Hisanobu Cc: Sergei Shtylyov Cc: benh@kernel.crashing.org [bart: beautify patch description] Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/pmac.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/drivers/ide/pmac.c b/drivers/ide/pmac.c index 24ce1f805cd7..052b9bf1f8fb 100644 --- a/drivers/ide/pmac.c +++ b/drivers/ide/pmac.c @@ -919,10 +919,18 @@ static u8 pmac_ide_cable_detect(ide_hwif_t *hwif) (pmac_ide_hwif_t *)dev_get_drvdata(hwif->gendev.parent); struct device_node *np = pmif->node; const char *cable = of_get_property(np, "cable-type", NULL); + struct device_node *root = of_find_node_by_path("/"); + const char *model = of_get_property(root, "model", NULL); /* Get cable type from device-tree. */ - if (cable && !strncmp(cable, "80-", 3)) - return ATA_CBL_PATA80; + if (cable && !strncmp(cable, "80-", 3)) { + /* Some drives fail to detect 80c cable in PowerBook */ + /* These machine use proprietary short IDE cable anyway */ + if (!strncmp(model, "PowerBook", 9)) + return ATA_CBL_PATA40_SHORT; + else + return ATA_CBL_PATA80; + } /* * G5's seem to have incorrect cable type in device-tree. From d80c592c38378c88c568b96963f7a98d927d05fa Mon Sep 17 00:00:00 2001 From: Gilles Espinasse Date: Tue, 31 Mar 2009 20:15:34 +0200 Subject: [PATCH 288/478] ide: be able to build pmac driver without IDE built-in No reason to need IDE built-in to be able to compile pmac driver. Tested to work on 2.6.29-rc8 and 2.6.28.8 with ide and pmac as modules inside an initramfs. Signed-off-by: Gilles Espinasse Cc: sam@ravnborg.org Cc: benh@kernel.crashing.org [bart: remove now superfluous IDE check] Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/ide/Kconfig b/drivers/ide/Kconfig index 4d2f2a6b5bdb..cf06494bb744 100644 --- a/drivers/ide/Kconfig +++ b/drivers/ide/Kconfig @@ -681,7 +681,7 @@ endif # TODO: BLK_DEV_IDEDMA_PCI -> BLK_DEV_IDEDMA_SFF config BLK_DEV_IDE_PMAC tristate "PowerMac on-board IDE support" - depends on PPC_PMAC && IDE=y + depends on PPC_PMAC select IDE_TIMINGS select BLK_DEV_IDEDMA_PCI help From 5b6c942dd1f13835eff8105ec2aa859544a1498d Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 31 Mar 2009 20:15:34 +0200 Subject: [PATCH 289/478] ide-floppy: do not complete rq's prematurely ... and access them afterwards. Simplify rq completing code while at it. Spotted-by: Tejun Heo Signed-off-by: Borislav Petkov Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/ide-atapi.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c index 100e6f94b4f0..3e43b889dd64 100644 --- a/drivers/ide/ide-atapi.c +++ b/drivers/ide/ide-atapi.c @@ -310,16 +310,14 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive) pc->xferred = pc->req_xfer; if (drive->pc_update_buffers) drive->pc_update_buffers(drive, pc); - - if (drive->media == ide_floppy) - ide_complete_rq(drive, 0, blk_rq_bytes(rq)); } debug_log("%s: DMA finished\n", drive->name); } /* No more interrupts */ if ((stat & ATA_DRQ) == 0) { - int uptodate; + int uptodate, error; + unsigned int done; debug_log("Packet command completed, %d bytes transferred\n", pc->xferred); @@ -366,9 +364,9 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive) if (blk_special_request(rq)) { rq->errors = 0; - ide_complete_rq(drive, 0, blk_rq_bytes(rq)); + done = blk_rq_bytes(rq); + error = 0; } else { - unsigned int done; if (blk_fs_request(rq) == 0 && uptodate <= 0) { if (rq->errors == 0) @@ -380,9 +378,10 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive) else done = blk_rq_bytes(rq); - ide_complete_rq(drive, uptodate ? 0 : -EIO, done); + error = uptodate ? 0 : -EIO; } + ide_complete_rq(drive, error, done); return ide_stopped; } From 5a3f23d515a2ebf0c750db80579ca57b28cbce6d Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Tue, 31 Mar 2009 13:27:11 -0400 Subject: [PATCH 290/478] Btrfs: add extra flushing for renames and truncates Renames and truncates are both common ways to replace old data with new data. The filesystem can make an effort to make sure the new data is on disk before actually replacing the old data. This is especially important for rename, which many application use as though it were atomic for both the data and the metadata involved. The current btrfs code will happily replace a file that is fully on disk with one that was just created and still has pending IO. If we crash after transaction commit but before the IO is done, we'll end up replacing a good file with a zero length file. The solution used here is to create a list of inodes that need special ordering and force them to disk before the commit is done. This is similar to the ext3 style data=ordering, except it is only done on selected files. Btrfs is able to get away with this because it does not wait on commits very often, even for fsync (which use a sub-commit). For renames, we order the file when it wasn't already on disk and when it is replacing an existing file. Larger files are sent to filemap_flush right away (before the transaction handle is opened). For truncates, we order if the file goes from non-zero size down to zero size. This is a little different, because at the time of the truncate the file has no dirty bytes to order. But, we flag the inode so that it is added to the ordered list on close (via release method). We also immediately add it to the ordered list of the current transaction so that we can try to flush down any writes the application sneaks in before commit. Signed-off-by: Chris Mason --- fs/btrfs/btrfs_inode.h | 18 ++++++ fs/btrfs/ctree.h | 35 ++++++++++++ fs/btrfs/disk-io.c | 2 + fs/btrfs/file.c | 26 +++++++++ fs/btrfs/inode.c | 81 ++++++++++++++++++++++++--- fs/btrfs/ordered-data.c | 118 ++++++++++++++++++++++++++++++++++++++++ fs/btrfs/ordered-data.h | 4 ++ fs/btrfs/transaction.c | 11 ++++ 8 files changed, 288 insertions(+), 7 deletions(-) diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 3af4cfb5654c..b30986f00b9d 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -66,6 +66,12 @@ struct btrfs_inode { */ struct list_head delalloc_inodes; + /* + * list for tracking inodes that must be sent to disk before a + * rename or truncate commit + */ + struct list_head ordered_operations; + /* the space_info for where this inode's data allocations are done */ struct btrfs_space_info *space_info; @@ -122,6 +128,18 @@ struct btrfs_inode { */ u64 last_unlink_trans; + /* + * ordered_data_close is set by truncate when a file that used + * to have good data has been truncated to zero. When it is set + * the btrfs file release call will add this inode to the + * ordered operations list so that we make sure to flush out any + * new data the application may have written before commit. + * + * yes, its silly to have a single bitflag, but we might grow more + * of these. + */ + unsigned ordered_data_close:1; + struct inode vfs_inode; }; diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 2737facbd341..f48905ee5240 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -45,6 +45,13 @@ struct btrfs_ordered_sum; #define BTRFS_MAX_LEVEL 8 +/* + * files bigger than this get some pre-flushing when they are added + * to the ordered operations list. That way we limit the total + * work done by the commit + */ +#define BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT (8 * 1024 * 1024) + /* holds pointers to all of the tree roots */ #define BTRFS_ROOT_TREE_OBJECTID 1ULL @@ -727,6 +734,15 @@ struct btrfs_fs_info { struct mutex volume_mutex; struct mutex tree_reloc_mutex; + /* + * this protects the ordered operations list only while we are + * processing all of the entries on it. This way we make + * sure the commit code doesn't find the list temporarily empty + * because another function happens to be doing non-waiting preflush + * before jumping into the main commit. + */ + struct mutex ordered_operations_mutex; + struct list_head trans_list; struct list_head hashers; struct list_head dead_roots; @@ -741,9 +757,28 @@ struct btrfs_fs_info { * ordered extents */ spinlock_t ordered_extent_lock; + + /* + * all of the data=ordered extents pending writeback + * these can span multiple transactions and basically include + * every dirty data page that isn't from nodatacow + */ struct list_head ordered_extents; + + /* + * all of the inodes that have delalloc bytes. It is possible for + * this list to be empty even when there is still dirty data=ordered + * extents waiting to finish IO. + */ struct list_head delalloc_inodes; + /* + * special rename and truncate targets that must be on disk before + * we're allowed to commit. This is basically the ext3 style + * data=ordered list. + */ + struct list_head ordered_operations; + /* * there is a pool of worker threads for checksumming during writes * and a pool for checksumming after reads. This is because readers diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 9244cd7313d4..1747dfd18654 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1572,6 +1572,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, INIT_LIST_HEAD(&fs_info->dead_roots); INIT_LIST_HEAD(&fs_info->hashers); INIT_LIST_HEAD(&fs_info->delalloc_inodes); + INIT_LIST_HEAD(&fs_info->ordered_operations); spin_lock_init(&fs_info->delalloc_lock); spin_lock_init(&fs_info->new_trans_lock); spin_lock_init(&fs_info->ref_cache_lock); @@ -1643,6 +1644,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, insert_inode_hash(fs_info->btree_inode); mutex_init(&fs_info->trans_mutex); + mutex_init(&fs_info->ordered_operations_mutex); mutex_init(&fs_info->tree_log_mutex); mutex_init(&fs_info->drop_mutex); mutex_init(&fs_info->pinned_mutex); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 32d10a617613..9c9fb46ccd08 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1161,6 +1161,20 @@ out_nolock: page_cache_release(pinned[1]); *ppos = pos; + /* + * we want to make sure fsync finds this change + * but we haven't joined a transaction running right now. + * + * Later on, someone is sure to update the inode and get the + * real transid recorded. + * + * We set last_trans now to the fs_info generation + 1, + * this will either be one more than the running transaction + * or the generation used for the next transaction if there isn't + * one running right now. + */ + BTRFS_I(inode)->last_trans = root->fs_info->generation + 1; + if (num_written > 0 && will_write) { struct btrfs_trans_handle *trans; @@ -1194,6 +1208,18 @@ out_nolock: int btrfs_release_file(struct inode *inode, struct file *filp) { + /* + * ordered_data_close is set by settattr when we are about to truncate + * a file from a non-zero size to a zero size. This tries to + * flush down new bytes that may have been written if the + * application were using truncate to replace a file in place. + */ + if (BTRFS_I(inode)->ordered_data_close) { + BTRFS_I(inode)->ordered_data_close = 0; + btrfs_add_ordered_operation(NULL, BTRFS_I(inode)->root, inode); + if (inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT) + filemap_flush(inode->i_mapping); + } if (filp->private_data) btrfs_ioctl_trans_end(filp); return 0; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index bffd79faffb5..1cff528d5b51 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2907,11 +2907,21 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr) if (err) return err; - if (S_ISREG(inode->i_mode) && - attr->ia_valid & ATTR_SIZE && attr->ia_size > inode->i_size) { - err = btrfs_cont_expand(inode, attr->ia_size); - if (err) - return err; + if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) { + if (attr->ia_size > inode->i_size) { + err = btrfs_cont_expand(inode, attr->ia_size); + if (err) + return err; + } else if (inode->i_size > 0 && + attr->ia_size == 0) { + + /* we're truncating a file that used to have good + * data down to zero. Make sure it gets into + * the ordered flush list so that any new writes + * get down to disk quickly. + */ + BTRFS_I(inode)->ordered_data_close = 1; + } } err = inode_setattr(inode, attr); @@ -3050,6 +3060,7 @@ static noinline void init_btrfs_i(struct inode *inode) extent_io_tree_init(&BTRFS_I(inode)->io_failure_tree, inode->i_mapping, GFP_NOFS); INIT_LIST_HEAD(&BTRFS_I(inode)->delalloc_inodes); + INIT_LIST_HEAD(&BTRFS_I(inode)->ordered_operations); btrfs_ordered_inode_tree_init(&BTRFS_I(inode)->ordered_tree); mutex_init(&BTRFS_I(inode)->extent_mutex); mutex_init(&BTRFS_I(inode)->log_mutex); @@ -4419,6 +4430,8 @@ again: } ClearPageChecked(page); set_page_dirty(page); + + BTRFS_I(inode)->last_trans = root->fs_info->generation + 1; unlock_extent(io_tree, page_start, page_end, GFP_NOFS); out_unlock: @@ -4444,6 +4457,27 @@ static void btrfs_truncate(struct inode *inode) btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1); trans = btrfs_start_transaction(root, 1); + + /* + * setattr is responsible for setting the ordered_data_close flag, + * but that is only tested during the last file release. That + * could happen well after the next commit, leaving a great big + * window where new writes may get lost if someone chooses to write + * to this file after truncating to zero + * + * The inode doesn't have any dirty data here, and so if we commit + * this is a noop. If someone immediately starts writing to the inode + * it is very likely we'll catch some of their writes in this + * transaction, and the commit will find this file on the ordered + * data list with good things to send down. + * + * This is a best effort solution, there is still a window where + * using truncate to replace the contents of the file will + * end up with a zero length file after a crash. + */ + if (inode->i_size == 0 && BTRFS_I(inode)->ordered_data_close) + btrfs_add_ordered_operation(trans, root, inode); + btrfs_set_trans_block_group(trans, inode); btrfs_i_size_write(inode, inode->i_size); @@ -4520,12 +4554,15 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) ei->i_acl = BTRFS_ACL_NOT_CACHED; ei->i_default_acl = BTRFS_ACL_NOT_CACHED; INIT_LIST_HEAD(&ei->i_orphan); + INIT_LIST_HEAD(&ei->ordered_operations); return &ei->vfs_inode; } void btrfs_destroy_inode(struct inode *inode) { struct btrfs_ordered_extent *ordered; + struct btrfs_root *root = BTRFS_I(inode)->root; + WARN_ON(!list_empty(&inode->i_dentry)); WARN_ON(inode->i_data.nrpages); @@ -4536,13 +4573,24 @@ void btrfs_destroy_inode(struct inode *inode) BTRFS_I(inode)->i_default_acl != BTRFS_ACL_NOT_CACHED) posix_acl_release(BTRFS_I(inode)->i_default_acl); - spin_lock(&BTRFS_I(inode)->root->list_lock); + /* + * Make sure we're properly removed from the ordered operation + * lists. + */ + smp_mb(); + if (!list_empty(&BTRFS_I(inode)->ordered_operations)) { + spin_lock(&root->fs_info->ordered_extent_lock); + list_del_init(&BTRFS_I(inode)->ordered_operations); + spin_unlock(&root->fs_info->ordered_extent_lock); + } + + spin_lock(&root->list_lock); if (!list_empty(&BTRFS_I(inode)->i_orphan)) { printk(KERN_ERR "BTRFS: inode %lu: inode still on the orphan" " list\n", inode->i_ino); dump_stack(); } - spin_unlock(&BTRFS_I(inode)->root->list_lock); + spin_unlock(&root->list_lock); while (1) { ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1); @@ -4667,8 +4715,27 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, if (ret) goto out_unlock; + /* + * we're using rename to replace one file with another. + * and the replacement file is large. Start IO on it now so + * we don't add too much work to the end of the transaction + */ + if (new_inode && old_inode && S_ISREG(old_inode->i_mode) && + new_inode->i_size && + old_inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT) + filemap_flush(old_inode->i_mapping); + trans = btrfs_start_transaction(root, 1); + /* + * make sure the inode gets flushed if it is replacing + * something. + */ + if (new_inode && new_inode->i_size && + old_inode && S_ISREG(old_inode->i_mode)) { + btrfs_add_ordered_operation(trans, root, old_inode); + } + /* * this is an ugly little race, but the rename is required to make * sure that if we crash, the inode is either at the old name diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 77c2411a5f0f..53c87b197d70 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -310,6 +310,16 @@ int btrfs_remove_ordered_extent(struct inode *inode, spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); list_del_init(&entry->root_extent_list); + + /* + * we have no more ordered extents for this inode and + * no dirty pages. We can safely remove it from the + * list of ordered extents + */ + if (RB_EMPTY_ROOT(&tree->tree) && + !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY)) { + list_del_init(&BTRFS_I(inode)->ordered_operations); + } spin_unlock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); mutex_unlock(&tree->mutex); @@ -369,6 +379,68 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only) return 0; } +/* + * this is used during transaction commit to write all the inodes + * added to the ordered operation list. These files must be fully on + * disk before the transaction commits. + * + * we have two modes here, one is to just start the IO via filemap_flush + * and the other is to wait for all the io. When we wait, we have an + * extra check to make sure the ordered operation list really is empty + * before we return + */ +int btrfs_run_ordered_operations(struct btrfs_root *root, int wait) +{ + struct btrfs_inode *btrfs_inode; + struct inode *inode; + struct list_head splice; + + INIT_LIST_HEAD(&splice); + + mutex_lock(&root->fs_info->ordered_operations_mutex); + spin_lock(&root->fs_info->ordered_extent_lock); +again: + list_splice_init(&root->fs_info->ordered_operations, &splice); + + while (!list_empty(&splice)) { + btrfs_inode = list_entry(splice.next, struct btrfs_inode, + ordered_operations); + + inode = &btrfs_inode->vfs_inode; + + list_del_init(&btrfs_inode->ordered_operations); + + /* + * the inode may be getting freed (in sys_unlink path). + */ + inode = igrab(inode); + + if (!wait && inode) { + list_add_tail(&BTRFS_I(inode)->ordered_operations, + &root->fs_info->ordered_operations); + } + spin_unlock(&root->fs_info->ordered_extent_lock); + + if (inode) { + if (wait) + btrfs_wait_ordered_range(inode, 0, (u64)-1); + else + filemap_flush(inode->i_mapping); + iput(inode); + } + + cond_resched(); + spin_lock(&root->fs_info->ordered_extent_lock); + } + if (wait && !list_empty(&root->fs_info->ordered_operations)) + goto again; + + spin_unlock(&root->fs_info->ordered_extent_lock); + mutex_unlock(&root->fs_info->ordered_operations_mutex); + + return 0; +} + /* * Used to start IO or wait for a given ordered extent to finish. * @@ -726,3 +798,49 @@ int btrfs_wait_on_page_writeback_range(struct address_space *mapping, return ret; } + +/* + * add a given inode to the list of inodes that must be fully on + * disk before a transaction commit finishes. + * + * This basically gives us the ext3 style data=ordered mode, and it is mostly + * used to make sure renamed files are fully on disk. + * + * It is a noop if the inode is already fully on disk. + * + * If trans is not null, we'll do a friendly check for a transaction that + * is already flushing things and force the IO down ourselves. + */ +int btrfs_add_ordered_operation(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *inode) +{ + u64 last_mod; + + last_mod = max(BTRFS_I(inode)->generation, BTRFS_I(inode)->last_trans); + + /* + * if this file hasn't been changed since the last transaction + * commit, we can safely return without doing anything + */ + if (last_mod < root->fs_info->last_trans_committed) + return 0; + + /* + * the transaction is already committing. Just start the IO and + * don't bother with all of this list nonsense + */ + if (trans && root->fs_info->running_transaction->blocked) { + btrfs_wait_ordered_range(inode, 0, (u64)-1); + return 0; + } + + spin_lock(&root->fs_info->ordered_extent_lock); + if (list_empty(&BTRFS_I(inode)->ordered_operations)) { + list_add_tail(&BTRFS_I(inode)->ordered_operations, + &root->fs_info->ordered_operations); + } + spin_unlock(&root->fs_info->ordered_extent_lock); + + return 0; +} diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index ab66d5e8d6d6..3d31c8827b01 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -155,4 +155,8 @@ int btrfs_wait_on_page_writeback_range(struct address_space *mapping, int btrfs_fdatawrite_range(struct address_space *mapping, loff_t start, loff_t end, int sync_mode); int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only); +int btrfs_run_ordered_operations(struct btrfs_root *root, int wait); +int btrfs_add_ordered_operation(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *inode); #endif diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 9c8f158dd2db..664782c6a2df 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -975,6 +975,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, int should_grow = 0; unsigned long now = get_seconds(); + btrfs_run_ordered_operations(root, 0); + /* make a pass through all the delayed refs we have so far * any runnings procs may add more while we are here */ @@ -1056,6 +1058,15 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, BUG_ON(ret); } + /* + * rename don't use btrfs_join_transaction, so, once we + * set the transaction to blocked above, we aren't going + * to get any new ordered operations. We can safely run + * it here and no for sure that nothing new will be added + * to the list + */ + btrfs_run_ordered_operations(root, 1); + smp_mb(); if (cur_trans->num_writers > 1 || should_grow) schedule_timeout(timeout); From d57e62b89796f751c9422801cbcd407a9f8dcdc4 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Tue, 31 Mar 2009 13:47:50 -0400 Subject: [PATCH 291/478] Btrfs: try to free metadata pages when we free btree blocks COW means we cycle though blocks fairly quickly, and once we free an extent on disk, it doesn't make much sense to keep the pages around. This commit tries to immediately free the page when we free the extent, which lowers our memory footprint significantly. Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 0c482e0d7c43..f5e7cae63d80 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2384,6 +2384,10 @@ static int __free_extent(struct btrfs_trans_handle *trans, if (owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) { ret = btrfs_del_csums(trans, root, bytenr, num_bytes); BUG_ON(ret); + } else { + invalidate_mapping_pages(info->btree_inode->i_mapping, + bytenr >> PAGE_CACHE_SHIFT, + (bytenr + num_bytes - 1) >> PAGE_CACHE_SHIFT); } ret = update_block_group(trans, root, bytenr, num_bytes, 0, From ae149b6bec64a09373ba20fce75f8aa6b14b78fd Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 31 Mar 2009 15:19:15 -0700 Subject: [PATCH 292/478] proc tty: add struct tty_operations::proc_fops Used for gradual switch of TTY drivers from using ->read_proc which helps with gradual switch from ->read_proc for the whole tree. As side effect, fix possible race condition when ->data initialized after PDE is hooked into proc tree. ->proc_fops takes precedence over ->read_proc. Signed-off-by: Alexey Dobriyan Cc: Alan Cox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/proc_tty.c | 20 +++++++++++++------- include/linux/tty_driver.h | 1 + 2 files changed, 14 insertions(+), 7 deletions(-) diff --git a/fs/proc/proc_tty.c b/fs/proc/proc_tty.c index 4a9e0f65ae60..854827b1d463 100644 --- a/fs/proc/proc_tty.c +++ b/fs/proc/proc_tty.c @@ -144,16 +144,22 @@ void proc_tty_register_driver(struct tty_driver *driver) { struct proc_dir_entry *ent; - if (!driver->ops->read_proc || !driver->driver_name || - driver->proc_entry) + if (!driver->driver_name || driver->proc_entry) return; - ent = create_proc_entry(driver->driver_name, 0, proc_tty_driver); - if (!ent) + if (driver->ops->proc_fops) { + ent = proc_create_data(driver->driver_name, 0, proc_tty_driver, + driver->ops->proc_fops, driver); + if (!ent) + return; + } else if (driver->ops->read_proc) { + ent = create_proc_entry(driver->driver_name, 0, proc_tty_driver); + if (!ent) + return; + ent->read_proc = driver->ops->read_proc; + ent->data = driver; + } else return; - ent->read_proc = driver->ops->read_proc; - ent->data = driver; - driver->proc_entry = ent; } diff --git a/include/linux/tty_driver.h b/include/linux/tty_driver.h index 08e088334dba..c9a69575ded6 100644 --- a/include/linux/tty_driver.h +++ b/include/linux/tty_driver.h @@ -264,6 +264,7 @@ struct tty_operations { int (*poll_get_char)(struct tty_driver *driver, int line); void (*poll_put_char)(struct tty_driver *driver, int line, char ch); #endif + const struct file_operations *proc_fops; }; struct tty_driver { From 444697d61b6d5ae43b317d259db7c362c9d3756a Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 31 Mar 2009 15:19:15 -0700 Subject: [PATCH 293/478] proc tty: switch cyclades to ->proc_fops Signed-off-by: Alexey Dobriyan Acked-by: Alan Cox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/char/cyclades.c | 54 +++++++++++++++-------------------------- 1 file changed, 20 insertions(+), 34 deletions(-) diff --git a/drivers/char/cyclades.c b/drivers/char/cyclades.c index 6a59f72a9c21..272db0e2b491 100644 --- a/drivers/char/cyclades.c +++ b/drivers/char/cyclades.c @@ -657,6 +657,7 @@ #include #include +#include static void cy_throttle(struct tty_struct *tty); static void cy_send_xchar(struct tty_struct *tty, char ch); @@ -868,8 +869,6 @@ static int cyz_issue_cmd(struct cyclades_card *, __u32, __u8, __u32); static unsigned detect_isa_irq(void __iomem *); #endif /* CONFIG_ISA */ -static int cyclades_get_proc_info(char *, char **, off_t, int, int *, void *); - #ifndef CONFIG_CYZ_INTR static void cyz_poll(unsigned long); @@ -5216,31 +5215,22 @@ static struct pci_driver cy_pci_driver = { }; #endif -static int -cyclades_get_proc_info(char *buf, char **start, off_t offset, int length, - int *eof, void *data) +static int cyclades_proc_show(struct seq_file *m, void *v) { struct cyclades_port *info; unsigned int i, j; - int len = 0; - off_t begin = 0; - off_t pos = 0; - int size; __u32 cur_jifs = jiffies; - size = sprintf(buf, "Dev TimeOpen BytesOut IdleOut BytesIn " + seq_puts(m, "Dev TimeOpen BytesOut IdleOut BytesIn " "IdleIn Overruns Ldisc\n"); - pos += size; - len += size; - /* Output one line for each known port */ for (i = 0; i < NR_CARDS; i++) for (j = 0; j < cy_card[i].nports; j++) { info = &cy_card[i].ports[j]; if (info->port.count) - size = sprintf(buf + len, "%3d %8lu %10lu %8lu " + seq_printf(m, "%3d %8lu %10lu %8lu " "%10lu %8lu %9lu %6ld\n", info->line, (cur_jifs - info->idle_stats.in_use) / HZ, info->idle_stats.xmit_bytes, @@ -5251,30 +5241,26 @@ cyclades_get_proc_info(char *buf, char **start, off_t offset, int length, /* FIXME: double check locking */ (long)info->port.tty->ldisc.ops->num); else - size = sprintf(buf + len, "%3d %8lu %10lu %8lu " + seq_printf(m, "%3d %8lu %10lu %8lu " "%10lu %8lu %9lu %6ld\n", info->line, 0L, 0L, 0L, 0L, 0L, 0L, 0L); - len += size; - pos = begin + len; - - if (pos < offset) { - len = 0; - begin = pos; - } - if (pos > offset + length) - goto done; } - *eof = 1; -done: - *start = buf + (offset - begin); /* Start of wanted data */ - len -= (offset - begin); /* Start slop */ - if (len > length) - len = length; /* Ending slop */ - if (len < 0) - len = 0; - return len; + return 0; } +static int cyclades_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, cyclades_proc_show, NULL); +} + +static const struct file_operations cyclades_proc_fops = { + .owner = THIS_MODULE, + .open = cyclades_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + /* The serial driver boot-time initialization code! Hardware I/O ports are mapped to character special devices on a first found, first allocated manner. That is, this code searches @@ -5311,9 +5297,9 @@ static const struct tty_operations cy_ops = { .hangup = cy_hangup, .break_ctl = cy_break, .wait_until_sent = cy_wait_until_sent, - .read_proc = cyclades_get_proc_info, .tiocmget = cy_tiocmget, .tiocmset = cy_tiocmset, + .proc_fops = &cyclades_proc_fops, }; static int __init cy_init(void) From cdda7cd92b9c0b8b25c906a1f39c61954432357a Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 31 Mar 2009 15:19:16 -0700 Subject: [PATCH 294/478] proc tty: switch ip2 to ->proc_fops Signed-off-by: Alexey Dobriyan Cc: Alan Cox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/char/ip2/ip2main.c | 76 ++++++++++++++++++-------------------- 1 file changed, 35 insertions(+), 41 deletions(-) diff --git a/drivers/char/ip2/ip2main.c b/drivers/char/ip2/ip2main.c index 70e0ebc30bd0..afd9247cf082 100644 --- a/drivers/char/ip2/ip2main.c +++ b/drivers/char/ip2/ip2main.c @@ -139,7 +139,7 @@ #include static const struct file_operations ip2mem_proc_fops; -static int ip2_read_proc(char *, char **, off_t, int, int *, void * ); +static const struct file_operations ip2_proc_fops; /********************/ /* Type Definitions */ @@ -446,9 +446,9 @@ static const struct tty_operations ip2_ops = { .stop = ip2_stop, .start = ip2_start, .hangup = ip2_hangup, - .read_proc = ip2_read_proc, .tiocmget = ip2_tiocmget, .tiocmset = ip2_tiocmset, + .proc_fops = &ip2_proc_fops, }; /******************************************************************************/ @@ -3029,19 +3029,17 @@ static const struct file_operations ip2mem_proc_fops = { * different sources including ip2mkdev.c and a couple of other drivers. * The bugs are all mine. :-) =mhw= */ -static int ip2_read_proc(char *page, char **start, off_t off, - int count, int *eof, void *data) +static int ip2_proc_show(struct seq_file *m, void *v) { int i, j, box; - int len = 0; int boxes = 0; int ports = 0; int tports = 0; - off_t begin = 0; i2eBordStrPtr pB; + char *sep; - len += sprintf(page, "ip2info: 1.0 driver: %s\n", pcVersion ); - len += sprintf(page+len, "Driver: SMajor=%d CMajor=%d IMajor=%d MaxBoards=%d MaxBoxes=%d MaxPorts=%d\n", + seq_printf(m, "ip2info: 1.0 driver: %s\n", pcVersion); + seq_printf(m, "Driver: SMajor=%d CMajor=%d IMajor=%d MaxBoards=%d MaxBoxes=%d MaxPorts=%d\n", IP2_TTY_MAJOR, IP2_CALLOUT_MAJOR, IP2_IPL_MAJOR, IP2_MAX_BOARDS, ABS_MAX_BOXES, ABS_BIGGEST_BOX); @@ -3053,7 +3051,8 @@ static int ip2_read_proc(char *page, char **start, off_t off, switch( pB->i2ePom.e.porID & ~POR_ID_RESERVED ) { case POR_ID_FIIEX: - len += sprintf( page+len, "Board %d: EX ports=", i ); + seq_printf(m, "Board %d: EX ports=", i); + sep = ""; for( box = 0; box < ABS_MAX_BOXES; ++box ) { ports = 0; @@ -3065,79 +3064,74 @@ static int ip2_read_proc(char *page, char **start, off_t off, ++ports; } } - len += sprintf( page+len, "%d,", ports ); + seq_printf(m, "%s%d", sep, ports); + sep = ","; tports += ports; } - - --len; /* Backup over that last comma */ - - len += sprintf( page+len, " boxes=%d width=%d", boxes, pB->i2eDataWidth16 ? 16 : 8 ); + seq_printf(m, " boxes=%d width=%d", boxes, pB->i2eDataWidth16 ? 16 : 8); break; case POR_ID_II_4: - len += sprintf(page+len, "Board %d: ISA-4 ports=4 boxes=1", i ); + seq_printf(m, "Board %d: ISA-4 ports=4 boxes=1", i); tports = ports = 4; break; case POR_ID_II_8: - len += sprintf(page+len, "Board %d: ISA-8-std ports=8 boxes=1", i ); + seq_printf(m, "Board %d: ISA-8-std ports=8 boxes=1", i); tports = ports = 8; break; case POR_ID_II_8R: - len += sprintf(page+len, "Board %d: ISA-8-RJ11 ports=8 boxes=1", i ); + seq_printf(m, "Board %d: ISA-8-RJ11 ports=8 boxes=1", i); tports = ports = 8; break; default: - len += sprintf(page+len, "Board %d: unknown", i ); + seq_printf(m, "Board %d: unknown", i); /* Don't try and probe for minor numbers */ tports = ports = 0; } } else { /* Don't try and probe for minor numbers */ - len += sprintf(page+len, "Board %d: vacant", i ); + seq_printf(m, "Board %d: vacant", i); tports = ports = 0; } if( tports ) { - len += sprintf(page+len, " minors=" ); - + seq_puts(m, " minors="); + sep = ""; for ( box = 0; box < ABS_MAX_BOXES; ++box ) { for ( j = 0; j < ABS_BIGGEST_BOX; ++j ) { if ( pB->i2eChannelMap[box] & (1 << j) ) { - len += sprintf (page+len,"%d,", + seq_printf(m, "%s%d", sep, j + ABS_BIGGEST_BOX * (box+i*ABS_MAX_BOXES)); + sep = ","; } } } - - page[ len - 1 ] = '\n'; /* Overwrite that last comma */ - } else { - len += sprintf (page+len,"\n" ); - } - - if (len+begin > off+count) - break; - if (len+begin < off) { - begin += len; - len = 0; } + seq_putc(m, '\n'); } - - if (i >= IP2_MAX_BOARDS) - *eof = 1; - if (off >= len+begin) - return 0; - - *start = page + (off-begin); - return ((count < begin+len-off) ? count : begin+len-off); + return 0; } + +static int ip2_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, ip2_proc_show, NULL); +} + +static const struct file_operations ip2_proc_fops = { + .owner = THIS_MODULE, + .open = ip2_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; /******************************************************************************/ /* Function: ip2trace() */ From 5bd6de7dadb8054a558ae4ac29121d8e93493065 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 31 Mar 2009 15:19:16 -0700 Subject: [PATCH 295/478] proc tty: switch istallion to ->proc_fops Signed-off-by: Alexey Dobriyan Cc: Alan Cox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/char/istallion.c | 121 +++++++++++++++++---------------------- 1 file changed, 52 insertions(+), 69 deletions(-) diff --git a/drivers/char/istallion.c b/drivers/char/istallion.c index 5c3dc6b8411c..fff19f7e29d2 100644 --- a/drivers/char/istallion.c +++ b/drivers/char/istallion.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include @@ -613,7 +614,6 @@ static int stli_breakctl(struct tty_struct *tty, int state); static void stli_waituntilsent(struct tty_struct *tty, int timeout); static void stli_sendxchar(struct tty_struct *tty, char ch); static void stli_hangup(struct tty_struct *tty); -static int stli_portinfo(struct stlibrd *brdp, struct stliport *portp, int portnr, char *pos); static int stli_brdinit(struct stlibrd *brdp); static int stli_startbrd(struct stlibrd *brdp); @@ -1893,20 +1893,10 @@ static void stli_sendxchar(struct tty_struct *tty, char ch) stli_cmdwait(brdp, portp, A_PORTCTRL, &actrl, sizeof(asyctrl_t), 0); } -/*****************************************************************************/ - -#define MAXLINE 80 - -/* - * Format info for a specified port. The line is deliberately limited - * to 80 characters. (If it is too long it will be truncated, if too - * short then padded with spaces). - */ - -static int stli_portinfo(struct stlibrd *brdp, struct stliport *portp, int portnr, char *pos) +static void stli_portinfo(struct seq_file *m, struct stlibrd *brdp, struct stliport *portp, int portnr) { - char *sp, *uart; - int rc, cnt; + char *uart; + int rc; rc = stli_portcmdstats(NULL, portp); @@ -1918,44 +1908,50 @@ static int stli_portinfo(struct stlibrd *brdp, struct stliport *portp, int portn default:uart = "CD1400"; break; } } - - sp = pos; - sp += sprintf(sp, "%d: uart:%s ", portnr, uart); + seq_printf(m, "%d: uart:%s ", portnr, uart); if ((brdp->state & BST_STARTED) && (rc >= 0)) { - sp += sprintf(sp, "tx:%d rx:%d", (int) stli_comstats.txtotal, + char sep; + + seq_printf(m, "tx:%d rx:%d", (int) stli_comstats.txtotal, (int) stli_comstats.rxtotal); if (stli_comstats.rxframing) - sp += sprintf(sp, " fe:%d", + seq_printf(m, " fe:%d", (int) stli_comstats.rxframing); if (stli_comstats.rxparity) - sp += sprintf(sp, " pe:%d", + seq_printf(m, " pe:%d", (int) stli_comstats.rxparity); if (stli_comstats.rxbreaks) - sp += sprintf(sp, " brk:%d", + seq_printf(m, " brk:%d", (int) stli_comstats.rxbreaks); if (stli_comstats.rxoverrun) - sp += sprintf(sp, " oe:%d", + seq_printf(m, " oe:%d", (int) stli_comstats.rxoverrun); - cnt = sprintf(sp, "%s%s%s%s%s ", - (stli_comstats.signals & TIOCM_RTS) ? "|RTS" : "", - (stli_comstats.signals & TIOCM_CTS) ? "|CTS" : "", - (stli_comstats.signals & TIOCM_DTR) ? "|DTR" : "", - (stli_comstats.signals & TIOCM_CD) ? "|DCD" : "", - (stli_comstats.signals & TIOCM_DSR) ? "|DSR" : ""); - *sp = ' '; - sp += cnt; + sep = ' '; + if (stli_comstats.signals & TIOCM_RTS) { + seq_printf(m, "%c%s", sep, "RTS"); + sep = '|'; + } + if (stli_comstats.signals & TIOCM_CTS) { + seq_printf(m, "%c%s", sep, "CTS"); + sep = '|'; + } + if (stli_comstats.signals & TIOCM_DTR) { + seq_printf(m, "%c%s", sep, "DTR"); + sep = '|'; + } + if (stli_comstats.signals & TIOCM_CD) { + seq_printf(m, "%c%s", sep, "DCD"); + sep = '|'; + } + if (stli_comstats.signals & TIOCM_DSR) { + seq_printf(m, "%c%s", sep, "DSR"); + sep = '|'; + } } - - for (cnt = (sp - pos); (cnt < (MAXLINE - 1)); cnt++) - *sp++ = ' '; - if (cnt >= MAXLINE) - pos[(MAXLINE - 2)] = '+'; - pos[(MAXLINE - 1)] = '\n'; - - return(MAXLINE); + seq_putc(m, '\n'); } /*****************************************************************************/ @@ -1964,26 +1960,15 @@ static int stli_portinfo(struct stlibrd *brdp, struct stliport *portp, int portn * Port info, read from the /proc file system. */ -static int stli_readproc(char *page, char **start, off_t off, int count, int *eof, void *data) +static int stli_proc_show(struct seq_file *m, void *v) { struct stlibrd *brdp; struct stliport *portp; unsigned int brdnr, portnr, totalport; - int curoff, maxoff; - char *pos; - pos = page; totalport = 0; - curoff = 0; - if (off == 0) { - pos += sprintf(pos, "%s: version %s", stli_drvtitle, - stli_drvversion); - while (pos < (page + MAXLINE - 1)) - *pos++ = ' '; - *pos++ = '\n'; - } - curoff = MAXLINE; + seq_printf(m, "%s: version %s\n", stli_drvtitle, stli_drvversion); /* * We scan through for each board, panel and port. The offset is @@ -1996,33 +1981,31 @@ static int stli_readproc(char *page, char **start, off_t off, int count, int *eo if (brdp->state == 0) continue; - maxoff = curoff + (brdp->nrports * MAXLINE); - if (off >= maxoff) { - curoff = maxoff; - continue; - } - totalport = brdnr * STL_MAXPORTS; for (portnr = 0; (portnr < brdp->nrports); portnr++, totalport++) { portp = brdp->ports[portnr]; if (portp == NULL) continue; - if (off >= (curoff += MAXLINE)) - continue; - if ((pos - page + MAXLINE) > count) - goto stli_readdone; - pos += stli_portinfo(brdp, portp, totalport, pos); + stli_portinfo(m, brdp, portp, totalport); } } - - *eof = 1; - -stli_readdone: - *start = page; - return(pos - page); + return 0; } +static int stli_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, stli_proc_show, NULL); +} + +static const struct file_operations stli_proc_fops = { + .owner = THIS_MODULE, + .open = stli_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + /*****************************************************************************/ /* @@ -4427,9 +4410,9 @@ static const struct tty_operations stli_ops = { .break_ctl = stli_breakctl, .wait_until_sent = stli_waituntilsent, .send_xchar = stli_sendxchar, - .read_proc = stli_readproc, .tiocmget = stli_tiocmget, .tiocmset = stli_tiocmset, + .proc_fops = &stli_proc_fops, }; static const struct tty_port_operations stli_port_ops = { From 87687144b4fce2ad083e689eec8b219054c292ae Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 31 Mar 2009 15:19:17 -0700 Subject: [PATCH 296/478] proc tty: switch synclink_cs to ->proc_fops Signed-off-by: Alexey Dobriyan Cc: Alan Cox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/char/pcmcia/synclink_cs.c | 75 +++++++++++++++---------------- 1 file changed, 35 insertions(+), 40 deletions(-) diff --git a/drivers/char/pcmcia/synclink_cs.c b/drivers/char/pcmcia/synclink_cs.c index 5608a1e5a3b3..19d79fc54461 100644 --- a/drivers/char/pcmcia/synclink_cs.c +++ b/drivers/char/pcmcia/synclink_cs.c @@ -51,6 +51,7 @@ #include #include #include +#include #include #include #include @@ -2619,13 +2620,12 @@ cleanup: * /proc fs routines.... */ -static inline int line_info(char *buf, MGSLPC_INFO *info) +static inline void line_info(struct seq_file *m, MGSLPC_INFO *info) { char stat_buf[30]; - int ret; unsigned long flags; - ret = sprintf(buf, "%s:io:%04X irq:%d", + seq_printf(m, "%s:io:%04X irq:%d", info->device_name, info->io_base, info->irq_level); /* output current serial signal states */ @@ -2649,75 +2649,70 @@ static inline int line_info(char *buf, MGSLPC_INFO *info) strcat(stat_buf, "|RI"); if (info->params.mode == MGSL_MODE_HDLC) { - ret += sprintf(buf+ret, " HDLC txok:%d rxok:%d", + seq_printf(m, " HDLC txok:%d rxok:%d", info->icount.txok, info->icount.rxok); if (info->icount.txunder) - ret += sprintf(buf+ret, " txunder:%d", info->icount.txunder); + seq_printf(m, " txunder:%d", info->icount.txunder); if (info->icount.txabort) - ret += sprintf(buf+ret, " txabort:%d", info->icount.txabort); + seq_printf(m, " txabort:%d", info->icount.txabort); if (info->icount.rxshort) - ret += sprintf(buf+ret, " rxshort:%d", info->icount.rxshort); + seq_printf(m, " rxshort:%d", info->icount.rxshort); if (info->icount.rxlong) - ret += sprintf(buf+ret, " rxlong:%d", info->icount.rxlong); + seq_printf(m, " rxlong:%d", info->icount.rxlong); if (info->icount.rxover) - ret += sprintf(buf+ret, " rxover:%d", info->icount.rxover); + seq_printf(m, " rxover:%d", info->icount.rxover); if (info->icount.rxcrc) - ret += sprintf(buf+ret, " rxcrc:%d", info->icount.rxcrc); + seq_printf(m, " rxcrc:%d", info->icount.rxcrc); } else { - ret += sprintf(buf+ret, " ASYNC tx:%d rx:%d", + seq_printf(m, " ASYNC tx:%d rx:%d", info->icount.tx, info->icount.rx); if (info->icount.frame) - ret += sprintf(buf+ret, " fe:%d", info->icount.frame); + seq_printf(m, " fe:%d", info->icount.frame); if (info->icount.parity) - ret += sprintf(buf+ret, " pe:%d", info->icount.parity); + seq_printf(m, " pe:%d", info->icount.parity); if (info->icount.brk) - ret += sprintf(buf+ret, " brk:%d", info->icount.brk); + seq_printf(m, " brk:%d", info->icount.brk); if (info->icount.overrun) - ret += sprintf(buf+ret, " oe:%d", info->icount.overrun); + seq_printf(m, " oe:%d", info->icount.overrun); } /* Append serial signal status to end */ - ret += sprintf(buf+ret, " %s\n", stat_buf+1); + seq_printf(m, " %s\n", stat_buf+1); - ret += sprintf(buf+ret, "txactive=%d bh_req=%d bh_run=%d pending_bh=%x\n", + seq_printf(m, "txactive=%d bh_req=%d bh_run=%d pending_bh=%x\n", info->tx_active,info->bh_requested,info->bh_running, info->pending_bh); - - return ret; } /* Called to print information about devices */ -static int mgslpc_read_proc(char *page, char **start, off_t off, int count, - int *eof, void *data) +static int mgslpc_proc_show(struct seq_file *m, void *v) { - int len = 0, l; - off_t begin = 0; MGSLPC_INFO *info; - len += sprintf(page, "synclink driver:%s\n", driver_version); + seq_printf(m, "synclink driver:%s\n", driver_version); info = mgslpc_device_list; while( info ) { - l = line_info(page + len, info); - len += l; - if (len+begin > off+count) - goto done; - if (len+begin < off) { - begin += len; - len = 0; - } + line_info(m, info); info = info->next_device; } - - *eof = 1; -done: - if (off >= len+begin) - return 0; - *start = page + (off-begin); - return ((count < begin+len-off) ? count : begin+len-off); + return 0; } +static int mgslpc_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, mgslpc_proc_show, NULL); +} + +static const struct file_operations mgslpc_proc_fops = { + .owner = THIS_MODULE, + .open = mgslpc_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + static int rx_alloc_buffers(MGSLPC_INFO *info) { /* each buffer has header and data */ @@ -2861,13 +2856,13 @@ static const struct tty_operations mgslpc_ops = { .send_xchar = mgslpc_send_xchar, .break_ctl = mgslpc_break, .wait_until_sent = mgslpc_wait_until_sent, - .read_proc = mgslpc_read_proc, .set_termios = mgslpc_set_termios, .stop = tx_pause, .start = tx_release, .hangup = mgslpc_hangup, .tiocmget = tiocmget, .tiocmset = tiocmset, + .proc_fops = &mgslpc_proc_fops, }; static void synclink_cs_cleanup(void) From 8561c44c9e8baf02a9e3018f76c53aa99038a499 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 31 Mar 2009 15:19:18 -0700 Subject: [PATCH 297/478] proc tty: switch stallion to ->proc_fops Signed-off-by: Alexey Dobriyan Cc: Alan Cox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/char/stallion.c | 126 ++++++++++++++++------------------------ 1 file changed, 50 insertions(+), 76 deletions(-) diff --git a/drivers/char/stallion.c b/drivers/char/stallion.c index e1e0dd89ac9a..2ad813a801dc 100644 --- a/drivers/char/stallion.c +++ b/drivers/char/stallion.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include #include @@ -1379,52 +1380,47 @@ static void stl_sendxchar(struct tty_struct *tty, char ch) stl_putchar(tty, ch); } -/*****************************************************************************/ - -#define MAXLINE 80 - -/* - * Format info for a specified port. The line is deliberately limited - * to 80 characters. (If it is too long it will be truncated, if too - * short then padded with spaces). - */ - -static int stl_portinfo(struct stlport *portp, int portnr, char *pos) +static void stl_portinfo(struct seq_file *m, struct stlport *portp, int portnr) { - char *sp; - int sigs, cnt; + int sigs; + char sep; - sp = pos; - sp += sprintf(sp, "%d: uart:%s tx:%d rx:%d", + seq_printf(m, "%d: uart:%s tx:%d rx:%d", portnr, (portp->hwid == 1) ? "SC26198" : "CD1400", (int) portp->stats.txtotal, (int) portp->stats.rxtotal); if (portp->stats.rxframing) - sp += sprintf(sp, " fe:%d", (int) portp->stats.rxframing); + seq_printf(m, " fe:%d", (int) portp->stats.rxframing); if (portp->stats.rxparity) - sp += sprintf(sp, " pe:%d", (int) portp->stats.rxparity); + seq_printf(m, " pe:%d", (int) portp->stats.rxparity); if (portp->stats.rxbreaks) - sp += sprintf(sp, " brk:%d", (int) portp->stats.rxbreaks); + seq_printf(m, " brk:%d", (int) portp->stats.rxbreaks); if (portp->stats.rxoverrun) - sp += sprintf(sp, " oe:%d", (int) portp->stats.rxoverrun); + seq_printf(m, " oe:%d", (int) portp->stats.rxoverrun); sigs = stl_getsignals(portp); - cnt = sprintf(sp, "%s%s%s%s%s ", - (sigs & TIOCM_RTS) ? "|RTS" : "", - (sigs & TIOCM_CTS) ? "|CTS" : "", - (sigs & TIOCM_DTR) ? "|DTR" : "", - (sigs & TIOCM_CD) ? "|DCD" : "", - (sigs & TIOCM_DSR) ? "|DSR" : ""); - *sp = ' '; - sp += cnt; - - for (cnt = sp - pos; cnt < (MAXLINE - 1); cnt++) - *sp++ = ' '; - if (cnt >= MAXLINE) - pos[(MAXLINE - 2)] = '+'; - pos[(MAXLINE - 1)] = '\n'; - - return MAXLINE; + sep = ' '; + if (sigs & TIOCM_RTS) { + seq_printf(m, "%c%s", sep, "RTS"); + sep = '|'; + } + if (sigs & TIOCM_CTS) { + seq_printf(m, "%c%s", sep, "CTS"); + sep = '|'; + } + if (sigs & TIOCM_DTR) { + seq_printf(m, "%c%s", sep, "DTR"); + sep = '|'; + } + if (sigs & TIOCM_CD) { + seq_printf(m, "%c%s", sep, "DCD"); + sep = '|'; + } + if (sigs & TIOCM_DSR) { + seq_printf(m, "%c%s", sep, "DSR"); + sep = '|'; + } + seq_putc(m, '\n'); } /*****************************************************************************/ @@ -1433,30 +1429,17 @@ static int stl_portinfo(struct stlport *portp, int portnr, char *pos) * Port info, read from the /proc file system. */ -static int stl_readproc(char *page, char **start, off_t off, int count, int *eof, void *data) +static int stl_proc_show(struct seq_file *m, void *v) { struct stlbrd *brdp; struct stlpanel *panelp; struct stlport *portp; unsigned int brdnr, panelnr, portnr; - int totalport, curoff, maxoff; - char *pos; + int totalport; - pr_debug("stl_readproc(page=%p,start=%p,off=%lx,count=%d,eof=%p," - "data=%p\n", page, start, off, count, eof, data); - - pos = page; totalport = 0; - curoff = 0; - if (off == 0) { - pos += sprintf(pos, "%s: version %s", stl_drvtitle, - stl_drvversion); - while (pos < (page + MAXLINE - 1)) - *pos++ = ' '; - *pos++ = '\n'; - } - curoff = MAXLINE; + seq_printf(m, "%s: version %s\n", stl_drvtitle, stl_drvversion); /* * We scan through for each board, panel and port. The offset is @@ -1469,46 +1452,37 @@ static int stl_readproc(char *page, char **start, off_t off, int count, int *eof if (brdp->state == 0) continue; - maxoff = curoff + (brdp->nrports * MAXLINE); - if (off >= maxoff) { - curoff = maxoff; - continue; - } - totalport = brdnr * STL_MAXPORTS; for (panelnr = 0; panelnr < brdp->nrpanels; panelnr++) { panelp = brdp->panels[panelnr]; if (panelp == NULL) continue; - maxoff = curoff + (panelp->nrports * MAXLINE); - if (off >= maxoff) { - curoff = maxoff; - totalport += panelp->nrports; - continue; - } - for (portnr = 0; portnr < panelp->nrports; portnr++, totalport++) { portp = panelp->ports[portnr]; if (portp == NULL) continue; - if (off >= (curoff += MAXLINE)) - continue; - if ((pos - page + MAXLINE) > count) - goto stl_readdone; - pos += stl_portinfo(portp, totalport, pos); + stl_portinfo(m, portp, totalport); } } } - - *eof = 1; - -stl_readdone: - *start = page; - return pos - page; + return 0; } +static int stl_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, stl_proc_show, NULL); +} + +static const struct file_operations stl_proc_fops = { + .owner = THIS_MODULE, + .open = stl_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + /*****************************************************************************/ /* @@ -2566,9 +2540,9 @@ static const struct tty_operations stl_ops = { .break_ctl = stl_breakctl, .wait_until_sent = stl_waituntilsent, .send_xchar = stl_sendxchar, - .read_proc = stl_readproc, .tiocmget = stl_tiocmget, .tiocmset = stl_tiocmset, + .proc_fops = &stl_proc_fops, }; static const struct tty_port_operations stl_port_ops = { From d337829bd841974045846ec8b428f4199453159e Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 31 Mar 2009 15:19:18 -0700 Subject: [PATCH 298/478] proc tty: switch synclink to ->proc_fops Signed-off-by: Alexey Dobriyan Cc: Alan Cox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/char/synclink.c | 98 ++++++++++++++++------------------------- 1 file changed, 39 insertions(+), 59 deletions(-) diff --git a/drivers/char/synclink.c b/drivers/char/synclink.c index 0057a8f58cb1..afd0b26ca056 100644 --- a/drivers/char/synclink.c +++ b/drivers/char/synclink.c @@ -79,6 +79,7 @@ #include #include #include +#include #include #include #include @@ -3459,18 +3460,17 @@ cleanup: * /proc fs routines.... */ -static inline int line_info(char *buf, struct mgsl_struct *info) +static inline void line_info(struct seq_file *m, struct mgsl_struct *info) { char stat_buf[30]; - int ret; unsigned long flags; if (info->bus_type == MGSL_BUS_TYPE_PCI) { - ret = sprintf(buf, "%s:PCI io:%04X irq:%d mem:%08X lcr:%08X", + seq_printf(m, "%s:PCI io:%04X irq:%d mem:%08X lcr:%08X", info->device_name, info->io_base, info->irq_level, info->phys_memory_base, info->phys_lcr_base); } else { - ret = sprintf(buf, "%s:(E)ISA io:%04X irq:%d dma:%d", + seq_printf(m, "%s:(E)ISA io:%04X irq:%d dma:%d", info->device_name, info->io_base, info->irq_level, info->dma_level); } @@ -3497,37 +3497,37 @@ static inline int line_info(char *buf, struct mgsl_struct *info) if (info->params.mode == MGSL_MODE_HDLC || info->params.mode == MGSL_MODE_RAW ) { - ret += sprintf(buf+ret, " HDLC txok:%d rxok:%d", + seq_printf(m, " HDLC txok:%d rxok:%d", info->icount.txok, info->icount.rxok); if (info->icount.txunder) - ret += sprintf(buf+ret, " txunder:%d", info->icount.txunder); + seq_printf(m, " txunder:%d", info->icount.txunder); if (info->icount.txabort) - ret += sprintf(buf+ret, " txabort:%d", info->icount.txabort); + seq_printf(m, " txabort:%d", info->icount.txabort); if (info->icount.rxshort) - ret += sprintf(buf+ret, " rxshort:%d", info->icount.rxshort); + seq_printf(m, " rxshort:%d", info->icount.rxshort); if (info->icount.rxlong) - ret += sprintf(buf+ret, " rxlong:%d", info->icount.rxlong); + seq_printf(m, " rxlong:%d", info->icount.rxlong); if (info->icount.rxover) - ret += sprintf(buf+ret, " rxover:%d", info->icount.rxover); + seq_printf(m, " rxover:%d", info->icount.rxover); if (info->icount.rxcrc) - ret += sprintf(buf+ret, " rxcrc:%d", info->icount.rxcrc); + seq_printf(m, " rxcrc:%d", info->icount.rxcrc); } else { - ret += sprintf(buf+ret, " ASYNC tx:%d rx:%d", + seq_printf(m, " ASYNC tx:%d rx:%d", info->icount.tx, info->icount.rx); if (info->icount.frame) - ret += sprintf(buf+ret, " fe:%d", info->icount.frame); + seq_printf(m, " fe:%d", info->icount.frame); if (info->icount.parity) - ret += sprintf(buf+ret, " pe:%d", info->icount.parity); + seq_printf(m, " pe:%d", info->icount.parity); if (info->icount.brk) - ret += sprintf(buf+ret, " brk:%d", info->icount.brk); + seq_printf(m, " brk:%d", info->icount.brk); if (info->icount.overrun) - ret += sprintf(buf+ret, " oe:%d", info->icount.overrun); + seq_printf(m, " oe:%d", info->icount.overrun); } /* Append serial signal status to end */ - ret += sprintf(buf+ret, " %s\n", stat_buf+1); + seq_printf(m, " %s\n", stat_buf+1); - ret += sprintf(buf+ret, "txactive=%d bh_req=%d bh_run=%d pending_bh=%x\n", + seq_printf(m, "txactive=%d bh_req=%d bh_run=%d pending_bh=%x\n", info->tx_active,info->bh_requested,info->bh_running, info->pending_bh); @@ -3544,60 +3544,40 @@ static inline int line_info(char *buf, struct mgsl_struct *info) u16 Tmr = usc_InReg( info, TMR ); u16 Tccr = usc_InReg( info, TCCR ); u16 Ccar = inw( info->io_base + CCAR ); - ret += sprintf(buf+ret, "tcsr=%04X tdmr=%04X ticr=%04X rcsr=%04X rdmr=%04X\n" + seq_printf(m, "tcsr=%04X tdmr=%04X ticr=%04X rcsr=%04X rdmr=%04X\n" "ricr=%04X icr =%04X dccr=%04X tmr=%04X tccr=%04X ccar=%04X\n", Tcsr,Tdmr,Ticr,Rscr,Rdmr,Ricr,Icr,Dccr,Tmr,Tccr,Ccar ); } spin_unlock_irqrestore(&info->irq_spinlock,flags); - - return ret; - -} /* end of line_info() */ +} -/* mgsl_read_proc() - * - * Called to print information about devices - * - * Arguments: - * page page of memory to hold returned info - * start - * off - * count - * eof - * data - * - * Return Value: - */ -static int mgsl_read_proc(char *page, char **start, off_t off, int count, - int *eof, void *data) +/* Called to print information about devices */ +static int mgsl_proc_show(struct seq_file *m, void *v) { - int len = 0, l; - off_t begin = 0; struct mgsl_struct *info; - len += sprintf(page, "synclink driver:%s\n", driver_version); + seq_printf(m, "synclink driver:%s\n", driver_version); info = mgsl_device_list; while( info ) { - l = line_info(page + len, info); - len += l; - if (len+begin > off+count) - goto done; - if (len+begin < off) { - begin += len; - len = 0; - } + line_info(m, info); info = info->next_device; } + return 0; +} - *eof = 1; -done: - if (off >= len+begin) - return 0; - *start = page + (off-begin); - return ((count < begin+len-off) ? count : begin+len-off); - -} /* end of mgsl_read_proc() */ +static int mgsl_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, mgsl_proc_show, NULL); +} + +static const struct file_operations mgsl_proc_fops = { + .owner = THIS_MODULE, + .open = mgsl_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; /* mgsl_allocate_dma_buffers() * @@ -4335,13 +4315,13 @@ static const struct tty_operations mgsl_ops = { .send_xchar = mgsl_send_xchar, .break_ctl = mgsl_break, .wait_until_sent = mgsl_wait_until_sent, - .read_proc = mgsl_read_proc, .set_termios = mgsl_set_termios, .stop = mgsl_stop, .start = mgsl_start, .hangup = mgsl_hangup, .tiocmget = tiocmget, .tiocmset = tiocmset, + .proc_fops = &mgsl_proc_fops, }; /* From a18c56e5af41a6391a6bee2c26e806e7997f6698 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 31 Mar 2009 15:19:19 -0700 Subject: [PATCH 299/478] proc tty: switch synclink_gt to ->proc_fops Signed-off-by: Alexey Dobriyan Cc: Alan Cox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/char/synclink_gt.c | 76 ++++++++++++++++++-------------------- 1 file changed, 35 insertions(+), 41 deletions(-) diff --git a/drivers/char/synclink_gt.c b/drivers/char/synclink_gt.c index efb3dc928a43..6ec6e13d47d7 100644 --- a/drivers/char/synclink_gt.c +++ b/drivers/char/synclink_gt.c @@ -60,6 +60,7 @@ #include #include #include +#include #include #include #include @@ -154,7 +155,6 @@ static void tx_hold(struct tty_struct *tty); static void tx_release(struct tty_struct *tty); static int ioctl(struct tty_struct *tty, struct file *file, unsigned int cmd, unsigned long arg); -static int read_proc(char *page, char **start, off_t off, int count,int *eof, void *data); static int chars_in_buffer(struct tty_struct *tty); static void throttle(struct tty_struct * tty); static void unthrottle(struct tty_struct * tty); @@ -1229,13 +1229,12 @@ static long slgt_compat_ioctl(struct tty_struct *tty, struct file *file, /* * proc fs support */ -static inline int line_info(char *buf, struct slgt_info *info) +static inline void line_info(struct seq_file *m, struct slgt_info *info) { char stat_buf[30]; - int ret; unsigned long flags; - ret = sprintf(buf, "%s: IO=%08X IRQ=%d MaxFrameSize=%u\n", + seq_printf(m, "%s: IO=%08X IRQ=%d MaxFrameSize=%u\n", info->device_name, info->phys_reg_addr, info->irq_level, info->max_frame_size); @@ -1260,75 +1259,70 @@ static inline int line_info(char *buf, struct slgt_info *info) strcat(stat_buf, "|RI"); if (info->params.mode != MGSL_MODE_ASYNC) { - ret += sprintf(buf+ret, "\tHDLC txok:%d rxok:%d", + seq_printf(m, "\tHDLC txok:%d rxok:%d", info->icount.txok, info->icount.rxok); if (info->icount.txunder) - ret += sprintf(buf+ret, " txunder:%d", info->icount.txunder); + seq_printf(m, " txunder:%d", info->icount.txunder); if (info->icount.txabort) - ret += sprintf(buf+ret, " txabort:%d", info->icount.txabort); + seq_printf(m, " txabort:%d", info->icount.txabort); if (info->icount.rxshort) - ret += sprintf(buf+ret, " rxshort:%d", info->icount.rxshort); + seq_printf(m, " rxshort:%d", info->icount.rxshort); if (info->icount.rxlong) - ret += sprintf(buf+ret, " rxlong:%d", info->icount.rxlong); + seq_printf(m, " rxlong:%d", info->icount.rxlong); if (info->icount.rxover) - ret += sprintf(buf+ret, " rxover:%d", info->icount.rxover); + seq_printf(m, " rxover:%d", info->icount.rxover); if (info->icount.rxcrc) - ret += sprintf(buf+ret, " rxcrc:%d", info->icount.rxcrc); + seq_printf(m, " rxcrc:%d", info->icount.rxcrc); } else { - ret += sprintf(buf+ret, "\tASYNC tx:%d rx:%d", + seq_printf(m, "\tASYNC tx:%d rx:%d", info->icount.tx, info->icount.rx); if (info->icount.frame) - ret += sprintf(buf+ret, " fe:%d", info->icount.frame); + seq_printf(m, " fe:%d", info->icount.frame); if (info->icount.parity) - ret += sprintf(buf+ret, " pe:%d", info->icount.parity); + seq_printf(m, " pe:%d", info->icount.parity); if (info->icount.brk) - ret += sprintf(buf+ret, " brk:%d", info->icount.brk); + seq_printf(m, " brk:%d", info->icount.brk); if (info->icount.overrun) - ret += sprintf(buf+ret, " oe:%d", info->icount.overrun); + seq_printf(m, " oe:%d", info->icount.overrun); } /* Append serial signal status to end */ - ret += sprintf(buf+ret, " %s\n", stat_buf+1); + seq_printf(m, " %s\n", stat_buf+1); - ret += sprintf(buf+ret, "\ttxactive=%d bh_req=%d bh_run=%d pending_bh=%x\n", + seq_printf(m, "\ttxactive=%d bh_req=%d bh_run=%d pending_bh=%x\n", info->tx_active,info->bh_requested,info->bh_running, info->pending_bh); - - return ret; } /* Called to print information about devices */ -static int read_proc(char *page, char **start, off_t off, int count, - int *eof, void *data) +static int synclink_gt_proc_show(struct seq_file *m, void *v) { - int len = 0, l; - off_t begin = 0; struct slgt_info *info; - len += sprintf(page, "synclink_gt driver\n"); + seq_puts(m, "synclink_gt driver\n"); info = slgt_device_list; while( info ) { - l = line_info(page + len, info); - len += l; - if (len+begin > off+count) - goto done; - if (len+begin < off) { - begin += len; - len = 0; - } + line_info(m, info); info = info->next_device; } - - *eof = 1; -done: - if (off >= len+begin) - return 0; - *start = page + (off-begin); - return ((count < begin+len-off) ? count : begin+len-off); + return 0; } +static int synclink_gt_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, synclink_gt_proc_show, NULL); +} + +static const struct file_operations synclink_gt_proc_fops = { + .owner = THIS_MODULE, + .open = synclink_gt_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + /* * return count of bytes in transmit buffer */ @@ -3562,13 +3556,13 @@ static const struct tty_operations ops = { .send_xchar = send_xchar, .break_ctl = set_break, .wait_until_sent = wait_until_sent, - .read_proc = read_proc, .set_termios = set_termios, .stop = tx_hold, .start = tx_release, .hangup = hangup, .tiocmget = tiocmget, .tiocmset = tiocmset, + .proc_fops = &synclink_gt_proc_fops, }; static void slgt_cleanup(void) From e6c8dd8a5c887caaf6ee29f04c7260617cb28295 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 31 Mar 2009 15:19:20 -0700 Subject: [PATCH 300/478] proc tty: switch synclinkmp to ->proc_fops Signed-off-by: Alexey Dobriyan Cc: Alan Cox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/char/synclinkmp.c | 76 ++++++++++++++++++--------------------- 1 file changed, 35 insertions(+), 41 deletions(-) diff --git a/drivers/char/synclinkmp.c b/drivers/char/synclinkmp.c index 8eb6c89a980e..26de60efe4b2 100644 --- a/drivers/char/synclinkmp.c +++ b/drivers/char/synclinkmp.c @@ -50,6 +50,7 @@ #include #include #include +#include #include #include #include @@ -520,7 +521,6 @@ static void tx_hold(struct tty_struct *tty); static void tx_release(struct tty_struct *tty); static int ioctl(struct tty_struct *tty, struct file *file, unsigned int cmd, unsigned long arg); -static int read_proc(char *page, char **start, off_t off, int count,int *eof, void *data); static int chars_in_buffer(struct tty_struct *tty); static void throttle(struct tty_struct * tty); static void unthrottle(struct tty_struct * tty); @@ -1354,13 +1354,12 @@ static int ioctl(struct tty_struct *tty, struct file *file, * /proc fs routines.... */ -static inline int line_info(char *buf, SLMP_INFO *info) +static inline void line_info(struct seq_file *m, SLMP_INFO *info) { char stat_buf[30]; - int ret; unsigned long flags; - ret = sprintf(buf, "%s: SCABase=%08x Mem=%08X StatusControl=%08x LCR=%08X\n" + seq_printf(m, "%s: SCABase=%08x Mem=%08X StatusControl=%08x LCR=%08X\n" "\tIRQ=%d MaxFrameSize=%u\n", info->device_name, info->phys_sca_base, @@ -1391,75 +1390,70 @@ static inline int line_info(char *buf, SLMP_INFO *info) strcat(stat_buf, "|RI"); if (info->params.mode == MGSL_MODE_HDLC) { - ret += sprintf(buf+ret, "\tHDLC txok:%d rxok:%d", + seq_printf(m, "\tHDLC txok:%d rxok:%d", info->icount.txok, info->icount.rxok); if (info->icount.txunder) - ret += sprintf(buf+ret, " txunder:%d", info->icount.txunder); + seq_printf(m, " txunder:%d", info->icount.txunder); if (info->icount.txabort) - ret += sprintf(buf+ret, " txabort:%d", info->icount.txabort); + seq_printf(m, " txabort:%d", info->icount.txabort); if (info->icount.rxshort) - ret += sprintf(buf+ret, " rxshort:%d", info->icount.rxshort); + seq_printf(m, " rxshort:%d", info->icount.rxshort); if (info->icount.rxlong) - ret += sprintf(buf+ret, " rxlong:%d", info->icount.rxlong); + seq_printf(m, " rxlong:%d", info->icount.rxlong); if (info->icount.rxover) - ret += sprintf(buf+ret, " rxover:%d", info->icount.rxover); + seq_printf(m, " rxover:%d", info->icount.rxover); if (info->icount.rxcrc) - ret += sprintf(buf+ret, " rxlong:%d", info->icount.rxcrc); + seq_printf(m, " rxlong:%d", info->icount.rxcrc); } else { - ret += sprintf(buf+ret, "\tASYNC tx:%d rx:%d", + seq_printf(m, "\tASYNC tx:%d rx:%d", info->icount.tx, info->icount.rx); if (info->icount.frame) - ret += sprintf(buf+ret, " fe:%d", info->icount.frame); + seq_printf(m, " fe:%d", info->icount.frame); if (info->icount.parity) - ret += sprintf(buf+ret, " pe:%d", info->icount.parity); + seq_printf(m, " pe:%d", info->icount.parity); if (info->icount.brk) - ret += sprintf(buf+ret, " brk:%d", info->icount.brk); + seq_printf(m, " brk:%d", info->icount.brk); if (info->icount.overrun) - ret += sprintf(buf+ret, " oe:%d", info->icount.overrun); + seq_printf(m, " oe:%d", info->icount.overrun); } /* Append serial signal status to end */ - ret += sprintf(buf+ret, " %s\n", stat_buf+1); + seq_printf(m, " %s\n", stat_buf+1); - ret += sprintf(buf+ret, "\ttxactive=%d bh_req=%d bh_run=%d pending_bh=%x\n", + seq_printf(m, "\ttxactive=%d bh_req=%d bh_run=%d pending_bh=%x\n", info->tx_active,info->bh_requested,info->bh_running, info->pending_bh); - - return ret; } /* Called to print information about devices */ -static int read_proc(char *page, char **start, off_t off, int count, - int *eof, void *data) +static int synclinkmp_proc_show(struct seq_file *m, void *v) { - int len = 0, l; - off_t begin = 0; SLMP_INFO *info; - len += sprintf(page, "synclinkmp driver:%s\n", driver_version); + seq_printf(m, "synclinkmp driver:%s\n", driver_version); info = synclinkmp_device_list; while( info ) { - l = line_info(page + len, info); - len += l; - if (len+begin > off+count) - goto done; - if (len+begin < off) { - begin += len; - len = 0; - } + line_info(m, info); info = info->next_device; } - - *eof = 1; -done: - if (off >= len+begin) - return 0; - *start = page + (off-begin); - return ((count < begin+len-off) ? count : begin+len-off); + return 0; } +static int synclinkmp_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, synclinkmp_proc_show, NULL); +} + +static const struct file_operations synclinkmp_proc_fops = { + .owner = THIS_MODULE, + .open = synclinkmp_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + /* Return the count of bytes in transmit buffer */ static int chars_in_buffer(struct tty_struct *tty) @@ -3905,13 +3899,13 @@ static const struct tty_operations ops = { .send_xchar = send_xchar, .break_ctl = set_break, .wait_until_sent = wait_until_sent, - .read_proc = read_proc, .set_termios = set_termios, .stop = tx_hold, .start = tx_release, .hangup = hangup, .tiocmget = tiocmget, .tiocmset = tiocmset, + .proc_fops = &synclinkmp_proc_fops, }; From 201a50ba6627dd00aa7b7673a5c454ca387095fb Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 31 Mar 2009 15:19:20 -0700 Subject: [PATCH 301/478] proc tty: switch sdio_uart to ->proc_fops Signed-off-by: Alexey Dobriyan Cc: Alan Cox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/mmc/card/sdio_uart.c | 64 +++++++++++++++++------------------- 1 file changed, 31 insertions(+), 33 deletions(-) diff --git a/drivers/mmc/card/sdio_uart.c b/drivers/mmc/card/sdio_uart.c index 78ad48718ab0..36a8d53ad2a2 100644 --- a/drivers/mmc/card/sdio_uart.c +++ b/drivers/mmc/card/sdio_uart.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include #include @@ -933,67 +934,64 @@ static int sdio_uart_tiocmset(struct tty_struct *tty, struct file *file, return result; } -static int sdio_uart_read_proc(char *page, char **start, off_t off, - int count, int *eof, void *data) +static int sdio_uart_proc_show(struct seq_file *m, void *v) { - int i, len = 0; - off_t begin = 0; + int i; - len += sprintf(page, "serinfo:1.0 driver%s%s revision:%s\n", + seq_printf(m, "serinfo:1.0 driver%s%s revision:%s\n", "", "", ""); - for (i = 0; i < UART_NR && len < PAGE_SIZE - 96; i++) { + for (i = 0; i < UART_NR; i++) { struct sdio_uart_port *port = sdio_uart_port_get(i); if (port) { - len += sprintf(page+len, "%d: uart:SDIO", i); + seq_printf(m, "%d: uart:SDIO", i); if(capable(CAP_SYS_ADMIN)) { - len += sprintf(page + len, " tx:%d rx:%d", + seq_printf(m, " tx:%d rx:%d", port->icount.tx, port->icount.rx); if (port->icount.frame) - len += sprintf(page + len, " fe:%d", + seq_printf(m, " fe:%d", port->icount.frame); if (port->icount.parity) - len += sprintf(page + len, " pe:%d", + seq_printf(m, " pe:%d", port->icount.parity); if (port->icount.brk) - len += sprintf(page + len, " brk:%d", + seq_printf(m, " brk:%d", port->icount.brk); if (port->icount.overrun) - len += sprintf(page + len, " oe:%d", + seq_printf(m, " oe:%d", port->icount.overrun); if (port->icount.cts) - len += sprintf(page + len, " cts:%d", + seq_printf(m, " cts:%d", port->icount.cts); if (port->icount.dsr) - len += sprintf(page + len, " dsr:%d", + seq_printf(m, " dsr:%d", port->icount.dsr); if (port->icount.rng) - len += sprintf(page + len, " rng:%d", + seq_printf(m, " rng:%d", port->icount.rng); if (port->icount.dcd) - len += sprintf(page + len, " dcd:%d", + seq_printf(m, " dcd:%d", port->icount.dcd); } - strcat(page, "\n"); - len++; sdio_uart_port_put(port); - } - - if (len + begin > off + count) - goto done; - if (len + begin < off) { - begin += len; - len = 0; + seq_putc(m, '\n'); } } - *eof = 1; - -done: - if (off >= len + begin) - return 0; - *start = page + (off - begin); - return (count < begin + len - off) ? count : (begin + len - off); + return 0; } +static int sdio_uart_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, sdio_uart_proc_show, NULL); +} + +static const struct file_operations sdio_uart_proc_fops = { + .owner = THIS_MODULE, + .open = sdio_uart_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + static const struct tty_operations sdio_uart_ops = { .open = sdio_uart_open, .close = sdio_uart_close, @@ -1007,7 +1005,7 @@ static const struct tty_operations sdio_uart_ops = { .break_ctl = sdio_uart_break_ctl, .tiocmget = sdio_uart_tiocmget, .tiocmset = sdio_uart_tiocmset, - .read_proc = sdio_uart_read_proc, + .proc_fops = &sdio_uart_proc_fops, }; static struct tty_driver *sdio_uart_tty_driver; From d196a949ba0fb85121c0dc0720b13380d802dbd6 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 31 Mar 2009 15:19:21 -0700 Subject: [PATCH 302/478] proc tty: switch serial_core to ->proc_fops Signed-off-by: Alexey Dobriyan Cc: Alan Cox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/serial/serial_core.c | 76 +++++++++++++++++------------------- 1 file changed, 36 insertions(+), 40 deletions(-) diff --git a/drivers/serial/serial_core.c b/drivers/serial/serial_core.c index 42f4e66fccaf..bf3c0e32a334 100644 --- a/drivers/serial/serial_core.c +++ b/drivers/serial/serial_core.c @@ -27,6 +27,8 @@ #include #include #include +#include +#include #include #include #include @@ -1682,20 +1684,20 @@ static const char *uart_type(struct uart_port *port) #ifdef CONFIG_PROC_FS -static int uart_line_info(char *buf, struct uart_driver *drv, int i) +static void uart_line_info(struct seq_file *m, struct uart_driver *drv, int i) { struct uart_state *state = drv->state + i; int pm_state; struct uart_port *port = state->port; char stat_buf[32]; unsigned int status; - int mmio, ret; + int mmio; if (!port) - return 0; + return; mmio = port->iotype >= UPIO_MEM; - ret = sprintf(buf, "%d: uart:%s %s%08llX irq:%d", + seq_printf(m, "%d: uart:%s %s%08llX irq:%d", port->line, uart_type(port), mmio ? "mmio:0x" : "port:", mmio ? (unsigned long long)port->mapbase @@ -1703,8 +1705,8 @@ static int uart_line_info(char *buf, struct uart_driver *drv, int i) port->irq); if (port->type == PORT_UNKNOWN) { - strcat(buf, "\n"); - return ret + 1; + seq_putc(m, '\n'); + return; } if (capable(CAP_SYS_ADMIN)) { @@ -1719,19 +1721,19 @@ static int uart_line_info(char *buf, struct uart_driver *drv, int i) uart_change_pm(state, pm_state); mutex_unlock(&state->mutex); - ret += sprintf(buf + ret, " tx:%d rx:%d", + seq_printf(m, " tx:%d rx:%d", port->icount.tx, port->icount.rx); if (port->icount.frame) - ret += sprintf(buf + ret, " fe:%d", + seq_printf(m, " fe:%d", port->icount.frame); if (port->icount.parity) - ret += sprintf(buf + ret, " pe:%d", + seq_printf(m, " pe:%d", port->icount.parity); if (port->icount.brk) - ret += sprintf(buf + ret, " brk:%d", + seq_printf(m, " brk:%d", port->icount.brk); if (port->icount.overrun) - ret += sprintf(buf + ret, " oe:%d", + seq_printf(m, " oe:%d", port->icount.overrun); #define INFOBIT(bit, str) \ @@ -1753,45 +1755,39 @@ static int uart_line_info(char *buf, struct uart_driver *drv, int i) STATBIT(TIOCM_RNG, "|RI"); if (stat_buf[0]) stat_buf[0] = ' '; - strcat(stat_buf, "\n"); - ret += sprintf(buf + ret, stat_buf); - } else { - strcat(buf, "\n"); - ret++; + seq_puts(m, stat_buf); } + seq_putc(m, '\n'); #undef STATBIT #undef INFOBIT - return ret; } -static int uart_read_proc(char *page, char **start, off_t off, - int count, int *eof, void *data) +static int uart_proc_show(struct seq_file *m, void *v) { - struct tty_driver *ttydrv = data; + struct tty_driver *ttydrv = v; struct uart_driver *drv = ttydrv->driver_state; - int i, len = 0, l; - off_t begin = 0; + int i; - len += sprintf(page, "serinfo:1.0 driver%s%s revision:%s\n", + seq_printf(m, "serinfo:1.0 driver%s%s revision:%s\n", "", "", ""); - for (i = 0; i < drv->nr && len < PAGE_SIZE - 96; i++) { - l = uart_line_info(page + len, drv, i); - len += l; - if (len + begin > off + count) - goto done; - if (len + begin < off) { - begin += len; - len = 0; - } - } - *eof = 1; - done: - if (off >= len + begin) - return 0; - *start = page + (off - begin); - return (count < begin + len - off) ? count : (begin + len - off); + for (i = 0; i < drv->nr; i++) + uart_line_info(m, drv, i); + return 0; } + +static int uart_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, uart_proc_show, PDE(inode)->data); +} + +static const struct file_operations uart_proc_fops = { + .owner = THIS_MODULE, + .open = uart_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; #endif #if defined(CONFIG_SERIAL_CORE_CONSOLE) || defined(CONFIG_CONSOLE_POLL) @@ -2299,7 +2295,7 @@ static const struct tty_operations uart_ops = { .break_ctl = uart_break_ctl, .wait_until_sent= uart_wait_until_sent, #ifdef CONFIG_PROC_FS - .read_proc = uart_read_proc, + .proc_fops = &uart_proc_fops, #endif .tiocmget = uart_tiocmget, .tiocmset = uart_tiocmset, From 6fd69d3cf1496c8e6751ecb3eae254e1a839bd5d Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 31 Mar 2009 15:19:21 -0700 Subject: [PATCH 303/478] proc tty: switch usb-serial to ->proc_fops Signed-off-by: Alexey Dobriyan Cc: Alan Cox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/usb/serial/usb-serial.c | 58 +++++++++++++++------------------ 1 file changed, 27 insertions(+), 31 deletions(-) diff --git a/drivers/usb/serial/usb-serial.c b/drivers/usb/serial/usb-serial.c index 742a5bc44be8..2a70563bbee1 100644 --- a/drivers/usb/serial/usb-serial.c +++ b/drivers/usb/serial/usb-serial.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -421,57 +422,52 @@ static int serial_break(struct tty_struct *tty, int break_state) return 0; } -static int serial_read_proc(char *page, char **start, off_t off, int count, - int *eof, void *data) +static int serial_proc_show(struct seq_file *m, void *v) { struct usb_serial *serial; - int length = 0; int i; - off_t begin = 0; char tmp[40]; dbg("%s", __func__); - length += sprintf(page, "usbserinfo:1.0 driver:2.0\n"); - for (i = 0; i < SERIAL_TTY_MINORS && length < PAGE_SIZE; ++i) { + seq_puts(m, "usbserinfo:1.0 driver:2.0\n"); + for (i = 0; i < SERIAL_TTY_MINORS; ++i) { serial = usb_serial_get_by_index(i); if (serial == NULL) continue; - length += sprintf(page+length, "%d:", i); + seq_printf(m, "%d:", i); if (serial->type->driver.owner) - length += sprintf(page+length, " module:%s", + seq_printf(m, " module:%s", module_name(serial->type->driver.owner)); - length += sprintf(page+length, " name:\"%s\"", + seq_printf(m, " name:\"%s\"", serial->type->description); - length += sprintf(page+length, " vendor:%04x product:%04x", + seq_printf(m, " vendor:%04x product:%04x", le16_to_cpu(serial->dev->descriptor.idVendor), le16_to_cpu(serial->dev->descriptor.idProduct)); - length += sprintf(page+length, " num_ports:%d", - serial->num_ports); - length += sprintf(page+length, " port:%d", - i - serial->minor + 1); + seq_printf(m, " num_ports:%d", serial->num_ports); + seq_printf(m, " port:%d", i - serial->minor + 1); usb_make_path(serial->dev, tmp, sizeof(tmp)); - length += sprintf(page+length, " path:%s", tmp); + seq_printf(m, " path:%s", tmp); - length += sprintf(page+length, "\n"); - if ((length + begin) > (off + count)) { - usb_serial_put(serial); - goto done; - } - if ((length + begin) < off) { - begin += length; - length = 0; - } + seq_putc(m, '\n'); usb_serial_put(serial); } - *eof = 1; -done: - if (off >= (length + begin)) - return 0; - *start = page + (off-begin); - return (count < begin+length-off) ? count : begin+length-off; + return 0; } +static int serial_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, serial_proc_show, NULL); +} + +static const struct file_operations serial_proc_fops = { + .owner = THIS_MODULE, + .open = serial_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + static int serial_tiocmget(struct tty_struct *tty, struct file *file) { struct usb_serial_port *port = tty->driver_data; @@ -1113,9 +1109,9 @@ static const struct tty_operations serial_ops = { .unthrottle = serial_unthrottle, .break_ctl = serial_break, .chars_in_buffer = serial_chars_in_buffer, - .read_proc = serial_read_proc, .tiocmget = serial_tiocmget, .tiocmset = serial_tiocmset, + .proc_fops = &serial_proc_fops, }; struct tty_driver *usb_serial_tty_driver; From 3d3041768296c4993c0aba686bf0775faab5236a Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 31 Mar 2009 15:19:22 -0700 Subject: [PATCH 304/478] proc tty: switch ircomm to ->proc_fops Signed-off-by: Alexey Dobriyan Cc: Alan Cox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- net/irda/ircomm/ircomm_tty.c | 242 +++++++++++++++++++---------------- 1 file changed, 131 insertions(+), 111 deletions(-) diff --git a/net/irda/ircomm/ircomm_tty.c b/net/irda/ircomm/ircomm_tty.c index 086d5ef098fd..811984d9324b 100644 --- a/net/irda/ircomm/ircomm_tty.c +++ b/net/irda/ircomm/ircomm_tty.c @@ -34,6 +34,7 @@ #include #include #include +#include #include #include #include @@ -72,8 +73,7 @@ static int ircomm_tty_control_indication(void *instance, void *sap, static void ircomm_tty_flow_indication(void *instance, void *sap, LOCAL_FLOW cmd); #ifdef CONFIG_PROC_FS -static int ircomm_tty_read_proc(char *buf, char **start, off_t offset, int len, - int *eof, void *unused); +static const struct file_operations ircomm_tty_proc_fops; #endif /* CONFIG_PROC_FS */ static struct tty_driver *driver; @@ -98,7 +98,7 @@ static const struct tty_operations ops = { .hangup = ircomm_tty_hangup, .wait_until_sent = ircomm_tty_wait_until_sent, #ifdef CONFIG_PROC_FS - .read_proc = ircomm_tty_read_proc, + .proc_fops = &ircomm_tty_proc_fops, #endif /* CONFIG_PROC_FS */ }; @@ -1245,150 +1245,170 @@ static void ircomm_tty_flow_indication(void *instance, void *sap, } #ifdef CONFIG_PROC_FS -static int ircomm_tty_line_info(struct ircomm_tty_cb *self, char *buf) +static void ircomm_tty_line_info(struct ircomm_tty_cb *self, struct seq_file *m) { - int ret=0; + char sep; - ret += sprintf(buf+ret, "State: %s\n", ircomm_tty_state[self->state]); + seq_printf(m, "State: %s\n", ircomm_tty_state[self->state]); - ret += sprintf(buf+ret, "Service type: "); + seq_puts(m, "Service type: "); if (self->service_type & IRCOMM_9_WIRE) - ret += sprintf(buf+ret, "9_WIRE"); + seq_puts(m, "9_WIRE"); else if (self->service_type & IRCOMM_3_WIRE) - ret += sprintf(buf+ret, "3_WIRE"); + seq_puts(m, "3_WIRE"); else if (self->service_type & IRCOMM_3_WIRE_RAW) - ret += sprintf(buf+ret, "3_WIRE_RAW"); + seq_puts(m, "3_WIRE_RAW"); else - ret += sprintf(buf+ret, "No common service type!\n"); - ret += sprintf(buf+ret, "\n"); + seq_puts(m, "No common service type!\n"); + seq_putc(m, '\n'); - ret += sprintf(buf+ret, "Port name: %s\n", self->settings.port_name); + seq_printf(m, "Port name: %s\n", self->settings.port_name); - ret += sprintf(buf+ret, "DTE status: "); - if (self->settings.dte & IRCOMM_RTS) - ret += sprintf(buf+ret, "RTS|"); - if (self->settings.dte & IRCOMM_DTR) - ret += sprintf(buf+ret, "DTR|"); - if (self->settings.dte) - ret--; /* remove the last | */ - ret += sprintf(buf+ret, "\n"); + seq_printf(m, "DTE status:"); + sep = ' '; + if (self->settings.dte & IRCOMM_RTS) { + seq_printf(m, "%cRTS", sep); + sep = '|'; + } + if (self->settings.dte & IRCOMM_DTR) { + seq_printf(m, "%cDTR", sep); + sep = '|'; + } + seq_putc(m, '\n'); - ret += sprintf(buf+ret, "DCE status: "); - if (self->settings.dce & IRCOMM_CTS) - ret += sprintf(buf+ret, "CTS|"); - if (self->settings.dce & IRCOMM_DSR) - ret += sprintf(buf+ret, "DSR|"); - if (self->settings.dce & IRCOMM_CD) - ret += sprintf(buf+ret, "CD|"); - if (self->settings.dce & IRCOMM_RI) - ret += sprintf(buf+ret, "RI|"); - if (self->settings.dce) - ret--; /* remove the last | */ - ret += sprintf(buf+ret, "\n"); + seq_puts(m, "DCE status:"); + sep = ' '; + if (self->settings.dce & IRCOMM_CTS) { + seq_printf(m, "%cCTS", sep); + sep = '|'; + } + if (self->settings.dce & IRCOMM_DSR) { + seq_printf(m, "%cDSR", sep); + sep = '|'; + } + if (self->settings.dce & IRCOMM_CD) { + seq_printf(m, "%cCD", sep); + sep = '|'; + } + if (self->settings.dce & IRCOMM_RI) { + seq_printf(m, "%cRI", sep); + sep = '|'; + } + seq_putc(m, '\n'); - ret += sprintf(buf+ret, "Configuration: "); + seq_puts(m, "Configuration: "); if (!self->settings.null_modem) - ret += sprintf(buf+ret, "DTE <-> DCE\n"); + seq_puts(m, "DTE <-> DCE\n"); else - ret += sprintf(buf+ret, - "DTE <-> DTE (null modem emulation)\n"); + seq_puts(m, "DTE <-> DTE (null modem emulation)\n"); - ret += sprintf(buf+ret, "Data rate: %d\n", self->settings.data_rate); + seq_printf(m, "Data rate: %d\n", self->settings.data_rate); - ret += sprintf(buf+ret, "Flow control: "); - if (self->settings.flow_control & IRCOMM_XON_XOFF_IN) - ret += sprintf(buf+ret, "XON_XOFF_IN|"); - if (self->settings.flow_control & IRCOMM_XON_XOFF_OUT) - ret += sprintf(buf+ret, "XON_XOFF_OUT|"); - if (self->settings.flow_control & IRCOMM_RTS_CTS_IN) - ret += sprintf(buf+ret, "RTS_CTS_IN|"); - if (self->settings.flow_control & IRCOMM_RTS_CTS_OUT) - ret += sprintf(buf+ret, "RTS_CTS_OUT|"); - if (self->settings.flow_control & IRCOMM_DSR_DTR_IN) - ret += sprintf(buf+ret, "DSR_DTR_IN|"); - if (self->settings.flow_control & IRCOMM_DSR_DTR_OUT) - ret += sprintf(buf+ret, "DSR_DTR_OUT|"); - if (self->settings.flow_control & IRCOMM_ENQ_ACK_IN) - ret += sprintf(buf+ret, "ENQ_ACK_IN|"); - if (self->settings.flow_control & IRCOMM_ENQ_ACK_OUT) - ret += sprintf(buf+ret, "ENQ_ACK_OUT|"); - if (self->settings.flow_control) - ret--; /* remove the last | */ - ret += sprintf(buf+ret, "\n"); + seq_puts(m, "Flow control:"); + sep = ' '; + if (self->settings.flow_control & IRCOMM_XON_XOFF_IN) { + seq_printf(m, "%cXON_XOFF_IN", sep); + sep = '|'; + } + if (self->settings.flow_control & IRCOMM_XON_XOFF_OUT) { + seq_printf(m, "%cXON_XOFF_OUT", sep); + sep = '|'; + } + if (self->settings.flow_control & IRCOMM_RTS_CTS_IN) { + seq_printf(m, "%cRTS_CTS_IN", sep); + sep = '|'; + } + if (self->settings.flow_control & IRCOMM_RTS_CTS_OUT) { + seq_printf(m, "%cRTS_CTS_OUT", sep); + sep = '|'; + } + if (self->settings.flow_control & IRCOMM_DSR_DTR_IN) { + seq_printf(m, "%cDSR_DTR_IN", sep); + sep = '|'; + } + if (self->settings.flow_control & IRCOMM_DSR_DTR_OUT) { + seq_printf(m, "%cDSR_DTR_OUT", sep); + sep = '|'; + } + if (self->settings.flow_control & IRCOMM_ENQ_ACK_IN) { + seq_printf(m, "%cENQ_ACK_IN", sep); + sep = '|'; + } + if (self->settings.flow_control & IRCOMM_ENQ_ACK_OUT) { + seq_printf(m, "%cENQ_ACK_OUT", sep); + sep = '|'; + } + seq_putc(m, '\n'); - ret += sprintf(buf+ret, "Flags: "); - if (self->flags & ASYNC_CTS_FLOW) - ret += sprintf(buf+ret, "ASYNC_CTS_FLOW|"); - if (self->flags & ASYNC_CHECK_CD) - ret += sprintf(buf+ret, "ASYNC_CHECK_CD|"); - if (self->flags & ASYNC_INITIALIZED) - ret += sprintf(buf+ret, "ASYNC_INITIALIZED|"); - if (self->flags & ASYNC_LOW_LATENCY) - ret += sprintf(buf+ret, "ASYNC_LOW_LATENCY|"); - if (self->flags & ASYNC_CLOSING) - ret += sprintf(buf+ret, "ASYNC_CLOSING|"); - if (self->flags & ASYNC_NORMAL_ACTIVE) - ret += sprintf(buf+ret, "ASYNC_NORMAL_ACTIVE|"); - if (self->flags) - ret--; /* remove the last | */ - ret += sprintf(buf+ret, "\n"); + seq_puts(m, "Flags:"); + sep = ' '; + if (self->flags & ASYNC_CTS_FLOW) { + seq_printf(m, "%cASYNC_CTS_FLOW", sep); + sep = '|'; + } + if (self->flags & ASYNC_CHECK_CD) { + seq_printf(m, "%cASYNC_CHECK_CD", sep); + sep = '|'; + } + if (self->flags & ASYNC_INITIALIZED) { + seq_printf(m, "%cASYNC_INITIALIZED", sep); + sep = '|'; + } + if (self->flags & ASYNC_LOW_LATENCY) { + seq_printf(m, "%cASYNC_LOW_LATENCY", sep); + sep = '|'; + } + if (self->flags & ASYNC_CLOSING) { + seq_printf(m, "%cASYNC_CLOSING", sep); + sep = '|'; + } + if (self->flags & ASYNC_NORMAL_ACTIVE) { + seq_printf(m, "%cASYNC_NORMAL_ACTIVE", sep); + sep = '|'; + } + seq_putc(m, '\n'); - ret += sprintf(buf+ret, "Role: %s\n", self->client ? - "client" : "server"); - ret += sprintf(buf+ret, "Open count: %d\n", self->open_count); - ret += sprintf(buf+ret, "Max data size: %d\n", self->max_data_size); - ret += sprintf(buf+ret, "Max header size: %d\n", self->max_header_size); + seq_printf(m, "Role: %s\n", self->client ? "client" : "server"); + seq_printf(m, "Open count: %d\n", self->open_count); + seq_printf(m, "Max data size: %d\n", self->max_data_size); + seq_printf(m, "Max header size: %d\n", self->max_header_size); if (self->tty) - ret += sprintf(buf+ret, "Hardware: %s\n", + seq_printf(m, "Hardware: %s\n", self->tty->hw_stopped ? "Stopped" : "Running"); - - ret += sprintf(buf+ret, "\n"); - return ret; } - -/* - * Function ircomm_tty_read_proc (buf, start, offset, len, eof, unused) - * - * - * - */ -static int ircomm_tty_read_proc(char *buf, char **start, off_t offset, int len, - int *eof, void *unused) +static int ircomm_tty_proc_show(struct seq_file *m, void *v) { struct ircomm_tty_cb *self; - int count = 0, l; - off_t begin = 0; unsigned long flags; spin_lock_irqsave(&ircomm_tty->hb_spinlock, flags); self = (struct ircomm_tty_cb *) hashbin_get_first(ircomm_tty); - while ((self != NULL) && (count < 4000)) { + while (self != NULL) { if (self->magic != IRCOMM_TTY_MAGIC) break; - l = ircomm_tty_line_info(self, buf + count); - count += l; - if (count+begin > offset+len) - goto done; - if (count+begin < offset) { - begin += count; - count = 0; - } - + ircomm_tty_line_info(self, m); self = (struct ircomm_tty_cb *) hashbin_get_next(ircomm_tty); } - *eof = 1; -done: spin_unlock_irqrestore(&ircomm_tty->hb_spinlock, flags); - - if (offset >= count+begin) - return 0; - *start = buf + (offset-begin); - return ((len < begin+count-offset) ? len : begin+count-offset); + return 0; } + +static int ircomm_tty_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, ircomm_tty_proc_show, NULL); +} + +static const struct file_operations ircomm_tty_proc_fops = { + .owner = THIS_MODULE, + .open = ircomm_tty_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; #endif /* CONFIG_PROC_FS */ MODULE_AUTHOR("Dag Brattli "); From d594027d62808f6649b273544ab8c19ed5b98fd1 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 31 Mar 2009 15:19:23 -0700 Subject: [PATCH 305/478] proc tty: switch amiserial to ->proc_fops Signed-off-by: Alexey Dobriyan Cc: Alan Cox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/char/amiserial.c | 62 ++++++++++++++++++---------------------- 1 file changed, 28 insertions(+), 34 deletions(-) diff --git a/drivers/char/amiserial.c b/drivers/char/amiserial.c index a58869ea8513..fd3ebd1be570 100644 --- a/drivers/char/amiserial.c +++ b/drivers/char/amiserial.c @@ -79,6 +79,7 @@ static char *serial_version = "4.30"; #include #include #include +#include #include #include #include @@ -1825,14 +1826,13 @@ static int rs_open(struct tty_struct *tty, struct file * filp) * /proc fs routines.... */ -static inline int line_info(char *buf, struct serial_state *state) +static inline void line_info(struct seq_file *m, struct serial_state *state) { struct async_struct *info = state->info, scr_info; char stat_buf[30], control, status; - int ret; unsigned long flags; - ret = sprintf(buf, "%d: uart:amiga_builtin",state->line); + seq_printf(m, "%d: uart:amiga_builtin",state->line); /* * Figure out the current RS-232 lines @@ -1864,55 +1864,49 @@ static inline int line_info(char *buf, struct serial_state *state) strcat(stat_buf, "|CD"); if (info->quot) { - ret += sprintf(buf+ret, " baud:%d", - state->baud_base / info->quot); + seq_printf(m, " baud:%d", state->baud_base / info->quot); } - ret += sprintf(buf+ret, " tx:%d rx:%d", - state->icount.tx, state->icount.rx); + seq_printf(m, " tx:%d rx:%d", state->icount.tx, state->icount.rx); if (state->icount.frame) - ret += sprintf(buf+ret, " fe:%d", state->icount.frame); + seq_printf(m, " fe:%d", state->icount.frame); if (state->icount.parity) - ret += sprintf(buf+ret, " pe:%d", state->icount.parity); + seq_printf(m, " pe:%d", state->icount.parity); if (state->icount.brk) - ret += sprintf(buf+ret, " brk:%d", state->icount.brk); + seq_printf(m, " brk:%d", state->icount.brk); if (state->icount.overrun) - ret += sprintf(buf+ret, " oe:%d", state->icount.overrun); + seq_printf(m, " oe:%d", state->icount.overrun); /* * Last thing is the RS-232 status lines */ - ret += sprintf(buf+ret, " %s\n", stat_buf+1); - return ret; + seq_printf(m, " %s\n", stat_buf+1); } -static int rs_read_proc(char *page, char **start, off_t off, int count, - int *eof, void *data) +static int rs_proc_show(struct seq_file *m, void *v) { - int len = 0, l; - off_t begin = 0; - - len += sprintf(page, "serinfo:1.0 driver:%s\n", serial_version); - l = line_info(page + len, &rs_table[0]); - len += l; - if (len+begin > off+count) - goto done; - if (len+begin < off) { - begin += len; - len = 0; - } - *eof = 1; -done: - if (off >= len+begin) - return 0; - *start = page + (off-begin); - return ((count < begin+len-off) ? count : begin+len-off); + seq_printf(m, "serinfo:1.0 driver:%s\n", serial_version); + line_info(m, &rs_table[0]); + return 0; } +static int rs_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, rs_proc_show, NULL); +} + +static const struct file_operations rs_proc_fops = { + .owner = THIS_MODULE, + .open = rs_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + /* * --------------------------------------------------------------------- * rs_init() and friends @@ -1951,9 +1945,9 @@ static const struct tty_operations serial_ops = { .break_ctl = rs_break, .send_xchar = rs_send_xchar, .wait_until_sent = rs_wait_until_sent, - .read_proc = rs_read_proc, .tiocmget = rs_tiocmget, .tiocmset = rs_tiocmset, + .proc_fops = &rs_proc_fops, }; /* From bf54215ef86a1bd83affd8ecdf833c053aefb49d Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 31 Mar 2009 15:19:23 -0700 Subject: [PATCH 306/478] proc tty: switch ia64 simserial to ->proc_fops Signed-off-by: Alexey Dobriyan Cc: Alan Cox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/hp/sim/simserial.c | 47 ++++++++++++++++++------------------ 1 file changed, 23 insertions(+), 24 deletions(-) diff --git a/arch/ia64/hp/sim/simserial.c b/arch/ia64/hp/sim/simserial.c index 24b1ad5334cb..2bef5261d96d 100644 --- a/arch/ia64/hp/sim/simserial.c +++ b/arch/ia64/hp/sim/simserial.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include @@ -848,38 +849,36 @@ static int rs_open(struct tty_struct *tty, struct file * filp) * /proc fs routines.... */ -static inline int line_info(char *buf, struct serial_state *state) +static inline void line_info(struct seq_file *m, struct serial_state *state) { - return sprintf(buf, "%d: uart:%s port:%lX irq:%d\n", + seq_printf(m, "%d: uart:%s port:%lX irq:%d\n", state->line, uart_config[state->type].name, state->port, state->irq); } -static int rs_read_proc(char *page, char **start, off_t off, int count, - int *eof, void *data) +static int rs_proc_show(struct seq_file *m, void *v) { - int i, len = 0, l; - off_t begin = 0; + int i; - len += sprintf(page, "simserinfo:1.0 driver:%s\n", serial_version); - for (i = 0; i < NR_PORTS && len < 4000; i++) { - l = line_info(page + len, &rs_table[i]); - len += l; - if (len+begin > off+count) - goto done; - if (len+begin < off) { - begin += len; - len = 0; - } - } - *eof = 1; -done: - if (off >= len+begin) - return 0; - *start = page + (begin-off); - return ((count < begin+len-off) ? count : begin+len-off); + seq_printf(m, "simserinfo:1.0 driver:%s\n", serial_version); + for (i = 0; i < NR_PORTS; i++) + line_info(m, &rs_table[i]); + return 0; } +static int rs_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, rs_proc_show, NULL); +} + +static const struct file_operations rs_proc_fops = { + .owner = THIS_MODULE, + .open = rs_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + /* * --------------------------------------------------------------------- * rs_init() and friends @@ -917,7 +916,7 @@ static const struct tty_operations hp_ops = { .start = rs_start, .hangup = rs_hangup, .wait_until_sent = rs_wait_until_sent, - .read_proc = rs_read_proc, + .proc_fops = &rs_proc_fops, }; /* From 140716934f67a9b28c3f7032c07c20c746d97a31 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 31 Mar 2009 15:19:24 -0700 Subject: [PATCH 307/478] proc tty: switch xtensa iss console to ->proc_fops Signed-off-by: Alexey Dobriyan Cc: Alan Cox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/xtensa/platforms/iss/console.c | 31 ++++++++++++++++------------- 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/arch/xtensa/platforms/iss/console.c b/arch/xtensa/platforms/iss/console.c index 25d46c84eb08..4c559cf7da2d 100644 --- a/arch/xtensa/platforms/iss/console.c +++ b/arch/xtensa/platforms/iss/console.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include @@ -176,22 +177,24 @@ static void rs_wait_until_sent(struct tty_struct *tty, int timeout) /* Stub, once again.. */ } -static int rs_read_proc(char *page, char **start, off_t off, int count, - int *eof, void *data) +static int rs_proc_show(struct seq_file *m, void *v) { - int len = 0; - off_t begin = 0; - - len += sprintf(page, "serinfo:1.0 driver:%s\n", serial_version); - *eof = 1; - - if (off >= len + begin) - return 0; - - *start = page + (off - begin); - return ((count < begin + len - off) ? count : begin + len - off); + seq_printf(m, "serinfo:1.0 driver:%s\n", serial_version); + return 0; } +static int rs_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, rs_proc_show, NULL); +} + +static const struct file_operations rs_proc_fops = { + .owner = THIS_MODULE, + .open = rs_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; static struct tty_operations serial_ops = { .open = rs_open, @@ -203,7 +206,7 @@ static struct tty_operations serial_ops = { .chars_in_buffer = rs_chars_in_buffer, .hangup = rs_hangup, .wait_until_sent = rs_wait_until_sent, - .read_proc = rs_read_proc + .proc_fops = &rs_proc_fops, }; int __init rs_init(void) From 0f043a81ebe84be3576667f04fdda481609e3816 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 31 Mar 2009 15:19:25 -0700 Subject: [PATCH 308/478] proc tty: remove struct tty_operations::read_proc struct tty_operations::proc_fops took it's place and there is one less create_proc_read_entry() user now! Signed-off-by: Alexey Dobriyan Cc: Alan Cox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/isdn/capi/capi.c | 7 ------- fs/proc/proc_tty.c | 18 ++++-------------- include/linux/tty_driver.h | 2 -- net/bluetooth/rfcomm/tty.c | 6 ------ 4 files changed, 4 insertions(+), 29 deletions(-) diff --git a/drivers/isdn/capi/capi.c b/drivers/isdn/capi/capi.c index 3e468d2cf730..2d8352419c0d 100644 --- a/drivers/isdn/capi/capi.c +++ b/drivers/isdn/capi/capi.c @@ -1331,12 +1331,6 @@ static void capinc_tty_send_xchar(struct tty_struct *tty, char ch) #endif } -static int capinc_tty_read_proc(char *page, char **start, off_t off, - int count, int *eof, void *data) -{ - return 0; -} - static struct tty_driver *capinc_tty_driver; static const struct tty_operations capinc_ops = { @@ -1358,7 +1352,6 @@ static const struct tty_operations capinc_ops = { .flush_buffer = capinc_tty_flush_buffer, .set_ldisc = capinc_tty_set_ldisc, .send_xchar = capinc_tty_send_xchar, - .read_proc = capinc_tty_read_proc, }; static int capinc_tty_init(void) diff --git a/fs/proc/proc_tty.c b/fs/proc/proc_tty.c index 854827b1d463..83adcc869437 100644 --- a/fs/proc/proc_tty.c +++ b/fs/proc/proc_tty.c @@ -144,22 +144,12 @@ void proc_tty_register_driver(struct tty_driver *driver) { struct proc_dir_entry *ent; - if (!driver->driver_name || driver->proc_entry) + if (!driver->driver_name || driver->proc_entry || + !driver->ops->proc_fops) return; - if (driver->ops->proc_fops) { - ent = proc_create_data(driver->driver_name, 0, proc_tty_driver, - driver->ops->proc_fops, driver); - if (!ent) - return; - } else if (driver->ops->read_proc) { - ent = create_proc_entry(driver->driver_name, 0, proc_tty_driver); - if (!ent) - return; - ent->read_proc = driver->ops->read_proc; - ent->data = driver; - } else - return; + ent = proc_create_data(driver->driver_name, 0, proc_tty_driver, + driver->ops->proc_fops, driver); driver->proc_entry = ent; } diff --git a/include/linux/tty_driver.h b/include/linux/tty_driver.h index c9a69575ded6..8615d661ab60 100644 --- a/include/linux/tty_driver.h +++ b/include/linux/tty_driver.h @@ -252,8 +252,6 @@ struct tty_operations { void (*set_ldisc)(struct tty_struct *tty); void (*wait_until_sent)(struct tty_struct *tty, int timeout); void (*send_xchar)(struct tty_struct *tty, char ch); - int (*read_proc)(char *page, char **start, off_t off, - int count, int *eof, void *data); int (*tiocmget)(struct tty_struct *tty, struct file *file); int (*tiocmset)(struct tty_struct *tty, struct file *file, unsigned int set, unsigned int clear); diff --git a/net/bluetooth/rfcomm/tty.c b/net/bluetooth/rfcomm/tty.c index abdc703a11d2..cab71ea2796d 100644 --- a/net/bluetooth/rfcomm/tty.c +++ b/net/bluetooth/rfcomm/tty.c @@ -1093,11 +1093,6 @@ static void rfcomm_tty_hangup(struct tty_struct *tty) } } -static int rfcomm_tty_read_proc(char *buf, char **start, off_t offset, int len, int *eof, void *unused) -{ - return 0; -} - static int rfcomm_tty_tiocmget(struct tty_struct *tty, struct file *filp) { struct rfcomm_dev *dev = (struct rfcomm_dev *) tty->driver_data; @@ -1156,7 +1151,6 @@ static const struct tty_operations rfcomm_ops = { .send_xchar = rfcomm_tty_send_xchar, .hangup = rfcomm_tty_hangup, .wait_until_sent = rfcomm_tty_wait_until_sent, - .read_proc = rfcomm_tty_read_proc, .tiocmget = rfcomm_tty_tiocmget, .tiocmset = rfcomm_tty_tiocmset, }; From ef161a9863b045909142daea9490b067997f3dc5 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Tue, 31 Mar 2009 15:19:25 -0700 Subject: [PATCH 309/478] mm: mminit_validate_memmodel_limits(): remove redundant test In case if start_pfn overlap the upper bound no need to test end_pfn again since we have it already trimmed. Signed-off-by: Cyrill Gorcunov Reviewed-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/sparse.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/mm/sparse.c b/mm/sparse.c index 083f5b63e7a8..da432d9f0ae8 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -164,9 +164,7 @@ void __meminit mminit_validate_memmodel_limits(unsigned long *start_pfn, WARN_ON_ONCE(1); *start_pfn = max_sparsemem_pfn; *end_pfn = max_sparsemem_pfn; - } - - if (*end_pfn > max_sparsemem_pfn) { + } else if (*end_pfn > max_sparsemem_pfn) { mminit_dprintk(MMINIT_WARNING, "pfnvalidation", "End of range %lu -> %lu exceeds SPARSEMEM max %lu\n", *start_pfn, *end_pfn, max_sparsemem_pfn); From d086817dc0d42f1be8db4138233d33e1dd16a956 Mon Sep 17 00:00:00 2001 From: MinChan Kim Date: Tue, 31 Mar 2009 15:19:26 -0700 Subject: [PATCH 310/478] vmap: remove needless lock and list in vmap vmap's dirty_list is unused. It's for optimizing flushing. but Nick didn't write the code yet. so, we don't need it until time as it is needed. This patch removes vmap_block's dirty_list and codes related to it. Signed-off-by: MinChan Kim Acked-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 19 +++---------------- 1 file changed, 3 insertions(+), 16 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index af58324c361a..fab19876b4d1 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -671,10 +671,7 @@ struct vmap_block { DECLARE_BITMAP(alloc_map, VMAP_BBMAP_BITS); DECLARE_BITMAP(dirty_map, VMAP_BBMAP_BITS); union { - struct { - struct list_head free_list; - struct list_head dirty_list; - }; + struct list_head free_list; struct rcu_head rcu_head; }; }; @@ -741,7 +738,6 @@ static struct vmap_block *new_vmap_block(gfp_t gfp_mask) bitmap_zero(vb->alloc_map, VMAP_BBMAP_BITS); bitmap_zero(vb->dirty_map, VMAP_BBMAP_BITS); INIT_LIST_HEAD(&vb->free_list); - INIT_LIST_HEAD(&vb->dirty_list); vb_idx = addr_to_vb_idx(va->va_start); spin_lock(&vmap_block_tree_lock); @@ -772,12 +768,7 @@ static void free_vmap_block(struct vmap_block *vb) struct vmap_block *tmp; unsigned long vb_idx; - spin_lock(&vb->vbq->lock); - if (!list_empty(&vb->free_list)) - list_del(&vb->free_list); - if (!list_empty(&vb->dirty_list)) - list_del(&vb->dirty_list); - spin_unlock(&vb->vbq->lock); + BUG_ON(!list_empty(&vb->free_list)); vb_idx = addr_to_vb_idx(vb->va->va_start); spin_lock(&vmap_block_tree_lock); @@ -862,11 +853,7 @@ static void vb_free(const void *addr, unsigned long size) spin_lock(&vb->lock); bitmap_allocate_region(vb->dirty_map, offset >> PAGE_SHIFT, order); - if (!vb->dirty) { - spin_lock(&vb->vbq->lock); - list_add(&vb->dirty_list, &vb->vbq->dirty); - spin_unlock(&vb->vbq->lock); - } + vb->dirty += 1UL << order; if (vb->dirty == VMAP_BBMAP_BITS) { BUG_ON(vb->free || !list_empty(&vb->free_list)); From a12888f772dab4bf5e6f73668dc4f5f6026a7014 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Tue, 31 Mar 2009 15:19:27 -0700 Subject: [PATCH 311/478] oom_kill: don't call for int_sqrt(0) There is no need to call for int_sqrt if argument is 0. Signed-off-by: Cyrill Gorcunov Cc: Pekka Enberg Cc: Christoph Lameter Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 40ba05061a4f..d3b9bac085b5 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -55,7 +55,7 @@ static DEFINE_SPINLOCK(zone_scan_lock); unsigned long badness(struct task_struct *p, unsigned long uptime) { - unsigned long points, cpu_time, run_time, s; + unsigned long points, cpu_time, run_time; struct mm_struct *mm; struct task_struct *child; @@ -110,12 +110,10 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) else run_time = 0; - s = int_sqrt(cpu_time); - if (s) - points /= s; - s = int_sqrt(int_sqrt(run_time)); - if (s) - points /= s; + if (cpu_time) + points /= int_sqrt(cpu_time); + if (run_time) + points /= int_sqrt(int_sqrt(run_time)); /* * Niced processes are most likely less important, so double From 9de1581e75ba9d7979766d4ab6d56f0f2d87f7c6 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 31 Mar 2009 15:19:29 -0700 Subject: [PATCH 312/478] get_mm_hiwater_xxx: trivial, s/define/inline/ Andrew pointed out get_mm_hiwater_xxx() evaluate "mm" argument thrice/twice, make them inline. Signed-off-by: Oleg Nesterov Cc: Hugh Dickins Reviewed-by: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 29df6374d2de..481fad3a9b42 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -391,8 +391,15 @@ extern void arch_unmap_area_topdown(struct mm_struct *, unsigned long); (mm)->hiwater_vm = (mm)->total_vm; \ } while (0) -#define get_mm_hiwater_rss(mm) max((mm)->hiwater_rss, get_mm_rss(mm)) -#define get_mm_hiwater_vm(mm) max((mm)->hiwater_vm, (mm)->total_vm) +static inline unsigned long get_mm_hiwater_rss(struct mm_struct *mm) +{ + return max(mm->hiwater_rss, get_mm_rss(mm)); +} + +static inline unsigned long get_mm_hiwater_vm(struct mm_struct *mm) +{ + return max(mm->hiwater_vm, mm->total_vm); +} extern void set_dumpable(struct mm_struct *mm, int value); extern int get_dumpable(struct mm_struct *mm); From a6dc60f8975ad96d162915e07703a4439c80dcf0 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 31 Mar 2009 15:19:30 -0700 Subject: [PATCH 313/478] vmscan: rename sc.may_swap to may_unmap sc.may_swap does not only influence reclaiming of anon pages but pages mapped into pagetables in general, which also includes mapped file pages. In shrink_page_list(): if (!sc->may_swap && page_mapped(page)) goto keep_locked; For anon pages, this makes sense as they are always mapped and reclaiming them always requires swapping. But mapped file pages are skipped here as well and it has nothing to do with swapping. The real effect of the knob is whether mapped pages are unmapped and reclaimed or not. Rename it to `may_unmap' to have its name match its actual meaning more precisely. Signed-off-by: Johannes Weiner Reviewed-by: MinChan Kim Reviewed-by: KOSAKI Motohiro Cc: Lee Schermerhorn Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 479e46719394..1bca60f0c527 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -60,8 +60,8 @@ struct scan_control { int may_writepage; - /* Can pages be swapped as part of reclaim? */ - int may_swap; + /* Can mapped pages be reclaimed? */ + int may_unmap; /* This context's SWAP_CLUSTER_MAX. If freeing memory for * suspend, we effectively ignore SWAP_CLUSTER_MAX. @@ -606,7 +606,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, if (unlikely(!page_evictable(page, NULL))) goto cull_mlocked; - if (!sc->may_swap && page_mapped(page)) + if (!sc->may_unmap && page_mapped(page)) goto keep_locked; /* Double the slab pressure for mapped and swapcache pages */ @@ -1694,7 +1694,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, .gfp_mask = gfp_mask, .may_writepage = !laptop_mode, .swap_cluster_max = SWAP_CLUSTER_MAX, - .may_swap = 1, + .may_unmap = 1, .swappiness = vm_swappiness, .order = order, .mem_cgroup = NULL, @@ -1713,7 +1713,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, { struct scan_control sc = { .may_writepage = !laptop_mode, - .may_swap = 1, + .may_unmap = 1, .swap_cluster_max = SWAP_CLUSTER_MAX, .swappiness = swappiness, .order = 0, @@ -1723,7 +1723,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, struct zonelist *zonelist; if (noswap) - sc.may_swap = 0; + sc.may_unmap = 0; sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); @@ -1762,7 +1762,7 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order) struct reclaim_state *reclaim_state = current->reclaim_state; struct scan_control sc = { .gfp_mask = GFP_KERNEL, - .may_swap = 1, + .may_unmap = 1, .swap_cluster_max = SWAP_CLUSTER_MAX, .swappiness = vm_swappiness, .order = order, @@ -2110,7 +2110,7 @@ unsigned long shrink_all_memory(unsigned long nr_pages) struct reclaim_state reclaim_state; struct scan_control sc = { .gfp_mask = GFP_KERNEL, - .may_swap = 0, + .may_unmap = 0, .swap_cluster_max = nr_pages, .may_writepage = 1, .isolate_pages = isolate_pages_global, @@ -2147,7 +2147,7 @@ unsigned long shrink_all_memory(unsigned long nr_pages) /* Force reclaiming mapped pages in the passes #3 and #4 */ if (pass > 2) - sc.may_swap = 1; + sc.may_unmap = 1; for (prio = DEF_PRIORITY; prio >= 0; prio--) { unsigned long nr_to_scan = nr_pages - ret; @@ -2290,7 +2290,7 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) int priority; struct scan_control sc = { .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE), - .may_swap = !!(zone_reclaim_mode & RECLAIM_SWAP), + .may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP), .swap_cluster_max = max_t(unsigned long, nr_pages, SWAP_CLUSTER_MAX), .gfp_mask = gfp_mask, From ee99c71c59f897436ec65debb99372b3146f9985 Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Tue, 31 Mar 2009 15:19:31 -0700 Subject: [PATCH 314/478] mm: introduce for_each_populated_zone() macro Impact: cleanup In almost cases, for_each_zone() is used with populated_zone(). It's because almost function doesn't need memoryless node information. Therefore, for_each_populated_zone() can help to make code simplify. This patch has no functional change. [akpm@linux-foundation.org: small cleanup] Signed-off-by: KOSAKI Motohiro Cc: KAMEZAWA Hiroyuki Cc: Mel Gorman Reviewed-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 8 ++++++++ kernel/power/snapshot.c | 9 +++------ kernel/power/swsusp.c | 17 ++++++++--------- mm/page_alloc.c | 26 +++++--------------------- mm/vmscan.c | 4 +--- mm/vmstat.c | 11 ++--------- 6 files changed, 27 insertions(+), 48 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 1aca6cebbb78..26ef24076b76 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -806,6 +806,14 @@ extern struct zone *next_zone(struct zone *zone); zone; \ zone = next_zone(zone)) +#define for_each_populated_zone(zone) \ + for (zone = (first_online_pgdat())->node_zones; \ + zone; \ + zone = next_zone(zone)) \ + if (!populated_zone(zone)) \ + ; /* do nothing */ \ + else + static inline struct zone *zonelist_zone(struct zoneref *zoneref) { return zoneref->zone; diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index f5fc2d7680f2..33e2e4a819f9 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -321,13 +321,10 @@ static int create_mem_extents(struct list_head *list, gfp_t gfp_mask) INIT_LIST_HEAD(list); - for_each_zone(zone) { + for_each_populated_zone(zone) { unsigned long zone_start, zone_end; struct mem_extent *ext, *cur, *aux; - if (!populated_zone(zone)) - continue; - zone_start = zone->zone_start_pfn; zone_end = zone->zone_start_pfn + zone->spanned_pages; @@ -804,8 +801,8 @@ static unsigned int count_free_highmem_pages(void) struct zone *zone; unsigned int cnt = 0; - for_each_zone(zone) - if (populated_zone(zone) && is_highmem(zone)) + for_each_populated_zone(zone) + if (is_highmem(zone)) cnt += zone_page_state(zone, NR_FREE_PAGES); return cnt; diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c index a92c91451559..1ee6636414b2 100644 --- a/kernel/power/swsusp.c +++ b/kernel/power/swsusp.c @@ -229,17 +229,16 @@ int swsusp_shrink_memory(void) size = count_data_pages() + PAGES_FOR_IO + SPARE_PAGES; tmp = size; size += highmem_size; - for_each_zone (zone) - if (populated_zone(zone)) { - tmp += snapshot_additional_pages(zone); - if (is_highmem(zone)) { - highmem_size -= + for_each_populated_zone(zone) { + tmp += snapshot_additional_pages(zone); + if (is_highmem(zone)) { + highmem_size -= zone_page_state(zone, NR_FREE_PAGES); - } else { - tmp -= zone_page_state(zone, NR_FREE_PAGES); - tmp += zone->lowmem_reserve[ZONE_NORMAL]; - } + } else { + tmp -= zone_page_state(zone, NR_FREE_PAGES); + tmp += zone->lowmem_reserve[ZONE_NORMAL]; } + } if (highmem_size < 0) highmem_size = 0; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index a3803ea8c27d..cbd532161f68 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -922,13 +922,10 @@ static void drain_pages(unsigned int cpu) unsigned long flags; struct zone *zone; - for_each_zone(zone) { + for_each_populated_zone(zone) { struct per_cpu_pageset *pset; struct per_cpu_pages *pcp; - if (!populated_zone(zone)) - continue; - pset = zone_pcp(zone, cpu); pcp = &pset->pcp; @@ -1879,10 +1876,7 @@ void show_free_areas(void) int cpu; struct zone *zone; - for_each_zone(zone) { - if (!populated_zone(zone)) - continue; - + for_each_populated_zone(zone) { show_node(zone); printk("%s per-cpu:\n", zone->name); @@ -1922,12 +1916,9 @@ void show_free_areas(void) global_page_state(NR_PAGETABLE), global_page_state(NR_BOUNCE)); - for_each_zone(zone) { + for_each_populated_zone(zone) { int i; - if (!populated_zone(zone)) - continue; - show_node(zone); printk("%s" " free:%lukB" @@ -1967,12 +1958,9 @@ void show_free_areas(void) printk("\n"); } - for_each_zone(zone) { + for_each_populated_zone(zone) { unsigned long nr[MAX_ORDER], flags, order, total = 0; - if (!populated_zone(zone)) - continue; - show_node(zone); printk("%s: ", zone->name); @@ -2784,11 +2772,7 @@ static int __cpuinit process_zones(int cpu) node_set_state(node, N_CPU); /* this node has a cpu */ - for_each_zone(zone) { - - if (!populated_zone(zone)) - continue; - + for_each_populated_zone(zone) { zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset), GFP_KERNEL, node); if (!zone_pcp(zone, cpu)) diff --git a/mm/vmscan.c b/mm/vmscan.c index 1bca60f0c527..301f057fd115 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2061,11 +2061,9 @@ static unsigned long shrink_all_zones(unsigned long nr_pages, int prio, struct zone *zone; unsigned long ret = 0; - for_each_zone(zone) { + for_each_populated_zone(zone) { enum lru_list l; - if (!populated_zone(zone)) - continue; if (zone_is_all_unreclaimable(zone) && prio != DEF_PRIORITY) continue; diff --git a/mm/vmstat.c b/mm/vmstat.c index 8cd81ea1ddc1..9826766f1274 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -135,11 +135,7 @@ static void refresh_zone_stat_thresholds(void) int cpu; int threshold; - for_each_zone(zone) { - - if (!zone->present_pages) - continue; - + for_each_populated_zone(zone) { threshold = calculate_threshold(zone); for_each_online_cpu(cpu) @@ -301,12 +297,9 @@ void refresh_cpu_vm_stats(int cpu) int i; int global_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, }; - for_each_zone(zone) { + for_each_populated_zone(zone) { struct per_cpu_pageset *p; - if (!populated_zone(zone)) - continue; - p = zone_pcp(zone, cpu); for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) From 0a0dd05dd7e1a800241888cbf515bf8d3dc2e59c Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Tue, 31 Mar 2009 15:19:33 -0700 Subject: [PATCH 315/478] mm: don't call mark_page_accessed() in do_swap_page() commit bf3f3bc5e734706730c12a323f9b2068052aa1f0 (mm: don't mark_page_accessed in fault path) only remove the mark_page_accessed() in filemap_fault(). Therefore, swap-backed pages and file-backed pages have inconsistent behavior. mark_page_accessed() should be removed from do_swap_page(). Signed-off-by: KOSAKI Motohiro Cc: Nick Piggin Cc: Hugh Dickins Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 2032ad2fc34b..0017111214c5 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2435,8 +2435,6 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, count_vm_event(PGMAJFAULT); } - mark_page_accessed(page); - lock_page(page); delayacct_clear_flag(DELAYACCT_PF_SWAPIN); From d979677c4c02f0a72db5a03ecd8184bd9d6695c8 Mon Sep 17 00:00:00 2001 From: MinChan Kim Date: Tue, 31 Mar 2009 15:19:34 -0700 Subject: [PATCH 316/478] mm: shrink_all_memory(): use sc.nr_reclaimed Commit a79311c14eae4bb946a97af25f3e1b17d625985d "vmscan: bail out of direct reclaim after swap_cluster_max pages" moved the nr_reclaimed counter into the scan control to accumulate the number of all reclaimed pages in a reclaim invocation. shrink_all_memory() can use the same mechanism. it increase code consistency and redability. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: MinChan Kim Signed-off-by: KOSAKI Motohiro Signed-off-by: Johannes Weiner Cc: "Rafael J. Wysocki" Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 46 ++++++++++++++++++++++++---------------------- 1 file changed, 24 insertions(+), 22 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 301f057fd115..b15dcbb9e174 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2050,16 +2050,15 @@ unsigned long global_lru_pages(void) #ifdef CONFIG_PM /* * Helper function for shrink_all_memory(). Tries to reclaim 'nr_pages' pages - * from LRU lists system-wide, for given pass and priority, and returns the - * number of reclaimed pages + * from LRU lists system-wide, for given pass and priority. * * For pass > 3 we also try to shrink the LRU lists that contain a few pages */ -static unsigned long shrink_all_zones(unsigned long nr_pages, int prio, +static void shrink_all_zones(unsigned long nr_pages, int prio, int pass, struct scan_control *sc) { struct zone *zone; - unsigned long ret = 0; + unsigned long nr_reclaimed = 0; for_each_populated_zone(zone) { enum lru_list l; @@ -2082,14 +2081,16 @@ static unsigned long shrink_all_zones(unsigned long nr_pages, int prio, zone->lru[l].nr_scan = 0; nr_to_scan = min(nr_pages, lru_pages); - ret += shrink_list(l, nr_to_scan, zone, + nr_reclaimed += shrink_list(l, nr_to_scan, zone, sc, prio); - if (ret >= nr_pages) - return ret; + if (nr_reclaimed >= nr_pages) { + sc->nr_reclaimed = nr_reclaimed; + return; + } } } } - return ret; + sc->nr_reclaimed = nr_reclaimed; } /* @@ -2103,7 +2104,6 @@ static unsigned long shrink_all_zones(unsigned long nr_pages, int prio, unsigned long shrink_all_memory(unsigned long nr_pages) { unsigned long lru_pages, nr_slab; - unsigned long ret = 0; int pass; struct reclaim_state reclaim_state; struct scan_control sc = { @@ -2125,8 +2125,8 @@ unsigned long shrink_all_memory(unsigned long nr_pages) if (!reclaim_state.reclaimed_slab) break; - ret += reclaim_state.reclaimed_slab; - if (ret >= nr_pages) + sc.nr_reclaimed += reclaim_state.reclaimed_slab; + if (sc.nr_reclaimed >= nr_pages) goto out; nr_slab -= reclaim_state.reclaimed_slab; @@ -2148,18 +2148,18 @@ unsigned long shrink_all_memory(unsigned long nr_pages) sc.may_unmap = 1; for (prio = DEF_PRIORITY; prio >= 0; prio--) { - unsigned long nr_to_scan = nr_pages - ret; + unsigned long nr_to_scan = nr_pages - sc.nr_reclaimed; sc.nr_scanned = 0; - ret += shrink_all_zones(nr_to_scan, prio, pass, &sc); - if (ret >= nr_pages) + shrink_all_zones(nr_to_scan, prio, pass, &sc); + if (sc.nr_reclaimed >= nr_pages) goto out; reclaim_state.reclaimed_slab = 0; shrink_slab(sc.nr_scanned, sc.gfp_mask, global_lru_pages()); - ret += reclaim_state.reclaimed_slab; - if (ret >= nr_pages) + sc.nr_reclaimed += reclaim_state.reclaimed_slab; + if (sc.nr_reclaimed >= nr_pages) goto out; if (sc.nr_scanned && prio < DEF_PRIORITY - 2) @@ -2168,21 +2168,23 @@ unsigned long shrink_all_memory(unsigned long nr_pages) } /* - * If ret = 0, we could not shrink LRUs, but there may be something - * in slab caches + * If sc.nr_reclaimed = 0, we could not shrink LRUs, but there may be + * something in slab caches */ - if (!ret) { + if (!sc.nr_reclaimed) { do { reclaim_state.reclaimed_slab = 0; shrink_slab(nr_pages, sc.gfp_mask, global_lru_pages()); - ret += reclaim_state.reclaimed_slab; - } while (ret < nr_pages && reclaim_state.reclaimed_slab > 0); + sc.nr_reclaimed += reclaim_state.reclaimed_slab; + } while (sc.nr_reclaimed < nr_pages && + reclaim_state.reclaimed_slab > 0); } + out: current->reclaim_state = NULL; - return ret; + return sc.nr_reclaimed; } #endif From 9786bf841da57fac3457a1dac41acb4c1f2eced6 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 31 Mar 2009 15:19:35 -0700 Subject: [PATCH 317/478] vmscan: clip swap_cluster_max in shrink_all_memory() shrink_inactive_list() scans in sc->swap_cluster_max chunks until it hits the scan limit it was passed. shrink_inactive_list() { do { isolate_pages(swap_cluster_max) shrink_page_list() } while (nr_scanned < max_scan); } This assumes that swap_cluster_max is not bigger than the scan limit because the latter is checked only after at least one iteration. In shrink_all_memory() sc->swap_cluster_max is initialized to the overall reclaim goal in the beginning but not decreased while reclaim is making progress which leads to subsequent calls to shrink_inactive_list() reclaiming way too much in the one iteration that is done unconditionally. Set sc->swap_cluster_max always to the proper goal before doing shrink_all_zones() shrink_list() shrink_inactive_list(). While the current shrink_all_memory() happily reclaims more than actually requested, this patch fixes it to never exceed the goal: unpatched wanted=10000 reclaimed=13356 wanted=10000 reclaimed=19711 wanted=10000 reclaimed=10289 wanted=10000 reclaimed=17306 wanted=10000 reclaimed=10700 wanted=10000 reclaimed=10004 wanted=10000 reclaimed=13301 wanted=10000 reclaimed=10976 wanted=10000 reclaimed=10605 wanted=10000 reclaimed=10088 wanted=10000 reclaimed=15000 patched wanted=10000 reclaimed=10000 wanted=10000 reclaimed=9599 wanted=10000 reclaimed=8476 wanted=10000 reclaimed=8326 wanted=10000 reclaimed=10000 wanted=10000 reclaimed=10000 wanted=10000 reclaimed=9919 wanted=10000 reclaimed=10000 wanted=10000 reclaimed=10000 wanted=10000 reclaimed=10000 wanted=10000 reclaimed=10000 wanted=10000 reclaimed=9624 wanted=10000 reclaimed=10000 wanted=10000 reclaimed=10000 wanted=8500 reclaimed=8092 wanted=316 reclaimed=316 Signed-off-by: Johannes Weiner Reviewed-by: MinChan Kim Acked-by: Nigel Cunningham Acked-by: "Rafael J. Wysocki" Reviewed-by: KOSAKI Motohiro Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index b15dcbb9e174..9578063cd943 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2109,7 +2109,6 @@ unsigned long shrink_all_memory(unsigned long nr_pages) struct scan_control sc = { .gfp_mask = GFP_KERNEL, .may_unmap = 0, - .swap_cluster_max = nr_pages, .may_writepage = 1, .isolate_pages = isolate_pages_global, }; @@ -2151,6 +2150,7 @@ unsigned long shrink_all_memory(unsigned long nr_pages) unsigned long nr_to_scan = nr_pages - sc.nr_reclaimed; sc.nr_scanned = 0; + sc.swap_cluster_max = nr_to_scan; shrink_all_zones(nr_to_scan, prio, pass, &sc); if (sc.nr_reclaimed >= nr_pages) goto out; From bd775c42ea5f7c766d03a287083837cf05e7e738 Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Tue, 31 Mar 2009 15:19:37 -0700 Subject: [PATCH 318/478] mm: add comment why mark_page_accessed() would be better than pte_mkyoung() in follow_page() At first look, mark_page_accessed() in follow_page() seems a bit strange. It seems pte_mkyoung() would be better consistent with other kernel code. However, it is intentional. The commit log said: ------------------------------------------------ commit 9e45f61d69be9024a2e6bef3831fb04d90fac7a8 Author: akpm Date: Fri Aug 15 07:24:59 2003 +0000 [PATCH] Use mark_page_accessed() in follow_page() Touching a page via follow_page() counts as a reference so we should be either setting the referenced bit in the pte or running mark_page_accessed(). Altering the pte is tricky because we haven't implemented an atomic pte_mkyoung(). And mark_page_accessed() is better anyway because it has more aging state: it can move the page onto the active list. BKrev: 3f3c8acbplT8FbwBVGtth7QmnqWkIw ------------------------------------------------ The atomic issue is still true nowadays. adding comment help to understand code intention and it would be better. [akpm@linux-foundation.org: clarify text] Signed-off-by: KOSAKI Motohiro Signed-off-by: Hugh Dickins Cc: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/mm/memory.c b/mm/memory.c index 0017111214c5..5b4ad5e4f98d 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1151,6 +1151,11 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, if ((flags & FOLL_WRITE) && !pte_dirty(pte) && !PageDirty(page)) set_page_dirty(page); + /* + * pte_mkyoung() would be more correct here, but atomic care + * is needed to avoid losing the dirty bit: it is easier to use + * mark_page_accessed(). + */ mark_page_accessed(page); } unlock: From bd2f6199cf9af472aeefa1b642c9f504f19e6008 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 31 Mar 2009 15:19:38 -0700 Subject: [PATCH 319/478] vmscan: respect higher order in zone_reclaim() During page allocation, there are two stages of direct reclaim that are applied to each zone in the preferred list. The first stage using zone_reclaim() reclaims unmapped file backed pages and slab pages if over defined limits as these are cheaper to reclaim. The caller specifies the order of the target allocation but the scan control is not being correctly initialised. The impact is that the correct number of pages are being reclaimed but that lumpy reclaim is not being applied. This increases the chances of a full direct reclaim via try_to_free_pages() is required. This patch initialises the order field of the scan control as requested by the caller. [mel@csn.ul.ie: rewrote changelog] Signed-off-by: Johannes Weiner Acked-by: Mel Gorman Cc: Rik van Riel Cc: Andy Whitcroft Cc: KOSAKI Motohiro Cc: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 1 + 1 file changed, 1 insertion(+) diff --git a/mm/vmscan.c b/mm/vmscan.c index 9578063cd943..51f2df04d7cf 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2295,6 +2295,7 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) SWAP_CLUSTER_MAX), .gfp_mask = gfp_mask, .swappiness = vm_swappiness, + .order = order, .isolate_pages = isolate_pages_global, }; unsigned long slab_reclaimable; From e3a7cca1ef4c1af9b0acef9bd66eff6582a737b5 Mon Sep 17 00:00:00 2001 From: Edward Shishkin Date: Tue, 31 Mar 2009 15:19:39 -0700 Subject: [PATCH 320/478] vfs: add/use account_page_dirtied() Add a helper function account_page_dirtied(). Use that from two callsites. reiser4 adds a function which adds a third callsite. Signed-off-by: Edward Shishkin Cc: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/buffer.c | 9 +-------- include/linux/mm.h | 1 + mm/page-writeback.c | 22 +++++++++++++++------- 3 files changed, 17 insertions(+), 15 deletions(-) diff --git a/fs/buffer.c b/fs/buffer.c index a2fd743d97cb..73abe6d8218c 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -621,14 +621,7 @@ static void __set_page_dirty(struct page *page, spin_lock_irq(&mapping->tree_lock); if (page->mapping) { /* Race with truncate? */ WARN_ON_ONCE(warn && !PageUptodate(page)); - - if (mapping_cap_account_dirty(mapping)) { - __inc_zone_page_state(page, NR_FILE_DIRTY); - __inc_bdi_stat(mapping->backing_dev_info, - BDI_RECLAIMABLE); - task_dirty_inc(current); - task_io_account_write(PAGE_CACHE_SIZE); - } + account_page_dirtied(page, mapping); radix_tree_tag_set(&mapping->page_tree, page_index(page), PAGECACHE_TAG_DIRTY); } diff --git a/include/linux/mm.h b/include/linux/mm.h index b1ea37fc7a24..2223f8dfa568 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -834,6 +834,7 @@ int __set_page_dirty_nobuffers(struct page *page); int __set_page_dirty_no_writeback(struct page *page); int redirty_page_for_writepage(struct writeback_control *wbc, struct page *page); +void account_page_dirtied(struct page *page, struct address_space *mapping); int set_page_dirty(struct page *page); int set_page_dirty_lock(struct page *page); int clear_page_dirty_for_io(struct page *page); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 40ca7cdb653e..6aa92b03c747 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -1197,6 +1197,20 @@ int __set_page_dirty_no_writeback(struct page *page) return 0; } +/* + * Helper function for set_page_dirty family. + * NOTE: This relies on being atomic wrt interrupts. + */ +void account_page_dirtied(struct page *page, struct address_space *mapping) +{ + if (mapping_cap_account_dirty(mapping)) { + __inc_zone_page_state(page, NR_FILE_DIRTY); + __inc_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE); + task_dirty_inc(current); + task_io_account_write(PAGE_CACHE_SIZE); + } +} + /* * For address_spaces which do not use buffers. Just tag the page as dirty in * its radix tree. @@ -1226,13 +1240,7 @@ int __set_page_dirty_nobuffers(struct page *page) if (mapping2) { /* Race with truncate? */ BUG_ON(mapping2 != mapping); WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page)); - if (mapping_cap_account_dirty(mapping)) { - __inc_zone_page_state(page, NR_FILE_DIRTY); - __inc_bdi_stat(mapping->backing_dev_info, - BDI_RECLAIMABLE); - task_dirty_inc(current); - task_io_account_write(PAGE_CACHE_SIZE); - } + account_page_dirtied(page, mapping); radix_tree_tag_set(&mapping->page_tree, page_index(page), PAGECACHE_TAG_DIRTY); } From 8a0bdec194c21c8fdef840989d0d7b742bb5d4bc Mon Sep 17 00:00:00 2001 From: Ravikiran G Thirumalai Date: Tue, 31 Mar 2009 15:19:40 -0700 Subject: [PATCH 321/478] mm: fix SHM_HUGETLB to work with users in hugetlb_shm_group Fix hugetlb subsystem so that non root users belonging to hugetlb_shm_group can actually allocate hugetlb backed shm. Currently non root users cannot even map one large page using SHM_HUGETLB when they belong to the gid in /proc/sys/vm/hugetlb_shm_group. This is because allocation size is verified against RLIMIT_MEMLOCK resource limit even if the user belongs to hugetlb_shm_group. This patch 1. Fixes hugetlb subsystem so that users with CAP_IPC_LOCK and users belonging to hugetlb_shm_group don't need to be restricted with RLIMIT_MEMLOCK resource limits 2. This patch also disables mlock based rlimit checking (which will be reinstated and marked deprecated in a subsequent patch). Signed-off-by: Ravikiran Thirumalai Reviewed-by: Mel Gorman Cc: William Lee Irwin III Cc: Adam Litke Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hugetlbfs/inode.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 9b800d97a687..bc56df8ce001 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -943,9 +943,7 @@ static struct vfsmount *hugetlbfs_vfsmount; static int can_do_hugetlb_shm(void) { - return likely(capable(CAP_IPC_LOCK) || - in_group_p(sysctl_hugetlb_shm_group) || - can_do_mlock()); + return capable(CAP_IPC_LOCK) || in_group_p(sysctl_hugetlb_shm_group); } struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag) @@ -963,9 +961,6 @@ struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag) if (!can_do_hugetlb_shm()) return ERR_PTR(-EPERM); - if (!user_shm_lock(size, user)) - return ERR_PTR(-ENOMEM); - root = hugetlbfs_vfsmount->mnt_root; quick_string.name = name; quick_string.len = strlen(quick_string.name); @@ -1004,7 +999,6 @@ out_inode: out_dentry: dput(dentry); out_shm_unlock: - user_shm_unlock(size, user); return ERR_PTR(error); } From 2584e517320bd48dc8d20e38a2621a2dbe58fade Mon Sep 17 00:00:00 2001 From: Ravikiran G Thirumalai Date: Tue, 31 Mar 2009 15:21:26 -0700 Subject: [PATCH 322/478] mm: reintroduce and deprecate rlimit based access for SHM_HUGETLB Allow non root users with sufficient mlock rlimits to be able to allocate hugetlb backed shm for now. Deprecate this though. This is being deprecated because the mlock based rlimit checks for SHM_HUGETLB is not consistent with mmap based huge page allocations. Signed-off-by: Ravikiran Thirumalai Reviewed-by: Mel Gorman Cc: William Lee Irwin III Cc: Adam Litke Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/feature-removal-schedule.txt | 12 ++++++++++++ fs/hugetlbfs/inode.c | 13 +++++++++++-- 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt index 5e02b83ac12b..ea7d1bdad34d 100644 --- a/Documentation/feature-removal-schedule.txt +++ b/Documentation/feature-removal-schedule.txt @@ -311,6 +311,18 @@ Who: Vlad Yasevich --------------------------- +What: Ability for non root users to shm_get hugetlb pages based on mlock + resource limits +When: 2.6.31 +Why: Non root users need to be part of /proc/sys/vm/hugetlb_shm_group or + have CAP_IPC_LOCK to be able to allocate shm segments backed by + huge pages. The mlock based rlimit check to allow shm hugetlb is + inconsistent with mmap based allocations. Hence it is being + deprecated. +Who: Ravikiran Thirumalai + +--------------------------- + What: CONFIG_THERMAL_HWMON When: January 2009 Why: This option was introduced just to allow older lm-sensors userspace diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index bc56df8ce001..23a3c76711e0 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -949,6 +949,7 @@ static int can_do_hugetlb_shm(void) struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag) { int error = -ENOMEM; + int unlock_shm = 0; struct file *file; struct inode *inode; struct dentry *dentry, *root; @@ -958,8 +959,14 @@ struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag) if (!hugetlbfs_vfsmount) return ERR_PTR(-ENOENT); - if (!can_do_hugetlb_shm()) - return ERR_PTR(-EPERM); + if (!can_do_hugetlb_shm()) { + if (user_shm_lock(size, user)) { + unlock_shm = 1; + WARN_ONCE(1, + "Using mlock ulimits for SHM_HUGETLB deprecated\n"); + } else + return ERR_PTR(-EPERM); + } root = hugetlbfs_vfsmount->mnt_root; quick_string.name = name; @@ -999,6 +1006,8 @@ out_inode: out_dentry: dput(dentry); out_shm_unlock: + if (unlock_shm) + user_shm_unlock(size, user); return ERR_PTR(error); } From 2443462b0a04ef0f82ad48f4fd0ef4ac5b24c4b7 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 31 Mar 2009 15:23:12 -0700 Subject: [PATCH 323/478] mm: move pagevec stripping to save unlock-relock In shrink_active_list() after the deactivation loop, we strip buffer heads from the potentially remaining pages in the pagevec. Currently, this drops the zone's lru lock for stripping, only to reacquire it again afterwards to update statistics. It is not necessary to strip the pages before updating the stats, so move the whole thing out of the protected region and save the extra locking. Signed-off-by: Johannes Weiner Reviewed-by: MinChan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 51f2df04d7cf..988aef933016 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1298,14 +1298,11 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, } __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved); pgdeactivate += pgmoved; - if (buffer_heads_over_limit) { - spin_unlock_irq(&zone->lru_lock); - pagevec_strip(&pvec); - spin_lock_irq(&zone->lru_lock); - } __count_zone_vm_events(PGREFILL, zone, pgscanned); __count_vm_events(PGDEACTIVATE, pgdeactivate); spin_unlock_irq(&zone->lru_lock); + if (buffer_heads_over_limit) + pagevec_strip(&pvec); if (vm_swap_full()) pagevec_swap_free(&pvec); From ad1c3544d0a85da7738ce8cff6f8a148da57935c Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 31 Mar 2009 15:23:13 -0700 Subject: [PATCH 324/478] mm: don't free swap slots on page deactivation The pagevec_swap_free() at the end of shrink_active_list() was introduced in 68a22394 "vmscan: free swap space on swap-in/activation" when shrink_active_list() was still rotating referenced active pages. In 7e9cd48 "vmscan: fix pagecache reclaim referenced bit check" this was changed, the rotating removed but the pagevec_swap_free() after the rotation loop was forgotten, applying now to the pagevec of the deactivation loop instead. Now swap space is freed for deactivated pages. And only for those that happen to be on the pagevec after the deactivation loop. Complete 7e9cd48 and remove the rest of the swap freeing. Signed-off-by: Johannes Weiner Acked-by: Rik van Riel Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 988aef933016..e70fae31e968 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1303,9 +1303,6 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, spin_unlock_irq(&zone->lru_lock); if (buffer_heads_over_limit) pagevec_strip(&pvec); - if (vm_swap_full()) - pagevec_swap_free(&pvec); - pagevec_release(&pvec); } From d1d7487173eab8352125cf6cc271940f24254bd4 Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Tue, 31 Mar 2009 15:23:14 -0700 Subject: [PATCH 325/478] mm: remove pagevec_swap_free() pagevec_swap_free() is now unused. Signed-off-by: KOSAKI Motohiro Cc: Johannes Weiner Cc: Rik van Riel Acked-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pagevec.h | 1 - mm/swap.c | 23 ----------------------- 2 files changed, 24 deletions(-) diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h index 7b2886fa7fdc..bab82f4c571c 100644 --- a/include/linux/pagevec.h +++ b/include/linux/pagevec.h @@ -24,7 +24,6 @@ void __pagevec_release(struct pagevec *pvec); void __pagevec_free(struct pagevec *pvec); void ____pagevec_lru_add(struct pagevec *pvec, enum lru_list lru); void pagevec_strip(struct pagevec *pvec); -void pagevec_swap_free(struct pagevec *pvec); unsigned pagevec_lookup(struct pagevec *pvec, struct address_space *mapping, pgoff_t start, unsigned nr_pages); unsigned pagevec_lookup_tag(struct pagevec *pvec, diff --git a/mm/swap.c b/mm/swap.c index 8adb9feb61e1..6e83084c1f6c 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -456,29 +456,6 @@ void pagevec_strip(struct pagevec *pvec) } } -/** - * pagevec_swap_free - try to free swap space from the pages in a pagevec - * @pvec: pagevec with swapcache pages to free the swap space of - * - * The caller needs to hold an extra reference to each page and - * not hold the page lock on the pages. This function uses a - * trylock on the page lock so it may not always free the swap - * space associated with a page. - */ -void pagevec_swap_free(struct pagevec *pvec) -{ - int i; - - for (i = 0; i < pagevec_count(pvec); i++) { - struct page *page = pvec->pages[i]; - - if (PageSwapCache(page) && trylock_page(page)) { - try_to_free_swap(page); - unlock_page(page); - } - } -} - /** * pagevec_lookup - gang pagecache lookup * @pvec: Where the resulting pages are placed From e2f17d9459aeccf4e013e31cbd741d6b1858eec4 Mon Sep 17 00:00:00 2001 From: Roel Kluin Date: Tue, 31 Mar 2009 15:23:15 -0700 Subject: [PATCH 326/478] hugetlb: chg cannot become less than 0 chg is unsigned, so it cannot be less than 0. Also, since region_chg returns long, let vma_needs_reservation() forward this to alloc_huge_page(). Store it as long as well. all callers cast it to long anyway. Signed-off-by: Roel Kluin Cc: Andy Whitcroft Cc: Mel Gorman Cc: Adam Litke Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 107da3d809a8..28c655ba9353 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -918,7 +918,7 @@ static void return_unused_surplus_pages(struct hstate *h, * an instantiated the change should be committed via vma_commit_reservation. * No action is required on failure. */ -static int vma_needs_reservation(struct hstate *h, +static long vma_needs_reservation(struct hstate *h, struct vm_area_struct *vma, unsigned long addr) { struct address_space *mapping = vma->vm_file->f_mapping; @@ -933,7 +933,7 @@ static int vma_needs_reservation(struct hstate *h, return 1; } else { - int err; + long err; pgoff_t idx = vma_hugecache_offset(h, vma, addr); struct resv_map *reservations = vma_resv_map(vma); @@ -969,7 +969,7 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, struct page *page; struct address_space *mapping = vma->vm_file->f_mapping; struct inode *inode = mapping->host; - unsigned int chg; + long chg; /* * Processes that did not create the mapping will have no reserves and From 610a77e04a8d9fe8764dc484e2182fa251ce1cc2 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 31 Mar 2009 15:23:16 -0700 Subject: [PATCH 327/478] memdup_user(): introduce I notice there are many places doing copy_from_user() which follows kmalloc(): dst = kmalloc(len, GFP_KERNEL); if (!dst) return -ENOMEM; if (copy_from_user(dst, src, len)) { kfree(dst); return -EFAULT } memdup_user() is a wrapper of the above code. With this new function, we don't have to write 'len' twice, which can lead to typos/mistakes. It also produces smaller code and kernel text. A quick grep shows 250+ places where memdup_user() *may* be used. I'll prepare a patchset to do this conversion. Signed-off-by: Li Zefan Cc: KOSAKI Motohiro Cc: Americo Wang Cc: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/string.h | 1 + mm/util.c | 30 ++++++++++++++++++++++++++++++ 2 files changed, 31 insertions(+) diff --git a/include/linux/string.h b/include/linux/string.h index d18fc198aa2f..8852739f36df 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -12,6 +12,7 @@ #include /* for NULL */ extern char *strndup_user(const char __user *, long); +extern void *memdup_user(const void __user *, size_t); /* * Include machine specific inline routines diff --git a/mm/util.c b/mm/util.c index 37eaccdf3054..7c122e49f769 100644 --- a/mm/util.c +++ b/mm/util.c @@ -69,6 +69,36 @@ void *kmemdup(const void *src, size_t len, gfp_t gfp) } EXPORT_SYMBOL(kmemdup); +/** + * memdup_user - duplicate memory region from user space + * + * @src: source address in user space + * @len: number of bytes to copy + * + * Returns an ERR_PTR() on failure. + */ +void *memdup_user(const void __user *src, size_t len) +{ + void *p; + + /* + * Always use GFP_KERNEL, since copy_from_user() can sleep and + * cause pagefault, which makes it pointless to use GFP_NOFS + * or GFP_ATOMIC. + */ + p = kmalloc_track_caller(len, GFP_KERNEL); + if (!p) + return ERR_PTR(-ENOMEM); + + if (copy_from_user(p, src, len)) { + kfree(p); + return ERR_PTR(-EFAULT); + } + + return p; +} +EXPORT_SYMBOL(memdup_user); + /** * __krealloc - like krealloc() but don't free @p. * @p: object to reallocate memory for. From 6a11f75b6a17b5d9ac5025f8d048382fd1f47377 Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Tue, 31 Mar 2009 15:23:17 -0700 Subject: [PATCH 328/478] generic debug pagealloc CONFIG_DEBUG_PAGEALLOC is now supported by x86, powerpc, sparc64, and s390. This patch implements it for the rest of the architectures by filling the pages with poison byte patterns after free_pages() and verifying the poison patterns before alloc_pages(). This generic one cannot detect invalid page accesses immediately but invalid read access may cause invalid dereference by poisoned memory and invalid write access can be detected after a long delay. Signed-off-by: Akinobu Mita Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/avr32/mm/fault.c | 18 ----- arch/powerpc/Kconfig | 3 + arch/powerpc/Kconfig.debug | 1 + arch/s390/Kconfig | 3 + arch/s390/Kconfig.debug | 1 + arch/sparc/Kconfig | 3 + arch/sparc/Kconfig.debug | 3 +- arch/x86/Kconfig | 3 + arch/x86/Kconfig.debug | 1 + include/linux/mm_types.h | 4 + include/linux/page-debug-flags.h | 30 +++++++ include/linux/poison.h | 3 + lib/Kconfig.debug | 1 + mm/Kconfig.debug | 17 ++++ mm/Makefile | 1 + mm/debug-pagealloc.c | 129 +++++++++++++++++++++++++++++++ 16 files changed, 202 insertions(+), 19 deletions(-) create mode 100644 include/linux/page-debug-flags.h create mode 100644 mm/Kconfig.debug create mode 100644 mm/debug-pagealloc.c diff --git a/arch/avr32/mm/fault.c b/arch/avr32/mm/fault.c index ce4e4296b954..62d4abbaa654 100644 --- a/arch/avr32/mm/fault.c +++ b/arch/avr32/mm/fault.c @@ -250,21 +250,3 @@ asmlinkage void do_bus_error(unsigned long addr, int write_access, dump_dtlb(); die("Bus Error", regs, SIGKILL); } - -/* - * This functionality is currently not possible to implement because - * we're using segmentation to ensure a fixed mapping of the kernel - * virtual address space. - * - * It would be possible to implement this, but it would require us to - * disable segmentation at startup and load the kernel mappings into - * the TLB like any other pages. There will be lots of trickery to - * avoid recursive invocation of the TLB miss handler, though... - */ -#ifdef CONFIG_DEBUG_PAGEALLOC -void kernel_map_pages(struct page *page, int numpages, int enable) -{ - -} -EXPORT_SYMBOL(kernel_map_pages); -#endif diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index ad6b1c084fe3..45192dce65c4 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -228,6 +228,9 @@ config PPC_OF_PLATFORM_PCI depends on PPC64 # not supported on 32 bits yet default n +config ARCH_SUPPORTS_DEBUG_PAGEALLOC + def_bool y + source "init/Kconfig" source "kernel/Kconfig.freezer" diff --git a/arch/powerpc/Kconfig.debug b/arch/powerpc/Kconfig.debug index 22091bbfdc9b..6aa0b5e087cd 100644 --- a/arch/powerpc/Kconfig.debug +++ b/arch/powerpc/Kconfig.debug @@ -30,6 +30,7 @@ config DEBUG_STACK_USAGE config DEBUG_PAGEALLOC bool "Debug page memory allocations" depends on DEBUG_KERNEL && !HIBERNATION + depends on ARCH_SUPPORTS_DEBUG_PAGEALLOC help Unmap pages from the kernel linear mapping after free_pages(). This results in a large slowdown, but helps to find certain types diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 2a8af5e16345..dcb667c4375a 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -72,6 +72,9 @@ config PGSTE config VIRT_CPU_ACCOUNTING def_bool y +config ARCH_SUPPORTS_DEBUG_PAGEALLOC + def_bool y + mainmenu "Linux Kernel Configuration" config S390 diff --git a/arch/s390/Kconfig.debug b/arch/s390/Kconfig.debug index 4599fa06bd82..7e297a3cde34 100644 --- a/arch/s390/Kconfig.debug +++ b/arch/s390/Kconfig.debug @@ -9,6 +9,7 @@ source "lib/Kconfig.debug" config DEBUG_PAGEALLOC bool "Debug page memory allocations" depends on DEBUG_KERNEL + depends on ARCH_SUPPORTS_DEBUG_PAGEALLOC help Unmap pages from the kernel linear mapping after free_pages(). This results in a slowdown, but helps to find certain types of diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index c3ea215334f6..cc12cd48bbc5 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -124,6 +124,9 @@ config ARCH_NO_VIRT_TO_BUS config OF def_bool y +config ARCH_SUPPORTS_DEBUG_PAGEALLOC + def_bool y if SPARC64 + source "init/Kconfig" source "kernel/Kconfig.freezer" diff --git a/arch/sparc/Kconfig.debug b/arch/sparc/Kconfig.debug index b8a15e271bfa..d001b42041a5 100644 --- a/arch/sparc/Kconfig.debug +++ b/arch/sparc/Kconfig.debug @@ -24,7 +24,8 @@ config STACK_DEBUG config DEBUG_PAGEALLOC bool "Debug page memory allocations" - depends on SPARC64 && DEBUG_KERNEL && !HIBERNATION + depends on DEBUG_KERNEL && !HIBERNATION + depends on ARCH_SUPPORTS_DEBUG_PAGEALLOC help Unmap pages from the kernel linear mapping after free_pages(). This results in a large slowdown, but helps to find certain types diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 45161b816313..748e50a1a152 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -165,6 +165,9 @@ config AUDIT_ARCH config ARCH_SUPPORTS_OPTIMIZED_INLINING def_bool y +config ARCH_SUPPORTS_DEBUG_PAGEALLOC + def_bool y + # Use the generic interrupt handling code in kernel/irq/: config GENERIC_HARDIRQS bool diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index fdb45df608b6..a345cb5447a8 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -75,6 +75,7 @@ config DEBUG_STACK_USAGE config DEBUG_PAGEALLOC bool "Debug page memory allocations" depends on DEBUG_KERNEL + depends on ARCH_SUPPORTS_DEBUG_PAGEALLOC ---help--- Unmap pages from the kernel linear mapping after free_pages(). This results in a large slowdown, but helps to find certain types diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index d84feb7bdbf0..ddadb4defe00 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -11,6 +11,7 @@ #include #include #include +#include #include #include @@ -174,6 +175,9 @@ struct vm_area_struct { #ifdef CONFIG_NUMA struct mempolicy *vm_policy; /* NUMA policy for the VMA */ #endif +#ifdef CONFIG_WANT_PAGE_DEBUG_FLAGS + unsigned long debug_flags; /* Use atomic bitops on this */ +#endif }; struct core_thread { diff --git a/include/linux/page-debug-flags.h b/include/linux/page-debug-flags.h new file mode 100644 index 000000000000..b0638fd91e92 --- /dev/null +++ b/include/linux/page-debug-flags.h @@ -0,0 +1,30 @@ +#ifndef LINUX_PAGE_DEBUG_FLAGS_H +#define LINUX_PAGE_DEBUG_FLAGS_H + +/* + * page->debug_flags bits: + * + * PAGE_DEBUG_FLAG_POISON is set for poisoned pages. This is used to + * implement generic debug pagealloc feature. The pages are filled with + * poison patterns and set this flag after free_pages(). The poisoned + * pages are verified whether the patterns are not corrupted and clear + * the flag before alloc_pages(). + */ + +enum page_debug_flags { + PAGE_DEBUG_FLAG_POISON, /* Page is poisoned */ +}; + +/* + * Ensure that CONFIG_WANT_PAGE_DEBUG_FLAGS reliably + * gets turned off when no debug features are enabling it! + */ + +#ifdef CONFIG_WANT_PAGE_DEBUG_FLAGS +#if !defined(CONFIG_PAGE_POISONING) \ +/* && !defined(CONFIG_PAGE_DEBUG_SOMETHING_ELSE) && ... */ +#error WANT_PAGE_DEBUG_FLAGS is turned on with no debug features! +#endif +#endif /* CONFIG_WANT_PAGE_DEBUG_FLAGS */ + +#endif /* LINUX_PAGE_DEBUG_FLAGS_H */ diff --git a/include/linux/poison.h b/include/linux/poison.h index 9f31683728fd..6729f7dcd60e 100644 --- a/include/linux/poison.h +++ b/include/linux/poison.h @@ -17,6 +17,9 @@ */ #define TIMER_ENTRY_STATIC ((void *) 0x74737461) +/********** mm/debug-pagealloc.c **********/ +#define PAGE_POISON 0xaa + /********** mm/slab.c **********/ /* * Magic nums for obj red zoning. diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 58bfe7e8faba..9638d99644af 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -796,6 +796,7 @@ config SYSCTL_SYSCALL_CHECK to properly maintain and use. This enables checks that help you to keep things correct. +source mm/Kconfig.debug source kernel/trace/Kconfig config PROVIDE_OHCI1394_DMA_INIT diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug new file mode 100644 index 000000000000..c8d62d49a44e --- /dev/null +++ b/mm/Kconfig.debug @@ -0,0 +1,17 @@ +config WANT_PAGE_DEBUG_FLAGS + bool + +config PAGE_POISONING + bool "Debug page memory allocations" + depends on DEBUG_KERNEL && !ARCH_SUPPORTS_DEBUG_PAGEALLOC + depends on !HIBERNATION + select DEBUG_PAGEALLOC + select WANT_PAGE_DEBUG_FLAGS + help + Fill the pages with poison patterns after free_pages() and verify + the patterns before alloc_pages(). This results in a large slowdown, + but helps to find certain types of memory corruptions. + + This option cannot enalbe with hibernation. Otherwise, it will get + wrong messages for memory corruption because the free pages are not + saved to the suspend image. diff --git a/mm/Makefile b/mm/Makefile index 818569b68f46..ec73c68b6015 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -24,6 +24,7 @@ obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o obj-$(CONFIG_TMPFS_POSIX_ACL) += shmem_acl.o obj-$(CONFIG_SLOB) += slob.o obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o +obj-$(CONFIG_PAGE_POISONING) += debug-pagealloc.o obj-$(CONFIG_SLAB) += slab.o obj-$(CONFIG_SLUB) += slub.o obj-$(CONFIG_FAILSLAB) += failslab.o diff --git a/mm/debug-pagealloc.c b/mm/debug-pagealloc.c new file mode 100644 index 000000000000..a1e3324de2b5 --- /dev/null +++ b/mm/debug-pagealloc.c @@ -0,0 +1,129 @@ +#include +#include +#include +#include + +static inline void set_page_poison(struct page *page) +{ + __set_bit(PAGE_DEBUG_FLAG_POISON, &page->debug_flags); +} + +static inline void clear_page_poison(struct page *page) +{ + __clear_bit(PAGE_DEBUG_FLAG_POISON, &page->debug_flags); +} + +static inline bool page_poison(struct page *page) +{ + return test_bit(PAGE_DEBUG_FLAG_POISON, &page->debug_flags); +} + +static void poison_highpage(struct page *page) +{ + /* + * Page poisoning for highmem pages is not implemented. + * + * This can be called from interrupt contexts. + * So we need to create a new kmap_atomic slot for this + * application and it will need interrupt protection. + */ +} + +static void poison_page(struct page *page) +{ + void *addr; + + if (PageHighMem(page)) { + poison_highpage(page); + return; + } + set_page_poison(page); + addr = page_address(page); + memset(addr, PAGE_POISON, PAGE_SIZE); +} + +static void poison_pages(struct page *page, int n) +{ + int i; + + for (i = 0; i < n; i++) + poison_page(page + i); +} + +static bool single_bit_flip(unsigned char a, unsigned char b) +{ + unsigned char error = a ^ b; + + return error && !(error & (error - 1)); +} + +static void check_poison_mem(unsigned char *mem, size_t bytes) +{ + unsigned char *start; + unsigned char *end; + + for (start = mem; start < mem + bytes; start++) { + if (*start != PAGE_POISON) + break; + } + if (start == mem + bytes) + return; + + for (end = mem + bytes - 1; end > start; end--) { + if (*end != PAGE_POISON) + break; + } + + if (!printk_ratelimit()) + return; + else if (start == end && single_bit_flip(*start, PAGE_POISON)) + printk(KERN_ERR "pagealloc: single bit error\n"); + else + printk(KERN_ERR "pagealloc: memory corruption\n"); + + print_hex_dump(KERN_ERR, "", DUMP_PREFIX_ADDRESS, 16, 1, start, + end - start + 1, 1); + dump_stack(); +} + +static void unpoison_highpage(struct page *page) +{ + /* + * See comment in poison_highpage(). + * Highmem pages should not be poisoned for now + */ + BUG_ON(page_poison(page)); +} + +static void unpoison_page(struct page *page) +{ + if (PageHighMem(page)) { + unpoison_highpage(page); + return; + } + if (page_poison(page)) { + void *addr = page_address(page); + + check_poison_mem(addr, PAGE_SIZE); + clear_page_poison(page); + } +} + +static void unpoison_pages(struct page *page, int n) +{ + int i; + + for (i = 0; i < n; i++) + unpoison_page(page + i); +} + +void kernel_map_pages(struct page *page, int numpages, int enable) +{ + if (!debug_pagealloc_enabled) + return; + + if (enable) + unpoison_pages(page, numpages); + else + poison_pages(page, numpages); +} From 704503d836042d4a4c7685b7036e7de0418fbc0f Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 31 Mar 2009 15:23:18 -0700 Subject: [PATCH 329/478] mm: fix proc_dointvec_userhz_jiffies "breakage" Addresses http://bugzilla.kernel.org/show_bug.cgi?id=9838 On i386, HZ=1000, jiffies_to_clock_t() converts time in a somewhat strange way from the user's point of view: # echo 500 >/proc/sys/vm/dirty_writeback_centisecs # cat /proc/sys/vm/dirty_writeback_centisecs 499 So, we have 5000 jiffies converted to only 499 clock ticks and reported back. TICK_NSEC = 999848 ACTHZ = 256039 Keeping in-kernel variable in units passed from userspace will fix issue of course, but this probably won't be right for every sysctl. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Alexey Dobriyan Cc: Peter Zijlstra Cc: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/writeback.h | 4 ++-- kernel/sysctl.c | 2 +- mm/page-writeback.c | 20 +++++++++++--------- 3 files changed, 14 insertions(+), 12 deletions(-) diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 7300ecdc480c..93445477f86a 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -109,8 +109,8 @@ extern int dirty_background_ratio; extern unsigned long dirty_background_bytes; extern int vm_dirty_ratio; extern unsigned long vm_dirty_bytes; -extern int dirty_writeback_interval; -extern int dirty_expire_interval; +extern unsigned int dirty_writeback_interval; +extern unsigned int dirty_expire_interval; extern int vm_highmem_is_dirtyable; extern int block_dump; extern int laptop_mode; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index c5ef44ff850f..2e490a389dd2 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1010,7 +1010,7 @@ static struct ctl_table vm_table[] = { .data = &dirty_expire_interval, .maxlen = sizeof(dirty_expire_interval), .mode = 0644, - .proc_handler = &proc_dointvec_userhz_jiffies, + .proc_handler = &proc_dointvec, }, { .ctl_name = VM_NR_PDFLUSH_THREADS, diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 6aa92b03c747..30351f0063ac 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -92,14 +92,14 @@ int vm_dirty_ratio = 20; unsigned long vm_dirty_bytes; /* - * The interval between `kupdate'-style writebacks, in jiffies + * The interval between `kupdate'-style writebacks */ -int dirty_writeback_interval = 5 * HZ; +unsigned int dirty_writeback_interval = 5 * 100; /* sentiseconds */ /* - * The longest number of jiffies for which data is allowed to remain dirty + * The longest time for which data is allowed to remain dirty */ -int dirty_expire_interval = 30 * HZ; +unsigned int dirty_expire_interval = 30 * 100; /* sentiseconds */ /* * Flag that makes the machine dump writes/reads and block dirtyings. @@ -770,9 +770,9 @@ static void wb_kupdate(unsigned long arg) sync_supers(); - oldest_jif = jiffies - dirty_expire_interval; + oldest_jif = jiffies - msecs_to_jiffies(dirty_expire_interval); start_jif = jiffies; - next_jif = start_jif + dirty_writeback_interval; + next_jif = start_jif + msecs_to_jiffies(dirty_writeback_interval * 10); nr_to_write = global_page_state(NR_FILE_DIRTY) + global_page_state(NR_UNSTABLE_NFS) + (inodes_stat.nr_inodes - inodes_stat.nr_unused); @@ -801,9 +801,10 @@ static void wb_kupdate(unsigned long arg) int dirty_writeback_centisecs_handler(ctl_table *table, int write, struct file *file, void __user *buffer, size_t *length, loff_t *ppos) { - proc_dointvec_userhz_jiffies(table, write, file, buffer, length, ppos); + proc_dointvec(table, write, file, buffer, length, ppos); if (dirty_writeback_interval) - mod_timer(&wb_timer, jiffies + dirty_writeback_interval); + mod_timer(&wb_timer, jiffies + + msecs_to_jiffies(dirty_writeback_interval * 10)); else del_timer(&wb_timer); return 0; @@ -905,7 +906,8 @@ void __init page_writeback_init(void) { int shift; - mod_timer(&wb_timer, jiffies + dirty_writeback_interval); + mod_timer(&wb_timer, + jiffies + msecs_to_jiffies(dirty_writeback_interval * 10)); writeback_set_ratelimit(); register_cpu_notifier(&ratelimit_nb); From c2fdf3a9b2d52842808a8e551b53b55dd9b45030 Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Tue, 31 Mar 2009 15:23:19 -0700 Subject: [PATCH 330/478] mm: enable hashdist by default on 64bit NUMA On PowerPC we allocate large boot time hashes on node 0. This leads to an imbalance in the free memory, for example on a 64GB box (4 x 16GB nodes): Free memory: Node 0: 97.03% Node 1: 98.54% Node 2: 98.42% Node 3: 98.53% If we switch to using vmalloc (like ia64 and x86-64) things are more balanced: Free memory: Node 0: 97.53% Node 1: 98.35% Node 2: 98.33% Node 3: 98.33% For many HPC applications we are limited by the free available memory on the smallest node, so even though the same amount of memory is used the better balancing helps. Since all 64bit NUMA capable architectures should have sufficient vmalloc space, it makes sense to enable it via CONFIG_64BIT. Signed-off-by: Anton Blanchard Acked-by: David S. Miller Acked-by: Benjamin Herrenschmidt Acked-by: Ralf Baechle Cc: Heiko Carstens Cc: Martin Schwidefsky Cc: Ivan Kokshaysky Cc: Richard Henderson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/bootmem.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h index 455d83219fae..bc3ab7073695 100644 --- a/include/linux/bootmem.h +++ b/include/linux/bootmem.h @@ -146,10 +146,10 @@ extern void *alloc_large_system_hash(const char *tablename, #define HASH_EARLY 0x00000001 /* Allocating during early boot? */ -/* Only NUMA needs hash distribution. - * IA64 and x86_64 have sufficient vmalloc space. +/* Only NUMA needs hash distribution. 64bit NUMA architectures have + * sufficient vmalloc space. */ -#if defined(CONFIG_NUMA) && (defined(CONFIG_IA64) || defined(CONFIG_X86_64)) +#if defined(CONFIG_NUMA) && defined(CONFIG_64BIT) #define HASHDIST_DEFAULT 1 #else #define HASHDIST_DEFAULT 0 From c2ec175c39f62949438354f603f4aa170846aabb Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Tue, 31 Mar 2009 15:23:21 -0700 Subject: [PATCH 331/478] mm: page_mkwrite change prototype to match fault Change the page_mkwrite prototype to take a struct vm_fault, and return VM_FAULT_xxx flags. There should be no functional change. This makes it possible to return much more detailed error information to the VM (and also can provide more information eg. virtual_address to the driver, which might be important in some special cases). This is required for a subsequent fix. And will also make it easier to merge page_mkwrite() with fault() in future. Signed-off-by: Nick Piggin Cc: Chris Mason Cc: Trond Myklebust Cc: Miklos Szeredi Cc: Steven Whitehouse Cc: Mark Fasheh Cc: Joel Becker Cc: Artem Bityutskiy Cc: Felix Blyakher Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/Locking | 2 +- drivers/video/fb_defio.c | 3 ++- fs/btrfs/ctree.h | 2 +- fs/btrfs/inode.c | 5 ++++- fs/buffer.c | 6 +++++- fs/ext4/ext4.h | 2 +- fs/ext4/inode.c | 5 ++++- fs/fuse/file.c | 3 ++- fs/gfs2/ops_file.c | 5 ++++- fs/nfs/file.c | 5 ++++- fs/ocfs2/mmap.c | 6 ++++-- fs/ubifs/file.c | 9 ++++++--- fs/xfs/linux-2.6/xfs_file.c | 4 ++-- include/linux/buffer_head.h | 2 +- include/linux/mm.h | 3 ++- mm/memory.c | 26 ++++++++++++++++++++++---- 16 files changed, 65 insertions(+), 23 deletions(-) diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index 4e78ce677843..76efe5b71d7d 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking @@ -505,7 +505,7 @@ prototypes: void (*open)(struct vm_area_struct*); void (*close)(struct vm_area_struct*); int (*fault)(struct vm_area_struct*, struct vm_fault *); - int (*page_mkwrite)(struct vm_area_struct *, struct page *); + int (*page_mkwrite)(struct vm_area_struct *, struct vm_fault *); int (*access)(struct vm_area_struct *, unsigned long, void*, int, int); locking rules: diff --git a/drivers/video/fb_defio.c b/drivers/video/fb_defio.c index 082026546aee..0a7a6679ee6e 100644 --- a/drivers/video/fb_defio.c +++ b/drivers/video/fb_defio.c @@ -85,8 +85,9 @@ EXPORT_SYMBOL_GPL(fb_deferred_io_fsync); /* vm_ops->page_mkwrite handler */ static int fb_deferred_io_mkwrite(struct vm_area_struct *vma, - struct page *page) + struct vm_fault *vmf) { + struct page *page = vmf->page; struct fb_info *info = vma->vm_private_data; struct fb_deferred_io *fbdefio = info->fbdefio; struct page *cur; diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 5e1d4e30e9d8..7dd1b6d0bf32 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2060,7 +2060,7 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset, unsigned long btrfs_force_ra(struct address_space *mapping, struct file_ra_state *ra, struct file *file, pgoff_t offset, pgoff_t last_index); -int btrfs_page_mkwrite(struct vm_area_struct *vma, struct page *page); +int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); int btrfs_readpage(struct file *file, struct page *page); void btrfs_delete_inode(struct inode *inode); void btrfs_put_inode(struct inode *inode); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 7d4f948bc22a..ec5423790bbb 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4292,8 +4292,9 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset) * beyond EOF, then the page is guaranteed safe against truncation until we * unlock the page. */ -int btrfs_page_mkwrite(struct vm_area_struct *vma, struct page *page) +int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) { + struct page *page = vmf->page; struct inode *inode = fdentry(vma->vm_file)->d_inode; struct btrfs_root *root = BTRFS_I(inode)->root; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; @@ -4362,6 +4363,8 @@ again: out_unlock: unlock_page(page); out: + if (ret) + ret = VM_FAULT_SIGBUS; return ret; } diff --git a/fs/buffer.c b/fs/buffer.c index 73abe6d8218c..6d51a3da362c 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -2313,9 +2313,10 @@ int block_commit_write(struct page *page, unsigned from, unsigned to) * unlock the page. */ int -block_page_mkwrite(struct vm_area_struct *vma, struct page *page, +block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf, get_block_t get_block) { + struct page *page = vmf->page; struct inode *inode = vma->vm_file->f_path.dentry->d_inode; unsigned long end; loff_t size; @@ -2340,6 +2341,9 @@ block_page_mkwrite(struct vm_area_struct *vma, struct page *page, ret = block_commit_write(page, 0, end); out_unlock: + if (ret) + ret = VM_FAULT_SIGBUS; + unlock_page(page); return ret; } diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 6083bb38057b..990c94000924 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1098,7 +1098,7 @@ extern int ext4_meta_trans_blocks(struct inode *, int nrblocks, int idxblocks); extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks); extern int ext4_block_truncate_page(handle_t *handle, struct address_space *mapping, loff_t from); -extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page); +extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); extern qsize_t ext4_get_reserved_space(struct inode *inode); /* ioctl.c */ diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 71d3ecd5db79..dd82ff390067 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -5146,8 +5146,9 @@ static int ext4_bh_unmapped(handle_t *handle, struct buffer_head *bh) return !buffer_mapped(bh); } -int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page) +int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) { + struct page *page = vmf->page; loff_t size; unsigned long len; int ret = -EINVAL; @@ -5199,6 +5200,8 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page) goto out_unlock; ret = 0; out_unlock: + if (ret) + ret = VM_FAULT_SIGBUS; up_read(&inode->i_alloc_sem); return ret; } diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 821d10f719bd..4e340fedf768 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1234,8 +1234,9 @@ static void fuse_vma_close(struct vm_area_struct *vma) * - sync(2) * - try_to_free_pages() with order > PAGE_ALLOC_COSTLY_ORDER */ -static int fuse_page_mkwrite(struct vm_area_struct *vma, struct page *page) +static int fuse_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) { + struct page *page = vmf->page; /* * Don't use page->mapping as it may become NULL from a * concurrent truncate. diff --git a/fs/gfs2/ops_file.c b/fs/gfs2/ops_file.c index 3b9e8de3500b..70b9b8548945 100644 --- a/fs/gfs2/ops_file.c +++ b/fs/gfs2/ops_file.c @@ -337,8 +337,9 @@ static int gfs2_allocate_page_backing(struct page *page) * blocks allocated on disk to back that page. */ -static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct page *page) +static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) { + struct page *page = vmf->page; struct inode *inode = vma->vm_file->f_path.dentry->d_inode; struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_sbd *sdp = GFS2_SB(inode); @@ -412,6 +413,8 @@ out_unlock: gfs2_glock_dq(&gh); out: gfs2_holder_uninit(&gh); + if (ret) + ret = VM_FAULT_SIGBUS; return ret; } diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 90f292b520d2..cec79392e4ba 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -451,8 +451,9 @@ const struct address_space_operations nfs_file_aops = { .launder_page = nfs_launder_page, }; -static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct page *page) +static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) { + struct page *page = vmf->page; struct file *filp = vma->vm_file; struct dentry *dentry = filp->f_path.dentry; unsigned pagelen; @@ -483,6 +484,8 @@ static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct page *page) ret = pagelen; out_unlock: unlock_page(page); + if (ret) + ret = VM_FAULT_SIGBUS; return ret; } diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c index eea1d24713ea..b606496b72ec 100644 --- a/fs/ocfs2/mmap.c +++ b/fs/ocfs2/mmap.c @@ -154,8 +154,9 @@ out: return ret; } -static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct page *page) +static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) { + struct page *page = vmf->page; struct inode *inode = vma->vm_file->f_path.dentry->d_inode; struct buffer_head *di_bh = NULL; sigset_t blocked, oldset; @@ -196,7 +197,8 @@ out: ret2 = ocfs2_vm_op_unblock_sigs(&oldset); if (ret2 < 0) mlog_errno(ret2); - + if (ret) + ret = VM_FAULT_SIGBUS; return ret; } diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index 93b6de51f261..0ff89fe71e51 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -1434,8 +1434,9 @@ static int ubifs_releasepage(struct page *page, gfp_t unused_gfp_flags) * mmap()d file has taken write protection fault and is being made * writable. UBIFS must ensure page is budgeted for. */ -static int ubifs_vm_page_mkwrite(struct vm_area_struct *vma, struct page *page) +static int ubifs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) { + struct page *page = vmf->page; struct inode *inode = vma->vm_file->f_path.dentry->d_inode; struct ubifs_info *c = inode->i_sb->s_fs_info; struct timespec now = ubifs_current_time(inode); @@ -1447,7 +1448,7 @@ static int ubifs_vm_page_mkwrite(struct vm_area_struct *vma, struct page *page) ubifs_assert(!(inode->i_sb->s_flags & MS_RDONLY)); if (unlikely(c->ro_media)) - return -EROFS; + return VM_FAULT_SIGBUS; /* -EROFS */ /* * We have not locked @page so far so we may budget for changing the @@ -1480,7 +1481,7 @@ static int ubifs_vm_page_mkwrite(struct vm_area_struct *vma, struct page *page) if (err == -ENOSPC) ubifs_warn("out of space for mmapped file " "(inode number %lu)", inode->i_ino); - return err; + return VM_FAULT_SIGBUS; } lock_page(page); @@ -1520,6 +1521,8 @@ static int ubifs_vm_page_mkwrite(struct vm_area_struct *vma, struct page *page) out_unlock: unlock_page(page); ubifs_release_budget(c, &req); + if (err) + err = VM_FAULT_SIGBUS; return err; } diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c index e14c4e3aea0c..f4e255441574 100644 --- a/fs/xfs/linux-2.6/xfs_file.c +++ b/fs/xfs/linux-2.6/xfs_file.c @@ -234,9 +234,9 @@ xfs_file_mmap( STATIC int xfs_vm_page_mkwrite( struct vm_area_struct *vma, - struct page *page) + struct vm_fault *vmf) { - return block_page_mkwrite(vma, page, xfs_get_blocks); + return block_page_mkwrite(vma, vmf, xfs_get_blocks); } const struct file_operations xfs_file_operations = { diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index f19fd9045ea0..3d7bcde2e332 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -216,7 +216,7 @@ int cont_write_begin(struct file *, struct address_space *, loff_t, get_block_t *, loff_t *); int generic_cont_expand_simple(struct inode *inode, loff_t size); int block_commit_write(struct page *page, unsigned from, unsigned to); -int block_page_mkwrite(struct vm_area_struct *vma, struct page *page, +int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf, get_block_t get_block); void block_sync_page(struct page *); sector_t generic_block_bmap(struct address_space *, sector_t, get_block_t *); diff --git a/include/linux/mm.h b/include/linux/mm.h index 2223f8dfa568..aeabe953ba4f 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -135,6 +135,7 @@ extern pgprot_t protection_map[16]; #define FAULT_FLAG_WRITE 0x01 /* Fault was a write access */ #define FAULT_FLAG_NONLINEAR 0x02 /* Fault was via a nonlinear mapping */ +#define FAULT_FLAG_MKWRITE 0x04 /* Fault was mkwrite of existing pte */ /* * This interface is used by x86 PAT code to identify a pfn mapping that is @@ -187,7 +188,7 @@ struct vm_operations_struct { /* notification that a previously read-only page is about to become * writable, if an error is returned it will cause a SIGBUS */ - int (*page_mkwrite)(struct vm_area_struct *vma, struct page *page); + int (*page_mkwrite)(struct vm_area_struct *vma, struct vm_fault *vmf); /* called by access_process_vm when get_user_pages() fails, typically * for use by special VMAs that can switch between memory and hardware diff --git a/mm/memory.c b/mm/memory.c index 5b4ad5e4f98d..cf6873e91c6a 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1945,6 +1945,15 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, * get_user_pages(.write=1, .force=1). */ if (vma->vm_ops && vma->vm_ops->page_mkwrite) { + struct vm_fault vmf; + int tmp; + + vmf.virtual_address = (void __user *)(address & + PAGE_MASK); + vmf.pgoff = old_page->index; + vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE; + vmf.page = old_page; + /* * Notify the address space that the page is about to * become writable so that it can prohibit this or wait @@ -1956,8 +1965,12 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, page_cache_get(old_page); pte_unmap_unlock(page_table, ptl); - if (vma->vm_ops->page_mkwrite(vma, old_page) < 0) + tmp = vma->vm_ops->page_mkwrite(vma, &vmf); + if (unlikely(tmp & + (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) { + ret = tmp; goto unwritable_page; + } /* * Since we dropped the lock we need to revalidate @@ -2106,7 +2119,7 @@ oom: unwritable_page: page_cache_release(old_page); - return VM_FAULT_SIGBUS; + return ret; } /* @@ -2648,9 +2661,14 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, * to become writable */ if (vma->vm_ops->page_mkwrite) { + int tmp; + unlock_page(page); - if (vma->vm_ops->page_mkwrite(vma, page) < 0) { - ret = VM_FAULT_SIGBUS; + vmf.flags |= FAULT_FLAG_MKWRITE; + tmp = vma->vm_ops->page_mkwrite(vma, &vmf); + if (unlikely(tmp & + (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) { + ret = tmp; anon = 1; /* no anon but release vmf.page */ goto out_unlocked; } From 56a76f8275c379ed73c8a43cfa1dfa2f5e9cfa19 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Tue, 31 Mar 2009 15:23:23 -0700 Subject: [PATCH 332/478] fs: fix page_mkwrite error cases in core code and btrfs page_mkwrite is called with neither the page lock nor the ptl held. This means a page can be concurrently truncated or invalidated out from underneath it. Callers are supposed to prevent truncate races themselves, however previously the only thing they can do in case they hit one is to raise a SIGBUS. A sigbus is wrong for the case that the page has been invalidated or truncated within i_size (eg. hole punched). Callers may also have to perform memory allocations in this path, where again, SIGBUS would be wrong. The previous patch ("mm: page_mkwrite change prototype to match fault") made it possible to properly specify errors. Convert the generic buffer.c code and btrfs to return sane error values (in the case of page removed from pagecache, VM_FAULT_NOPAGE will cause the fault handler to exit without doing anything, and the fault will be retried properly). This fixes core code, and converts btrfs as a template/example. All other filesystems defining their own page_mkwrite should be fixed in a similar manner. Acked-by: Chris Mason Signed-off-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/btrfs/inode.c | 11 +++++++---- fs/buffer.c | 12 ++++++++---- 2 files changed, 15 insertions(+), 8 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index ec5423790bbb..17e608c4dc70 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4307,10 +4307,15 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) u64 page_end; ret = btrfs_check_data_free_space(root, inode, PAGE_CACHE_SIZE); - if (ret) + if (ret) { + if (ret == -ENOMEM) + ret = VM_FAULT_OOM; + else /* -ENOSPC, -EIO, etc */ + ret = VM_FAULT_SIGBUS; goto out; + } - ret = -EINVAL; + ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */ again: lock_page(page); size = i_size_read(inode); @@ -4363,8 +4368,6 @@ again: out_unlock: unlock_page(page); out: - if (ret) - ret = VM_FAULT_SIGBUS; return ret; } diff --git a/fs/buffer.c b/fs/buffer.c index 6d51a3da362c..0c14f8d52ee5 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -2320,7 +2320,7 @@ block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf, struct inode *inode = vma->vm_file->f_path.dentry->d_inode; unsigned long end; loff_t size; - int ret = -EINVAL; + int ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */ lock_page(page); size = i_size_read(inode); @@ -2340,10 +2340,14 @@ block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf, if (!ret) ret = block_commit_write(page, 0, end); -out_unlock: - if (ret) - ret = VM_FAULT_SIGBUS; + if (unlikely(ret)) { + if (ret == -ENOMEM) + ret = VM_FAULT_OOM; + else /* -ENOSPC, -EIO, etc */ + ret = VM_FAULT_SIGBUS; + } +out_unlock: unlock_page(page); return ret; } From 851a039cc547b33b8139fe6d7c2bbfb158e2724e Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 31 Mar 2009 15:23:24 -0700 Subject: [PATCH 333/478] mm: page_mkwrite change prototype to match fault: fix sysfs Fix warnings and return values in sysfs bin_page_mkwrite(), fixing fs/sysfs/bin.c: In function `bin_page_mkwrite': fs/sysfs/bin.c:250: warning: passing argument 2 of `bb->vm_ops->page_mkwrite' from incompatible pointer type fs/sysfs/bin.c: At top level: fs/sysfs/bin.c:280: warning: initialization from incompatible pointer type Expects to have my [PATCH next] sysfs: fix some bin_vm_ops errors Signed-off-by: Hugh Dickins Cc: Nick Piggin Cc: "Eric W. Biederman" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/sysfs/bin.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/sysfs/bin.c b/fs/sysfs/bin.c index 07703d3ff4a1..93e0c0281d45 100644 --- a/fs/sysfs/bin.c +++ b/fs/sysfs/bin.c @@ -234,7 +234,7 @@ static int bin_fault(struct vm_area_struct *vma, struct vm_fault *vmf) return ret; } -static int bin_page_mkwrite(struct vm_area_struct *vma, struct page *page) +static int bin_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) { struct file *file = vma->vm_file; struct bin_buffer *bb = file->private_data; @@ -242,15 +242,15 @@ static int bin_page_mkwrite(struct vm_area_struct *vma, struct page *page) int ret; if (!bb->vm_ops) - return -EINVAL; + return VM_FAULT_SIGBUS; if (!bb->vm_ops->page_mkwrite) return 0; if (!sysfs_get_active_two(attr_sd)) - return -EINVAL; + return VM_FAULT_SIGBUS; - ret = bb->vm_ops->page_mkwrite(vma, page); + ret = bb->vm_ops->page_mkwrite(vma, vmf); sysfs_put_active_two(attr_sd); return ret; From f4112de6b679d84bd9b9681c7504be7bdfb7c7d5 Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Tue, 31 Mar 2009 15:23:25 -0700 Subject: [PATCH 334/478] mm: introduce debug_kmap_atomic x86 has debug_kmap_atomic_prot() which is error checking function for kmap_atomic. It is usefull for the other architectures, although it needs CONFIG_TRACE_IRQFLAGS_SUPPORT. This patch exposes it to the other architectures. Signed-off-by: Akinobu Mita Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/mm/highmem_32.c | 45 +--------------------------------------- include/linux/highmem.h | 12 +++++++++++ mm/highmem.c | 45 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 58 insertions(+), 44 deletions(-) diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c index 522db5e3d0bf..8126e8d1a2a4 100644 --- a/arch/x86/mm/highmem_32.c +++ b/arch/x86/mm/highmem_32.c @@ -19,49 +19,6 @@ void kunmap(struct page *page) kunmap_high(page); } -static void debug_kmap_atomic_prot(enum km_type type) -{ -#ifdef CONFIG_DEBUG_HIGHMEM - static unsigned warn_count = 10; - - if (unlikely(warn_count == 0)) - return; - - if (unlikely(in_interrupt())) { - if (in_irq()) { - if (type != KM_IRQ0 && type != KM_IRQ1 && - type != KM_BIO_SRC_IRQ && type != KM_BIO_DST_IRQ && - type != KM_BOUNCE_READ) { - WARN_ON(1); - warn_count--; - } - } else if (!irqs_disabled()) { /* softirq */ - if (type != KM_IRQ0 && type != KM_IRQ1 && - type != KM_SOFTIRQ0 && type != KM_SOFTIRQ1 && - type != KM_SKB_SUNRPC_DATA && - type != KM_SKB_DATA_SOFTIRQ && - type != KM_BOUNCE_READ) { - WARN_ON(1); - warn_count--; - } - } - } - - if (type == KM_IRQ0 || type == KM_IRQ1 || type == KM_BOUNCE_READ || - type == KM_BIO_SRC_IRQ || type == KM_BIO_DST_IRQ) { - if (!irqs_disabled()) { - WARN_ON(1); - warn_count--; - } - } else if (type == KM_SOFTIRQ0 || type == KM_SOFTIRQ1) { - if (irq_count() == 0 && !irqs_disabled()) { - WARN_ON(1); - warn_count--; - } - } -#endif -} - /* * kmap_atomic/kunmap_atomic is significantly faster than kmap/kunmap because * no global lock is needed and because the kmap code must perform a global TLB @@ -81,7 +38,7 @@ void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot) if (!PageHighMem(page)) return page_address(page); - debug_kmap_atomic_prot(type); + debug_kmap_atomic(type); idx = type + KM_TYPE_NR*smp_processor_id(); vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); diff --git a/include/linux/highmem.h b/include/linux/highmem.h index 13875ce9112a..7ff5c55f9b55 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -187,4 +187,16 @@ static inline void copy_highpage(struct page *to, struct page *from) kunmap_atomic(vto, KM_USER1); } +#if defined(CONFIG_DEBUG_HIGHMEM) && defined(CONFIG_TRACE_IRQFLAGS_SUPPORT) + +void debug_kmap_atomic(enum km_type type); + +#else + +static inline void debug_kmap_atomic(enum km_type type) +{ +} + +#endif + #endif /* _LINUX_HIGHMEM_H */ diff --git a/mm/highmem.c b/mm/highmem.c index 910198037bf5..68eb1d9b63fa 100644 --- a/mm/highmem.c +++ b/mm/highmem.c @@ -422,3 +422,48 @@ void __init page_address_init(void) } #endif /* defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL) */ + +#if defined(CONFIG_DEBUG_HIGHMEM) && defined(CONFIG_TRACE_IRQFLAGS_SUPPORT) + +void debug_kmap_atomic(enum km_type type) +{ + static unsigned warn_count = 10; + + if (unlikely(warn_count == 0)) + return; + + if (unlikely(in_interrupt())) { + if (in_irq()) { + if (type != KM_IRQ0 && type != KM_IRQ1 && + type != KM_BIO_SRC_IRQ && type != KM_BIO_DST_IRQ && + type != KM_BOUNCE_READ) { + WARN_ON(1); + warn_count--; + } + } else if (!irqs_disabled()) { /* softirq */ + if (type != KM_IRQ0 && type != KM_IRQ1 && + type != KM_SOFTIRQ0 && type != KM_SOFTIRQ1 && + type != KM_SKB_SUNRPC_DATA && + type != KM_SKB_DATA_SOFTIRQ && + type != KM_BOUNCE_READ) { + WARN_ON(1); + warn_count--; + } + } + } + + if (type == KM_IRQ0 || type == KM_IRQ1 || type == KM_BOUNCE_READ || + type == KM_BIO_SRC_IRQ || type == KM_BIO_DST_IRQ) { + if (!irqs_disabled()) { + WARN_ON(1); + warn_count--; + } + } else if (type == KM_SOFTIRQ0 || type == KM_SOFTIRQ1) { + if (irq_count() == 0 && !irqs_disabled()) { + WARN_ON(1); + warn_count--; + } + } +} + +#endif From 7ca43e7564679604d86e9ed834e7bbcffd8a4a3f Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Tue, 31 Mar 2009 15:23:25 -0700 Subject: [PATCH 335/478] mm: use debug_kmap_atomic Use debug_kmap_atomic in kmap_atomic, kmap_atomic_pfn, and iomap_atomic_prot_pfn. Signed-off-by: Akinobu Mita Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/mips/mm/highmem.c | 2 ++ arch/powerpc/include/asm/highmem.h | 2 ++ arch/sparc/mm/highmem.c | 1 + arch/x86/mm/highmem_32.c | 1 + arch/x86/mm/iomap_32.c | 2 ++ include/asm-frv/highmem.h | 2 ++ include/asm-mn10300/highmem.h | 2 ++ 7 files changed, 12 insertions(+) diff --git a/arch/mips/mm/highmem.c b/arch/mips/mm/highmem.c index 060d28dca8a8..4481656d1065 100644 --- a/arch/mips/mm/highmem.c +++ b/arch/mips/mm/highmem.c @@ -42,6 +42,7 @@ void *__kmap_atomic(struct page *page, enum km_type type) if (!PageHighMem(page)) return page_address(page); + debug_kmap_atomic(type); idx = type + KM_TYPE_NR*smp_processor_id(); vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); #ifdef CONFIG_DEBUG_HIGHMEM @@ -88,6 +89,7 @@ void *kmap_atomic_pfn(unsigned long pfn, enum km_type type) pagefault_disable(); + debug_kmap_atomic(type); idx = type + KM_TYPE_NR*smp_processor_id(); vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); set_pte(kmap_pte-idx, pfn_pte(pfn, kmap_prot)); diff --git a/arch/powerpc/include/asm/highmem.h b/arch/powerpc/include/asm/highmem.h index 545028f86488..684a73f4324f 100644 --- a/arch/powerpc/include/asm/highmem.h +++ b/arch/powerpc/include/asm/highmem.h @@ -24,6 +24,7 @@ #include #include +#include #include #include #include @@ -94,6 +95,7 @@ static inline void *kmap_atomic_prot(struct page *page, enum km_type type, pgpro if (!PageHighMem(page)) return page_address(page); + debug_kmap_atomic(type); idx = type + KM_TYPE_NR*smp_processor_id(); vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); #ifdef CONFIG_DEBUG_HIGHMEM diff --git a/arch/sparc/mm/highmem.c b/arch/sparc/mm/highmem.c index 752d0c9fb544..7916feba6e4a 100644 --- a/arch/sparc/mm/highmem.c +++ b/arch/sparc/mm/highmem.c @@ -39,6 +39,7 @@ void *kmap_atomic(struct page *page, enum km_type type) if (!PageHighMem(page)) return page_address(page); + debug_kmap_atomic(type); idx = type + KM_TYPE_NR*smp_processor_id(); vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c index 8126e8d1a2a4..5bc5d1688c1c 100644 --- a/arch/x86/mm/highmem_32.c +++ b/arch/x86/mm/highmem_32.c @@ -40,6 +40,7 @@ void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot) debug_kmap_atomic(type); + debug_kmap_atomic(type); idx = type + KM_TYPE_NR*smp_processor_id(); vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); BUG_ON(!pte_none(*(kmap_pte-idx))); diff --git a/arch/x86/mm/iomap_32.c b/arch/x86/mm/iomap_32.c index 699c9b2895ae..bff0c9032f8c 100644 --- a/arch/x86/mm/iomap_32.c +++ b/arch/x86/mm/iomap_32.c @@ -19,6 +19,7 @@ #include #include #include +#include int is_io_mapping_possible(resource_size_t base, unsigned long size) { @@ -71,6 +72,7 @@ iounmap_atomic(void *kvaddr, enum km_type type) unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; enum fixed_addresses idx = type + KM_TYPE_NR*smp_processor_id(); + debug_kmap_atomic(type); /* * Force other mappings to Oops if they'll try to access this pte * without first remap it. Keeping stale mappings around is a bad idea diff --git a/include/asm-frv/highmem.h b/include/asm-frv/highmem.h index 26cefcde5cee..68e4677fb9e7 100644 --- a/include/asm-frv/highmem.h +++ b/include/asm-frv/highmem.h @@ -18,6 +18,7 @@ #ifdef __KERNEL__ #include +#include #include #include #include @@ -116,6 +117,7 @@ static inline void *kmap_atomic(struct page *page, enum km_type type) unsigned long paddr; pagefault_disable(); + debug_kmap_atomic(type); paddr = page_to_phys(page); switch (type) { diff --git a/include/asm-mn10300/highmem.h b/include/asm-mn10300/highmem.h index 5256854c0453..90f2abb04bfd 100644 --- a/include/asm-mn10300/highmem.h +++ b/include/asm-mn10300/highmem.h @@ -16,6 +16,7 @@ #include #include +#include #include #include @@ -77,6 +78,7 @@ static inline unsigned long kmap_atomic(struct page *page, enum km_type type) if (page < highmem_start_page) return page_address(page); + debug_kmap_atomic(type); idx = type + KM_TYPE_NR * smp_processor_id(); vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); #if HIGHMEM_DEBUG From 33925b25d2c00a29664f1994ab350a9bff70f7a2 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 31 Mar 2009 15:23:26 -0700 Subject: [PATCH 336/478] nommu: there is no mlock() for NOMMU, so don't provide the bits The mlock() facility does not exist for NOMMU since all mappings are effectively locked anyway, so we don't make the bits available when they're not useful. Signed-off-by: David Howells Reviewed-by: KOSAKI Motohiro Cc: Peter Zijlstra Cc: Greg Ungerer Cc: Johannes Weiner Cc: Rik van Riel Cc: Lee Schermerhorn Cc: Enrik Berkhan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page-flags.h | 28 +++++++++++++++++----------- mm/Kconfig | 8 ++++++++ mm/internal.h | 8 +++++--- 3 files changed, 30 insertions(+), 14 deletions(-) diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 219a523ecdb0..61df1779b2a5 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -96,6 +96,8 @@ enum pageflags { PG_swapbacked, /* Page is backed by RAM/swap */ #ifdef CONFIG_UNEVICTABLE_LRU PG_unevictable, /* Page is "unevictable" */ +#endif +#ifdef CONFIG_HAVE_MLOCKED_PAGE_BIT PG_mlocked, /* Page is vma mlocked */ #endif #ifdef CONFIG_IA64_UNCACHED_ALLOCATOR @@ -234,22 +236,22 @@ PAGEFLAG_FALSE(SwapCache) #ifdef CONFIG_UNEVICTABLE_LRU PAGEFLAG(Unevictable, unevictable) __CLEARPAGEFLAG(Unevictable, unevictable) TESTCLEARFLAG(Unevictable, unevictable) - -#define MLOCK_PAGES 1 -PAGEFLAG(Mlocked, mlocked) __CLEARPAGEFLAG(Mlocked, mlocked) - TESTSCFLAG(Mlocked, mlocked) - #else - -#define MLOCK_PAGES 0 -PAGEFLAG_FALSE(Mlocked) - SETPAGEFLAG_NOOP(Mlocked) TESTCLEARFLAG_FALSE(Mlocked) - PAGEFLAG_FALSE(Unevictable) TESTCLEARFLAG_FALSE(Unevictable) SETPAGEFLAG_NOOP(Unevictable) CLEARPAGEFLAG_NOOP(Unevictable) __CLEARPAGEFLAG_NOOP(Unevictable) #endif +#ifdef CONFIG_HAVE_MLOCKED_PAGE_BIT +#define MLOCK_PAGES 1 +PAGEFLAG(Mlocked, mlocked) __CLEARPAGEFLAG(Mlocked, mlocked) + TESTSCFLAG(Mlocked, mlocked) +#else +#define MLOCK_PAGES 0 +PAGEFLAG_FALSE(Mlocked) + SETPAGEFLAG_NOOP(Mlocked) TESTCLEARFLAG_FALSE(Mlocked) +#endif + #ifdef CONFIG_IA64_UNCACHED_ALLOCATOR PAGEFLAG(Uncached, uncached) #else @@ -367,9 +369,13 @@ static inline void __ClearPageTail(struct page *page) #ifdef CONFIG_UNEVICTABLE_LRU #define __PG_UNEVICTABLE (1 << PG_unevictable) -#define __PG_MLOCKED (1 << PG_mlocked) #else #define __PG_UNEVICTABLE 0 +#endif + +#ifdef CONFIG_HAVE_MLOCKED_PAGE_BIT +#define __PG_MLOCKED (1 << PG_mlocked) +#else #define __PG_MLOCKED 0 #endif diff --git a/mm/Kconfig b/mm/Kconfig index a5b77811fdf2..8c895973dfba 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -214,5 +214,13 @@ config UNEVICTABLE_LRU will use one page flag and increase the code size a little, say Y unless you know what you are doing. +config HAVE_MLOCK + bool + default y if MMU=y + +config HAVE_MLOCKED_PAGE_BIT + bool + default y if HAVE_MLOCK=y && UNEVICTABLE_LRU=y + config MMU_NOTIFIER bool diff --git a/mm/internal.h b/mm/internal.h index 478223b73a2a..987bb03fbdd8 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -63,6 +63,7 @@ static inline unsigned long page_order(struct page *page) return page_private(page); } +#ifdef CONFIG_HAVE_MLOCK extern long mlock_vma_pages_range(struct vm_area_struct *vma, unsigned long start, unsigned long end); extern void munlock_vma_pages_range(struct vm_area_struct *vma, @@ -71,6 +72,7 @@ static inline void munlock_vma_pages_all(struct vm_area_struct *vma) { munlock_vma_pages_range(vma, vma->vm_start, vma->vm_end); } +#endif #ifdef CONFIG_UNEVICTABLE_LRU /* @@ -90,7 +92,7 @@ static inline void unevictable_migrate_page(struct page *new, struct page *old) } #endif -#ifdef CONFIG_UNEVICTABLE_LRU +#ifdef CONFIG_HAVE_MLOCKED_PAGE_BIT /* * Called only in fault path via page_evictable() for a new page * to determine if it's being mapped into a LOCKED vma. @@ -165,7 +167,7 @@ static inline void free_page_mlock(struct page *page) } } -#else /* CONFIG_UNEVICTABLE_LRU */ +#else /* CONFIG_HAVE_MLOCKED_PAGE_BIT */ static inline int is_mlocked_vma(struct vm_area_struct *v, struct page *p) { return 0; @@ -175,7 +177,7 @@ static inline void mlock_vma_page(struct page *page) { } static inline void mlock_migrate_page(struct page *new, struct page *old) { } static inline void free_page_mlock(struct page *page) { } -#endif /* CONFIG_UNEVICTABLE_LRU */ +#endif /* CONFIG_HAVE_MLOCKED_PAGE_BIT */ /* * Return the mem_map entry representing the 'offset' subpage within From 71aa653c6bfa6743d838342105ebc067145394e4 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 31 Mar 2009 15:23:28 -0700 Subject: [PATCH 337/478] nommu: make CONFIG_UNEVICTABLE_LRU available when CONFIG_MMU=n Make CONFIG_UNEVICTABLE_LRU available when CONFIG_MMU=n. There's no logical reason it shouldn't be available, and it can be used for ramfs. Signed-off-by: David Howells Reviewed-by: KOSAKI Motohiro Cc: Peter Zijlstra Cc: Greg Ungerer Cc: Johannes Weiner Cc: Rik van Riel Cc: Lee Schermerhorn Cc: Enrik Berkhan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/Kconfig | 1 - 1 file changed, 1 deletion(-) diff --git a/mm/Kconfig b/mm/Kconfig index 8c895973dfba..b53427ad30a3 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -206,7 +206,6 @@ config VIRT_TO_BUS config UNEVICTABLE_LRU bool "Add LRU list to track non-evictable pages" default y - depends on MMU help Keeps unevictable pages off of the active and inactive pageout lists, so kswapd will not waste CPU time or have its balancing From 88c3bd707c2552bcef93cc3724647903aece159d Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Tue, 31 Mar 2009 15:23:29 -0700 Subject: [PATCH 338/478] vmscan: print shrink_slab symbol name on negative shrinker objects When a shrinker has a negative number of objects to delete, the symbol name of the shrinker should be printed, not shrink_slab. This also makes the error message slightly more informative. Cc: Ingo Molnar Signed-off-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index e70fae31e968..f4619c6cd59e 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -214,8 +214,9 @@ unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask, do_div(delta, lru_pages + 1); shrinker->nr += delta; if (shrinker->nr < 0) { - printk(KERN_ERR "%s: nr=%ld\n", - __func__, shrinker->nr); + printk(KERN_ERR "shrink_slab: %pF negative objects to " + "delete nr=%ld\n", + shrinker->shrink, shrinker->nr); shrinker->nr = max_pass; } From 2678958e1225f350806d90f211a3b475f64aee80 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 31 Mar 2009 15:23:29 -0700 Subject: [PATCH 339/478] ramfs-nommu: use generic lru cache Instead of open-coding the lru-list-add pagevec batching when expanding a file mapping from zero, defer to the appropriate page cache function that also takes care of adding the page to the lru list. This is cleaner, saves code and reduces the stack footprint by 16 words worth of pagevec. Signed-off-by: Johannes Weiner Acked-by: David Howells Cc: Nick Piggin Acked-by: KOSAKI Motohiro Cc: Rik van Riel Cc: Peter Zijlstra Cc: MinChan Kim Cc: Lee Schermerhorn Cc: Greg Ungerer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ramfs/file-nommu.c | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c index 995ef1d6686c..ebb2c417912c 100644 --- a/fs/ramfs/file-nommu.c +++ b/fs/ramfs/file-nommu.c @@ -59,7 +59,6 @@ const struct inode_operations ramfs_file_inode_operations = { */ int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize) { - struct pagevec lru_pvec; unsigned long npages, xpages, loop, limit; struct page *pages; unsigned order; @@ -102,24 +101,20 @@ int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize) memset(data, 0, newsize); /* attach all the pages to the inode's address space */ - pagevec_init(&lru_pvec, 0); for (loop = 0; loop < npages; loop++) { struct page *page = pages + loop; - ret = add_to_page_cache(page, inode->i_mapping, loop, GFP_KERNEL); + ret = add_to_page_cache_lru(page, inode->i_mapping, loop, + GFP_KERNEL); if (ret < 0) goto add_error; - if (!pagevec_add(&lru_pvec, page)) - __pagevec_lru_add_file(&lru_pvec); - /* prevent the page from being discarded on memory pressure */ SetPageDirty(page); unlock_page(page); } - pagevec_lru_add_file(&lru_pvec); return 0; fsize_exceeded: @@ -128,10 +123,8 @@ int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize) return -EFBIG; add_error: - pagevec_lru_add_file(&lru_pvec); - page_cache_release(pages + loop); - for (loop++; loop < npages; loop++) - __free_page(pages + loop); + while (loop < npages) + __free_page(pages + loop++); return ret; } From 327c0e968645f2601a43f5ea7c19c7b3a5fa0a34 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Tue, 31 Mar 2009 15:23:31 -0700 Subject: [PATCH 340/478] vmscan: fix it to take care of nodemask try_to_free_pages() is used for the direct reclaim of up to SWAP_CLUSTER_MAX pages when watermarks are low. The caller to alloc_pages_nodemask() can specify a nodemask of nodes that are allowed to be used but this is not passed to try_to_free_pages(). This can lead to unnecessary reclaim of pages that are unusable by the caller and int the worst case lead to allocation failure as progress was not been make where it is needed. This patch passes the nodemask used for alloc_pages_nodemask() to try_to_free_pages(). Reviewed-by: KOSAKI Motohiro Acked-by: Mel Gorman Signed-off-by: KAMEZAWA Hiroyuki Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/buffer.c | 2 +- include/linux/swap.h | 2 +- mm/page_alloc.c | 3 ++- mm/vmscan.c | 13 +++++++++++-- 4 files changed, 15 insertions(+), 5 deletions(-) diff --git a/fs/buffer.c b/fs/buffer.c index 0c14f8d52ee5..c77b848c3d43 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -290,7 +290,7 @@ static void free_more_memory(void) &zone); if (zone) try_to_free_pages(node_zonelist(nid, GFP_NOFS), 0, - GFP_NOFS); + GFP_NOFS, NULL); } } diff --git a/include/linux/swap.h b/include/linux/swap.h index d30215578877..b8b0c4ce83e6 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -212,7 +212,7 @@ static inline void lru_cache_add_active_file(struct page *page) /* linux/mm/vmscan.c */ extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order, - gfp_t gfp_mask); + gfp_t gfp_mask, nodemask_t *mask); extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem, gfp_t gfp_mask, bool noswap, unsigned int swappiness); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index cbd532161f68..0284e528748d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1582,7 +1582,8 @@ nofail_alloc: reclaim_state.reclaimed_slab = 0; p->reclaim_state = &reclaim_state; - did_some_progress = try_to_free_pages(zonelist, order, gfp_mask); + did_some_progress = try_to_free_pages(zonelist, order, + gfp_mask, nodemask); p->reclaim_state = NULL; lockdep_clear_current_reclaim_state(); diff --git a/mm/vmscan.c b/mm/vmscan.c index f4619c6cd59e..06e72693b458 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -78,6 +78,12 @@ struct scan_control { /* Which cgroup do we reclaim from */ struct mem_cgroup *mem_cgroup; + /* + * Nodemask of nodes allowed by the caller. If NULL, all nodes + * are scanned. + */ + nodemask_t *nodemask; + /* Pluggable isolate pages callback */ unsigned long (*isolate_pages)(unsigned long nr, struct list_head *dst, unsigned long *scanned, int order, int mode, @@ -1538,7 +1544,8 @@ static void shrink_zones(int priority, struct zonelist *zonelist, struct zone *zone; sc->all_unreclaimable = 1; - for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { + for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx, + sc->nodemask) { if (!populated_zone(zone)) continue; /* @@ -1683,7 +1690,7 @@ out: } unsigned long try_to_free_pages(struct zonelist *zonelist, int order, - gfp_t gfp_mask) + gfp_t gfp_mask, nodemask_t *nodemask) { struct scan_control sc = { .gfp_mask = gfp_mask, @@ -1694,6 +1701,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, .order = order, .mem_cgroup = NULL, .isolate_pages = isolate_pages_global, + .nodemask = nodemask, }; return do_try_to_free_pages(zonelist, &sc); @@ -1714,6 +1722,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, .order = 0, .mem_cgroup = mem_cont, .isolate_pages = mem_cgroup_isolate_pages, + .nodemask = NULL, /* we don't care the placement */ }; struct zonelist *zonelist; From 9fab5619bdd7f84cdd22cc760778f759f9819a33 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 31 Mar 2009 15:23:33 -0700 Subject: [PATCH 341/478] shmem: writepage directly to swap Synopsis: if shmem_writepage calls swap_writepage directly, most shmem swap loads benefit, and a catastrophic interaction between SLUB and some flash storage is avoided. shmem_writepage() has always been peculiar in making no attempt to write: it has just transferred a shmem page from file cache to swap cache, then let that page make its way around the LRU again before being written and freed. The idea was that people use tmpfs because they want those pages to stay in RAM; so although we give it an overflow to swap, we should resist writing too soon, giving those pages a second chance before they can be reclaimed. That was always questionable, and I've toyed with this patch for years; but never had a clear justification to depart from the original design. It became more questionable in 2.6.28, when the split LRU patches classed shmem and tmpfs pages as SwapBacked rather than as file_cache: that in itself gives them more resistance to reclaim than normal file pages. I prepared this patch for 2.6.29, but the merge window arrived before I'd completed gathering statistics to justify sending it in. Then while comparing SLQB against SLUB, running SLUB on a laptop I'd habitually used with SLAB, I found SLUB to run my tmpfs kbuild swapping tests five times slower than SLAB or SLQB - other machines slower too, but nowhere near so bad. Simpler "cp -a" swapping tests showed the same. slub_max_order=0 brings sanity to all, but heavy swapping is too far from normal to justify such a tuning. The crucial factor on that laptop turns out to be that I'm using an SD card for swap. What happens is this: By default, SLUB uses order-2 pages for shmem_inode_cache (and many other fs inodes), so creating tmpfs files under memory pressure brings lumpy reclaim into play. One subpage of the order is chosen from the bottom of the LRU as usual, then the other three picked out from their random positions on the LRUs. In a tmpfs load, many of these pages will be ones which already passed through shmem_writepage, so already have swap allocated. And though their offsets on swap were probably allocated sequentially, now that the pages are picked off at random, their swap offsets are scattered. But the flash storage on the SD card is very sensitive to having its writes merged: once swap is written at scattered offsets, performance falls apart. Rotating disk seeks increase too, but less disastrously. So: stop giving shmem/tmpfs pages a second pass around the LRU, write them out to swap as soon as their swap has been allocated. It's surely possible to devise an artificial load which runs faster the old way, one whose sizing is such that the tmpfs pages on their second pass are the ones that are wanted again, and other pages not. But I've not yet found such a load: on all machines, under the loads I've tried, immediate swap_writepage speeds up shmem swapping: especially when using the SLUB allocator (and more effectively than slub_max_order=0), but also with the others; and it also reduces the variance between runs. How much faster varies widely: a factor of five is rare, 5% is common. One load which might have suffered: imagine a swapping shmem load in a limited mem_cgroup on a machine with plenty of memory. Before 2.6.29 the swapcache was not charged, and such a load would have run quickest with the shmem swapcache never written to swap. But now swapcache is charged, so even this load benefits from shmem_writepage directly to swap. Apologies for the #ifndef CONFIG_SWAP swap_writepage() stub in swap.h: it's silly because that will never get called; but refactoring shmem.c sensibly according to CONFIG_SWAP will be a separate task. Signed-off-by: Hugh Dickins Acked-by: Pekka Enberg Acked-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 5 +++++ mm/shmem.c | 3 +-- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/include/linux/swap.h b/include/linux/swap.h index b8b0c4ce83e6..62d81435347a 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -382,6 +382,11 @@ static inline struct page *swapin_readahead(swp_entry_t swp, gfp_t gfp_mask, return NULL; } +static inline int swap_writepage(struct page *p, struct writeback_control *wbc) +{ + return 0; +} + static inline struct page *lookup_swap_cache(swp_entry_t swp) { return NULL; diff --git a/mm/shmem.c b/mm/shmem.c index 7ec78e24a30d..d94d2e9146bc 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1068,8 +1068,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) swap_duplicate(swap); BUG_ON(page_mapped(page)); page_cache_release(page); /* pagecache ref */ - set_page_dirty(page); - unlock_page(page); + swap_writepage(page, wbc); if (inode) { mutex_lock(&shmem_swaplist_mutex); /* move instead of add in case we're racing */ From 0b42afd0a3051952924ed373dcda61beb23fcb58 Mon Sep 17 00:00:00 2001 From: Roel Kluin Date: Tue, 31 Mar 2009 15:23:34 -0700 Subject: [PATCH 342/478] alpha: fix macros When this macros isn't called with 'fixup', e.g. with foo this will incorectly expand to foo->foo.bits.errreg Signed-off-by: Roel Kluin Cc: Ivan Kokshaysky Cc: Richard Henderson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/alpha/include/asm/uaccess.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/alpha/include/asm/uaccess.h b/arch/alpha/include/asm/uaccess.h index 22de3b434a22..163f3053001c 100644 --- a/arch/alpha/include/asm/uaccess.h +++ b/arch/alpha/include/asm/uaccess.h @@ -498,13 +498,13 @@ struct exception_table_entry }; /* Returns the new pc */ -#define fixup_exception(map_reg, fixup, pc) \ +#define fixup_exception(map_reg, _fixup, pc) \ ({ \ - if ((fixup)->fixup.bits.valreg != 31) \ - map_reg((fixup)->fixup.bits.valreg) = 0; \ - if ((fixup)->fixup.bits.errreg != 31) \ - map_reg((fixup)->fixup.bits.errreg) = -EFAULT; \ - (pc) + (fixup)->fixup.bits.nextinsn; \ + if ((_fixup)->fixup.bits.valreg != 31) \ + map_reg((_fixup)->fixup.bits.valreg) = 0; \ + if ((_fixup)->fixup.bits.errreg != 31) \ + map_reg((_fixup)->fixup.bits.errreg) = -EFAULT; \ + (pc) + (_fixup)->fixup.bits.nextinsn; \ }) From a94066992b3050a7bd9a82bf73bf19f6052d2f82 Mon Sep 17 00:00:00 2001 From: Cheng Renquan Date: Tue, 31 Mar 2009 15:23:35 -0700 Subject: [PATCH 343/478] MAINTAINERS: add the missing linux alpha port mailling list Signed-off-by: Cheng Renquan Cc: Richard Henderson Cc: Ivan Kokshaysky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index c5f4e9d27b64..a64f2920a8f3 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -357,6 +357,7 @@ S: Odd Fixes for 2.4; Maintained for 2.6. P: Ivan Kokshaysky M: ink@jurassic.park.msu.ru S: Maintained for 2.4; PCI support for 2.6. +L: linux-alpha@vger.kernel.org AMD GEODE CS5536 USB DEVICE CONTROLLER DRIVER P: Thomas Dahlmann From a6209d6d71f2ab8c63cc1587ef65490d83022baf Mon Sep 17 00:00:00 2001 From: Ivan Kokshaysky Date: Tue, 31 Mar 2009 15:23:35 -0700 Subject: [PATCH 344/478] alpha: xchg/cmpxchg cleanup and fixes - "_local" versions of xchg/cmpxchg functions duplicate code of non-local ones (quite a few pages of assembler), except memory barriers. We can generate these two variants from a single header file using simple macros; - convert xchg macro back to inline function using always_inline attribute; - use proper argument types for cmpxchg_u8/u16 functions to fix a problem with negative arguments. Signed-off-by: Ivan Kokshaysky Cc: Richard Henderson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/alpha/include/asm/system.h | 559 ++++---------------------------- arch/alpha/include/asm/xchg.h | 258 +++++++++++++++ 2 files changed, 314 insertions(+), 503 deletions(-) create mode 100644 arch/alpha/include/asm/xchg.h diff --git a/arch/alpha/include/asm/system.h b/arch/alpha/include/asm/system.h index afe20fa58c99..5aa40cca4f23 100644 --- a/arch/alpha/include/asm/system.h +++ b/arch/alpha/include/asm/system.h @@ -309,519 +309,72 @@ extern int __min_ipl; #define tbia() __tbi(-2, /* no second argument */) /* - * Atomic exchange. - * Since it can be used to implement critical sections - * it must clobber "memory" (also for interrupts in UP). + * Atomic exchange routines. */ -static inline unsigned long -__xchg_u8(volatile char *m, unsigned long val) -{ - unsigned long ret, tmp, addr64; +#define __ASM__MB +#define ____xchg(type, args...) __xchg ## type ## _local(args) +#define ____cmpxchg(type, args...) __cmpxchg ## type ## _local(args) +#include - __asm__ __volatile__( - " andnot %4,7,%3\n" - " insbl %1,%4,%1\n" - "1: ldq_l %2,0(%3)\n" - " extbl %2,%4,%0\n" - " mskbl %2,%4,%2\n" - " or %1,%2,%2\n" - " stq_c %2,0(%3)\n" - " beq %2,2f\n" -#ifdef CONFIG_SMP - " mb\n" -#endif - ".subsection 2\n" - "2: br 1b\n" - ".previous" - : "=&r" (ret), "=&r" (val), "=&r" (tmp), "=&r" (addr64) - : "r" ((long)m), "1" (val) : "memory"); - - return ret; -} - -static inline unsigned long -__xchg_u16(volatile short *m, unsigned long val) -{ - unsigned long ret, tmp, addr64; - - __asm__ __volatile__( - " andnot %4,7,%3\n" - " inswl %1,%4,%1\n" - "1: ldq_l %2,0(%3)\n" - " extwl %2,%4,%0\n" - " mskwl %2,%4,%2\n" - " or %1,%2,%2\n" - " stq_c %2,0(%3)\n" - " beq %2,2f\n" -#ifdef CONFIG_SMP - " mb\n" -#endif - ".subsection 2\n" - "2: br 1b\n" - ".previous" - : "=&r" (ret), "=&r" (val), "=&r" (tmp), "=&r" (addr64) - : "r" ((long)m), "1" (val) : "memory"); - - return ret; -} - -static inline unsigned long -__xchg_u32(volatile int *m, unsigned long val) -{ - unsigned long dummy; - - __asm__ __volatile__( - "1: ldl_l %0,%4\n" - " bis $31,%3,%1\n" - " stl_c %1,%2\n" - " beq %1,2f\n" -#ifdef CONFIG_SMP - " mb\n" -#endif - ".subsection 2\n" - "2: br 1b\n" - ".previous" - : "=&r" (val), "=&r" (dummy), "=m" (*m) - : "rI" (val), "m" (*m) : "memory"); - - return val; -} - -static inline unsigned long -__xchg_u64(volatile long *m, unsigned long val) -{ - unsigned long dummy; - - __asm__ __volatile__( - "1: ldq_l %0,%4\n" - " bis $31,%3,%1\n" - " stq_c %1,%2\n" - " beq %1,2f\n" -#ifdef CONFIG_SMP - " mb\n" -#endif - ".subsection 2\n" - "2: br 1b\n" - ".previous" - : "=&r" (val), "=&r" (dummy), "=m" (*m) - : "rI" (val), "m" (*m) : "memory"); - - return val; -} - -/* This function doesn't exist, so you'll get a linker error - if something tries to do an invalid xchg(). */ -extern void __xchg_called_with_bad_pointer(void); - -#define __xchg(ptr, x, size) \ -({ \ - unsigned long __xchg__res; \ - volatile void *__xchg__ptr = (ptr); \ - switch (size) { \ - case 1: __xchg__res = __xchg_u8(__xchg__ptr, x); break; \ - case 2: __xchg__res = __xchg_u16(__xchg__ptr, x); break; \ - case 4: __xchg__res = __xchg_u32(__xchg__ptr, x); break; \ - case 8: __xchg__res = __xchg_u64(__xchg__ptr, x); break; \ - default: __xchg_called_with_bad_pointer(); __xchg__res = x; \ - } \ - __xchg__res; \ -}) - -#define xchg(ptr,x) \ - ({ \ - __typeof__(*(ptr)) _x_ = (x); \ - (__typeof__(*(ptr))) __xchg((ptr), (unsigned long)_x_, sizeof(*(ptr))); \ +#define xchg_local(ptr,x) \ + ({ \ + __typeof__(*(ptr)) _x_ = (x); \ + (__typeof__(*(ptr))) __xchg_local((ptr), (unsigned long)_x_, \ + sizeof(*(ptr))); \ }) -static inline unsigned long -__xchg_u8_local(volatile char *m, unsigned long val) -{ - unsigned long ret, tmp, addr64; - - __asm__ __volatile__( - " andnot %4,7,%3\n" - " insbl %1,%4,%1\n" - "1: ldq_l %2,0(%3)\n" - " extbl %2,%4,%0\n" - " mskbl %2,%4,%2\n" - " or %1,%2,%2\n" - " stq_c %2,0(%3)\n" - " beq %2,2f\n" - ".subsection 2\n" - "2: br 1b\n" - ".previous" - : "=&r" (ret), "=&r" (val), "=&r" (tmp), "=&r" (addr64) - : "r" ((long)m), "1" (val) : "memory"); - - return ret; -} - -static inline unsigned long -__xchg_u16_local(volatile short *m, unsigned long val) -{ - unsigned long ret, tmp, addr64; - - __asm__ __volatile__( - " andnot %4,7,%3\n" - " inswl %1,%4,%1\n" - "1: ldq_l %2,0(%3)\n" - " extwl %2,%4,%0\n" - " mskwl %2,%4,%2\n" - " or %1,%2,%2\n" - " stq_c %2,0(%3)\n" - " beq %2,2f\n" - ".subsection 2\n" - "2: br 1b\n" - ".previous" - : "=&r" (ret), "=&r" (val), "=&r" (tmp), "=&r" (addr64) - : "r" ((long)m), "1" (val) : "memory"); - - return ret; -} - -static inline unsigned long -__xchg_u32_local(volatile int *m, unsigned long val) -{ - unsigned long dummy; - - __asm__ __volatile__( - "1: ldl_l %0,%4\n" - " bis $31,%3,%1\n" - " stl_c %1,%2\n" - " beq %1,2f\n" - ".subsection 2\n" - "2: br 1b\n" - ".previous" - : "=&r" (val), "=&r" (dummy), "=m" (*m) - : "rI" (val), "m" (*m) : "memory"); - - return val; -} - -static inline unsigned long -__xchg_u64_local(volatile long *m, unsigned long val) -{ - unsigned long dummy; - - __asm__ __volatile__( - "1: ldq_l %0,%4\n" - " bis $31,%3,%1\n" - " stq_c %1,%2\n" - " beq %1,2f\n" - ".subsection 2\n" - "2: br 1b\n" - ".previous" - : "=&r" (val), "=&r" (dummy), "=m" (*m) - : "rI" (val), "m" (*m) : "memory"); - - return val; -} - -#define __xchg_local(ptr, x, size) \ -({ \ - unsigned long __xchg__res; \ - volatile void *__xchg__ptr = (ptr); \ - switch (size) { \ - case 1: __xchg__res = __xchg_u8_local(__xchg__ptr, x); break; \ - case 2: __xchg__res = __xchg_u16_local(__xchg__ptr, x); break; \ - case 4: __xchg__res = __xchg_u32_local(__xchg__ptr, x); break; \ - case 8: __xchg__res = __xchg_u64_local(__xchg__ptr, x); break; \ - default: __xchg_called_with_bad_pointer(); __xchg__res = x; \ - } \ - __xchg__res; \ -}) - -#define xchg_local(ptr,x) \ - ({ \ - __typeof__(*(ptr)) _x_ = (x); \ - (__typeof__(*(ptr))) __xchg_local((ptr), (unsigned long)_x_, \ - sizeof(*(ptr))); \ +#define cmpxchg_local(ptr, o, n) \ + ({ \ + __typeof__(*(ptr)) _o_ = (o); \ + __typeof__(*(ptr)) _n_ = (n); \ + (__typeof__(*(ptr))) __cmpxchg_local((ptr), (unsigned long)_o_, \ + (unsigned long)_n_, \ + sizeof(*(ptr))); \ }) -/* - * Atomic compare and exchange. Compare OLD with MEM, if identical, - * store NEW in MEM. Return the initial value in MEM. Success is - * indicated by comparing RETURN with OLD. - * - * The memory barrier should be placed in SMP only when we actually - * make the change. If we don't change anything (so if the returned - * prev is equal to old) then we aren't acquiring anything new and - * we don't need any memory barrier as far I can tell. - */ +#define cmpxchg64_local(ptr, o, n) \ + ({ \ + BUILD_BUG_ON(sizeof(*(ptr)) != 8); \ + cmpxchg_local((ptr), (o), (n)); \ + }) + +#ifdef CONFIG_SMP +#undef __ASM__MB +#define __ASM__MB "\tmb\n" +#endif +#undef ____xchg +#undef ____cmpxchg +#define ____xchg(type, args...) __xchg ##type(args) +#define ____cmpxchg(type, args...) __cmpxchg ##type(args) +#include + +#define xchg(ptr,x) \ + ({ \ + __typeof__(*(ptr)) _x_ = (x); \ + (__typeof__(*(ptr))) __xchg((ptr), (unsigned long)_x_, \ + sizeof(*(ptr))); \ + }) + +#define cmpxchg(ptr, o, n) \ + ({ \ + __typeof__(*(ptr)) _o_ = (o); \ + __typeof__(*(ptr)) _n_ = (n); \ + (__typeof__(*(ptr))) __cmpxchg((ptr), (unsigned long)_o_, \ + (unsigned long)_n_, sizeof(*(ptr)));\ + }) + +#define cmpxchg64(ptr, o, n) \ + ({ \ + BUILD_BUG_ON(sizeof(*(ptr)) != 8); \ + cmpxchg((ptr), (o), (n)); \ + }) + +#undef __ASM__MB +#undef ____cmpxchg #define __HAVE_ARCH_CMPXCHG 1 -static inline unsigned long -__cmpxchg_u8(volatile char *m, long old, long new) -{ - unsigned long prev, tmp, cmp, addr64; - - __asm__ __volatile__( - " andnot %5,7,%4\n" - " insbl %1,%5,%1\n" - "1: ldq_l %2,0(%4)\n" - " extbl %2,%5,%0\n" - " cmpeq %0,%6,%3\n" - " beq %3,2f\n" - " mskbl %2,%5,%2\n" - " or %1,%2,%2\n" - " stq_c %2,0(%4)\n" - " beq %2,3f\n" -#ifdef CONFIG_SMP - " mb\n" -#endif - "2:\n" - ".subsection 2\n" - "3: br 1b\n" - ".previous" - : "=&r" (prev), "=&r" (new), "=&r" (tmp), "=&r" (cmp), "=&r" (addr64) - : "r" ((long)m), "Ir" (old), "1" (new) : "memory"); - - return prev; -} - -static inline unsigned long -__cmpxchg_u16(volatile short *m, long old, long new) -{ - unsigned long prev, tmp, cmp, addr64; - - __asm__ __volatile__( - " andnot %5,7,%4\n" - " inswl %1,%5,%1\n" - "1: ldq_l %2,0(%4)\n" - " extwl %2,%5,%0\n" - " cmpeq %0,%6,%3\n" - " beq %3,2f\n" - " mskwl %2,%5,%2\n" - " or %1,%2,%2\n" - " stq_c %2,0(%4)\n" - " beq %2,3f\n" -#ifdef CONFIG_SMP - " mb\n" -#endif - "2:\n" - ".subsection 2\n" - "3: br 1b\n" - ".previous" - : "=&r" (prev), "=&r" (new), "=&r" (tmp), "=&r" (cmp), "=&r" (addr64) - : "r" ((long)m), "Ir" (old), "1" (new) : "memory"); - - return prev; -} - -static inline unsigned long -__cmpxchg_u32(volatile int *m, int old, int new) -{ - unsigned long prev, cmp; - - __asm__ __volatile__( - "1: ldl_l %0,%5\n" - " cmpeq %0,%3,%1\n" - " beq %1,2f\n" - " mov %4,%1\n" - " stl_c %1,%2\n" - " beq %1,3f\n" -#ifdef CONFIG_SMP - " mb\n" -#endif - "2:\n" - ".subsection 2\n" - "3: br 1b\n" - ".previous" - : "=&r"(prev), "=&r"(cmp), "=m"(*m) - : "r"((long) old), "r"(new), "m"(*m) : "memory"); - - return prev; -} - -static inline unsigned long -__cmpxchg_u64(volatile long *m, unsigned long old, unsigned long new) -{ - unsigned long prev, cmp; - - __asm__ __volatile__( - "1: ldq_l %0,%5\n" - " cmpeq %0,%3,%1\n" - " beq %1,2f\n" - " mov %4,%1\n" - " stq_c %1,%2\n" - " beq %1,3f\n" -#ifdef CONFIG_SMP - " mb\n" -#endif - "2:\n" - ".subsection 2\n" - "3: br 1b\n" - ".previous" - : "=&r"(prev), "=&r"(cmp), "=m"(*m) - : "r"((long) old), "r"(new), "m"(*m) : "memory"); - - return prev; -} - -/* This function doesn't exist, so you'll get a linker error - if something tries to do an invalid cmpxchg(). */ -extern void __cmpxchg_called_with_bad_pointer(void); - -static __always_inline unsigned long -__cmpxchg(volatile void *ptr, unsigned long old, unsigned long new, int size) -{ - switch (size) { - case 1: - return __cmpxchg_u8(ptr, old, new); - case 2: - return __cmpxchg_u16(ptr, old, new); - case 4: - return __cmpxchg_u32(ptr, old, new); - case 8: - return __cmpxchg_u64(ptr, old, new); - } - __cmpxchg_called_with_bad_pointer(); - return old; -} - -#define cmpxchg(ptr, o, n) \ - ({ \ - __typeof__(*(ptr)) _o_ = (o); \ - __typeof__(*(ptr)) _n_ = (n); \ - (__typeof__(*(ptr))) __cmpxchg((ptr), (unsigned long)_o_, \ - (unsigned long)_n_, sizeof(*(ptr))); \ - }) -#define cmpxchg64(ptr, o, n) \ - ({ \ - BUILD_BUG_ON(sizeof(*(ptr)) != 8); \ - cmpxchg((ptr), (o), (n)); \ - }) - -static inline unsigned long -__cmpxchg_u8_local(volatile char *m, long old, long new) -{ - unsigned long prev, tmp, cmp, addr64; - - __asm__ __volatile__( - " andnot %5,7,%4\n" - " insbl %1,%5,%1\n" - "1: ldq_l %2,0(%4)\n" - " extbl %2,%5,%0\n" - " cmpeq %0,%6,%3\n" - " beq %3,2f\n" - " mskbl %2,%5,%2\n" - " or %1,%2,%2\n" - " stq_c %2,0(%4)\n" - " beq %2,3f\n" - "2:\n" - ".subsection 2\n" - "3: br 1b\n" - ".previous" - : "=&r" (prev), "=&r" (new), "=&r" (tmp), "=&r" (cmp), "=&r" (addr64) - : "r" ((long)m), "Ir" (old), "1" (new) : "memory"); - - return prev; -} - -static inline unsigned long -__cmpxchg_u16_local(volatile short *m, long old, long new) -{ - unsigned long prev, tmp, cmp, addr64; - - __asm__ __volatile__( - " andnot %5,7,%4\n" - " inswl %1,%5,%1\n" - "1: ldq_l %2,0(%4)\n" - " extwl %2,%5,%0\n" - " cmpeq %0,%6,%3\n" - " beq %3,2f\n" - " mskwl %2,%5,%2\n" - " or %1,%2,%2\n" - " stq_c %2,0(%4)\n" - " beq %2,3f\n" - "2:\n" - ".subsection 2\n" - "3: br 1b\n" - ".previous" - : "=&r" (prev), "=&r" (new), "=&r" (tmp), "=&r" (cmp), "=&r" (addr64) - : "r" ((long)m), "Ir" (old), "1" (new) : "memory"); - - return prev; -} - -static inline unsigned long -__cmpxchg_u32_local(volatile int *m, int old, int new) -{ - unsigned long prev, cmp; - - __asm__ __volatile__( - "1: ldl_l %0,%5\n" - " cmpeq %0,%3,%1\n" - " beq %1,2f\n" - " mov %4,%1\n" - " stl_c %1,%2\n" - " beq %1,3f\n" - "2:\n" - ".subsection 2\n" - "3: br 1b\n" - ".previous" - : "=&r"(prev), "=&r"(cmp), "=m"(*m) - : "r"((long) old), "r"(new), "m"(*m) : "memory"); - - return prev; -} - -static inline unsigned long -__cmpxchg_u64_local(volatile long *m, unsigned long old, unsigned long new) -{ - unsigned long prev, cmp; - - __asm__ __volatile__( - "1: ldq_l %0,%5\n" - " cmpeq %0,%3,%1\n" - " beq %1,2f\n" - " mov %4,%1\n" - " stq_c %1,%2\n" - " beq %1,3f\n" - "2:\n" - ".subsection 2\n" - "3: br 1b\n" - ".previous" - : "=&r"(prev), "=&r"(cmp), "=m"(*m) - : "r"((long) old), "r"(new), "m"(*m) : "memory"); - - return prev; -} - -static __always_inline unsigned long -__cmpxchg_local(volatile void *ptr, unsigned long old, unsigned long new, - int size) -{ - switch (size) { - case 1: - return __cmpxchg_u8_local(ptr, old, new); - case 2: - return __cmpxchg_u16_local(ptr, old, new); - case 4: - return __cmpxchg_u32_local(ptr, old, new); - case 8: - return __cmpxchg_u64_local(ptr, old, new); - } - __cmpxchg_called_with_bad_pointer(); - return old; -} - -#define cmpxchg_local(ptr, o, n) \ - ({ \ - __typeof__(*(ptr)) _o_ = (o); \ - __typeof__(*(ptr)) _n_ = (n); \ - (__typeof__(*(ptr))) __cmpxchg_local((ptr), (unsigned long)_o_, \ - (unsigned long)_n_, sizeof(*(ptr))); \ - }) -#define cmpxchg64_local(ptr, o, n) \ - ({ \ - BUILD_BUG_ON(sizeof(*(ptr)) != 8); \ - cmpxchg_local((ptr), (o), (n)); \ - }) - - #endif /* __ASSEMBLY__ */ #define arch_align_stack(x) (x) diff --git a/arch/alpha/include/asm/xchg.h b/arch/alpha/include/asm/xchg.h new file mode 100644 index 000000000000..beba1b803e0d --- /dev/null +++ b/arch/alpha/include/asm/xchg.h @@ -0,0 +1,258 @@ +#ifndef __ALPHA_SYSTEM_H +#error Do not include xchg.h directly! +#else +/* + * xchg/xchg_local and cmpxchg/cmpxchg_local share the same code + * except that local version do not have the expensive memory barrier. + * So this file is included twice from asm/system.h. + */ + +/* + * Atomic exchange. + * Since it can be used to implement critical sections + * it must clobber "memory" (also for interrupts in UP). + */ + +static inline unsigned long +____xchg(_u8, volatile char *m, unsigned long val) +{ + unsigned long ret, tmp, addr64; + + __asm__ __volatile__( + " andnot %4,7,%3\n" + " insbl %1,%4,%1\n" + "1: ldq_l %2,0(%3)\n" + " extbl %2,%4,%0\n" + " mskbl %2,%4,%2\n" + " or %1,%2,%2\n" + " stq_c %2,0(%3)\n" + " beq %2,2f\n" + __ASM__MB + ".subsection 2\n" + "2: br 1b\n" + ".previous" + : "=&r" (ret), "=&r" (val), "=&r" (tmp), "=&r" (addr64) + : "r" ((long)m), "1" (val) : "memory"); + + return ret; +} + +static inline unsigned long +____xchg(_u16, volatile short *m, unsigned long val) +{ + unsigned long ret, tmp, addr64; + + __asm__ __volatile__( + " andnot %4,7,%3\n" + " inswl %1,%4,%1\n" + "1: ldq_l %2,0(%3)\n" + " extwl %2,%4,%0\n" + " mskwl %2,%4,%2\n" + " or %1,%2,%2\n" + " stq_c %2,0(%3)\n" + " beq %2,2f\n" + __ASM__MB + ".subsection 2\n" + "2: br 1b\n" + ".previous" + : "=&r" (ret), "=&r" (val), "=&r" (tmp), "=&r" (addr64) + : "r" ((long)m), "1" (val) : "memory"); + + return ret; +} + +static inline unsigned long +____xchg(_u32, volatile int *m, unsigned long val) +{ + unsigned long dummy; + + __asm__ __volatile__( + "1: ldl_l %0,%4\n" + " bis $31,%3,%1\n" + " stl_c %1,%2\n" + " beq %1,2f\n" + __ASM__MB + ".subsection 2\n" + "2: br 1b\n" + ".previous" + : "=&r" (val), "=&r" (dummy), "=m" (*m) + : "rI" (val), "m" (*m) : "memory"); + + return val; +} + +static inline unsigned long +____xchg(_u64, volatile long *m, unsigned long val) +{ + unsigned long dummy; + + __asm__ __volatile__( + "1: ldq_l %0,%4\n" + " bis $31,%3,%1\n" + " stq_c %1,%2\n" + " beq %1,2f\n" + __ASM__MB + ".subsection 2\n" + "2: br 1b\n" + ".previous" + : "=&r" (val), "=&r" (dummy), "=m" (*m) + : "rI" (val), "m" (*m) : "memory"); + + return val; +} + +/* This function doesn't exist, so you'll get a linker error + if something tries to do an invalid xchg(). */ +extern void __xchg_called_with_bad_pointer(void); + +static __always_inline unsigned long +____xchg(, volatile void *ptr, unsigned long x, int size) +{ + switch (size) { + case 1: + return ____xchg(_u8, ptr, x); + case 2: + return ____xchg(_u16, ptr, x); + case 4: + return ____xchg(_u32, ptr, x); + case 8: + return ____xchg(_u64, ptr, x); + } + __xchg_called_with_bad_pointer(); + return x; +} + +/* + * Atomic compare and exchange. Compare OLD with MEM, if identical, + * store NEW in MEM. Return the initial value in MEM. Success is + * indicated by comparing RETURN with OLD. + * + * The memory barrier should be placed in SMP only when we actually + * make the change. If we don't change anything (so if the returned + * prev is equal to old) then we aren't acquiring anything new and + * we don't need any memory barrier as far I can tell. + */ + +static inline unsigned long +____cmpxchg(_u8, volatile char *m, unsigned char old, unsigned char new) +{ + unsigned long prev, tmp, cmp, addr64; + + __asm__ __volatile__( + " andnot %5,7,%4\n" + " insbl %1,%5,%1\n" + "1: ldq_l %2,0(%4)\n" + " extbl %2,%5,%0\n" + " cmpeq %0,%6,%3\n" + " beq %3,2f\n" + " mskbl %2,%5,%2\n" + " or %1,%2,%2\n" + " stq_c %2,0(%4)\n" + " beq %2,3f\n" + __ASM__MB + "2:\n" + ".subsection 2\n" + "3: br 1b\n" + ".previous" + : "=&r" (prev), "=&r" (new), "=&r" (tmp), "=&r" (cmp), "=&r" (addr64) + : "r" ((long)m), "Ir" (old), "1" (new) : "memory"); + + return prev; +} + +static inline unsigned long +____cmpxchg(_u16, volatile short *m, unsigned short old, unsigned short new) +{ + unsigned long prev, tmp, cmp, addr64; + + __asm__ __volatile__( + " andnot %5,7,%4\n" + " inswl %1,%5,%1\n" + "1: ldq_l %2,0(%4)\n" + " extwl %2,%5,%0\n" + " cmpeq %0,%6,%3\n" + " beq %3,2f\n" + " mskwl %2,%5,%2\n" + " or %1,%2,%2\n" + " stq_c %2,0(%4)\n" + " beq %2,3f\n" + __ASM__MB + "2:\n" + ".subsection 2\n" + "3: br 1b\n" + ".previous" + : "=&r" (prev), "=&r" (new), "=&r" (tmp), "=&r" (cmp), "=&r" (addr64) + : "r" ((long)m), "Ir" (old), "1" (new) : "memory"); + + return prev; +} + +static inline unsigned long +____cmpxchg(_u32, volatile int *m, int old, int new) +{ + unsigned long prev, cmp; + + __asm__ __volatile__( + "1: ldl_l %0,%5\n" + " cmpeq %0,%3,%1\n" + " beq %1,2f\n" + " mov %4,%1\n" + " stl_c %1,%2\n" + " beq %1,3f\n" + __ASM__MB + "2:\n" + ".subsection 2\n" + "3: br 1b\n" + ".previous" + : "=&r"(prev), "=&r"(cmp), "=m"(*m) + : "r"((long) old), "r"(new), "m"(*m) : "memory"); + + return prev; +} + +static inline unsigned long +____cmpxchg(_u64, volatile long *m, unsigned long old, unsigned long new) +{ + unsigned long prev, cmp; + + __asm__ __volatile__( + "1: ldq_l %0,%5\n" + " cmpeq %0,%3,%1\n" + " beq %1,2f\n" + " mov %4,%1\n" + " stq_c %1,%2\n" + " beq %1,3f\n" + __ASM__MB + "2:\n" + ".subsection 2\n" + "3: br 1b\n" + ".previous" + : "=&r"(prev), "=&r"(cmp), "=m"(*m) + : "r"((long) old), "r"(new), "m"(*m) : "memory"); + + return prev; +} + +/* This function doesn't exist, so you'll get a linker error + if something tries to do an invalid cmpxchg(). */ +extern void __cmpxchg_called_with_bad_pointer(void); + +static __always_inline unsigned long +____cmpxchg(, volatile void *ptr, unsigned long old, unsigned long new, + int size) +{ + switch (size) { + case 1: + return ____cmpxchg(_u8, ptr, old, new); + case 2: + return ____cmpxchg(_u16, ptr, old, new); + case 4: + return ____cmpxchg(_u32, ptr, old, new); + case 8: + return ____cmpxchg(_u64, ptr, old, new); + } + __cmpxchg_called_with_bad_pointer(); + return old; +} + +#endif From 5f0e3da6e186598bbd2569410ab60fa645ba00c9 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 31 Mar 2009 15:23:36 -0700 Subject: [PATCH 345/478] alpha: convert u64 to unsigned long long Convert alpha architecture to use u64 as unsigned long long. This is being done so that (a) all arches use u64 as unsigned long long and (b) printk of a u64 as %ll[ux] will not generate format warnings by gcc. The only gcc cross-compiler that I have is 4.0.2, which generates errors about miscompiling __weak references, so I have commented out that line in compiler-gcc4.h so that most of these compile, but more builds and real machine testing would be Real Good. [akpm@linux-foundation.org: fix warning] [akpm@linux-foundation.org: fix build] [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Randy Dunlap Cc: Richard Henderson Cc: Ivan Kokshaysky From: Andrew Morton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/alpha/include/asm/machvec.h | 2 +- arch/alpha/include/asm/types.h | 5 ++++ arch/alpha/kernel/err_ev6.c | 4 ++-- arch/alpha/kernel/err_ev7.c | 6 ++--- arch/alpha/kernel/err_marvel.c | 40 ++++++++++++++++---------------- arch/alpha/kernel/err_titan.c | 28 +++++++++++----------- arch/alpha/kernel/pci.c | 2 +- arch/alpha/kernel/pci_iommu.c | 34 +++++++++++++-------------- arch/alpha/kernel/proto.h | 16 ++++++------- arch/alpha/kernel/setup.c | 2 +- arch/alpha/kernel/smc37c669.c | 4 ++-- arch/alpha/kernel/sys_jensen.c | 3 +-- arch/alpha/kernel/sys_sable.c | 4 ++-- arch/alpha/kernel/traps.c | 2 +- 14 files changed, 78 insertions(+), 74 deletions(-) diff --git a/arch/alpha/include/asm/machvec.h b/arch/alpha/include/asm/machvec.h index fea4ea75b79d..13cd42743810 100644 --- a/arch/alpha/include/asm/machvec.h +++ b/arch/alpha/include/asm/machvec.h @@ -80,7 +80,7 @@ struct alpha_machine_vector void (*update_irq_hw)(unsigned long, unsigned long, int); void (*ack_irq)(unsigned long); void (*device_interrupt)(unsigned long vector); - void (*machine_check)(u64 vector, u64 la); + void (*machine_check)(unsigned long vector, unsigned long la); void (*smp_callin)(void); void (*init_arch)(void); diff --git a/arch/alpha/include/asm/types.h b/arch/alpha/include/asm/types.h index c1541353ccef..f072f344497e 100644 --- a/arch/alpha/include/asm/types.h +++ b/arch/alpha/include/asm/types.h @@ -8,7 +8,12 @@ * not a major issue. However, for interoperability, libraries still * need to be careful to avoid a name clashes. */ + +#ifdef __KERNEL__ +#include +#else #include +#endif #ifndef __ASSEMBLY__ diff --git a/arch/alpha/kernel/err_ev6.c b/arch/alpha/kernel/err_ev6.c index 11aee012a8ae..985e5c1681ac 100644 --- a/arch/alpha/kernel/err_ev6.c +++ b/arch/alpha/kernel/err_ev6.c @@ -157,8 +157,8 @@ ev6_parse_cbox(u64 c_addr, u64 c1_syn, u64 c2_syn, err_print_prefix, streamname[stream], bitsname[bits], sourcename[source]); - printk("%s Address: 0x%016lx\n" - " Syndrome[upper.lower]: %02lx.%02lx\n", + printk("%s Address: 0x%016llx\n" + " Syndrome[upper.lower]: %02llx.%02llx\n", err_print_prefix, c_addr, c2_syn, c1_syn); diff --git a/arch/alpha/kernel/err_ev7.c b/arch/alpha/kernel/err_ev7.c index 68cd493f54c5..73770c6ca013 100644 --- a/arch/alpha/kernel/err_ev7.c +++ b/arch/alpha/kernel/err_ev7.c @@ -246,13 +246,13 @@ ev7_process_pal_subpacket(struct el_subpacket *header) switch(header->type) { case EL_TYPE__PAL__LOGOUT_FRAME: - printk("%s*** MCHK occurred on LPID %ld (RBOX %lx)\n", + printk("%s*** MCHK occurred on LPID %ld (RBOX %llx)\n", err_print_prefix, packet->by_type.logout.whami, packet->by_type.logout.rbox_whami); el_print_timestamp(&packet->by_type.logout.timestamp); - printk("%s EXC_ADDR: %016lx\n" - " HALT_CODE: %lx\n", + printk("%s EXC_ADDR: %016llx\n" + " HALT_CODE: %llx\n", err_print_prefix, packet->by_type.logout.exc_addr, packet->by_type.logout.halt_code); diff --git a/arch/alpha/kernel/err_marvel.c b/arch/alpha/kernel/err_marvel.c index 413bf37eb094..6bfd243efba3 100644 --- a/arch/alpha/kernel/err_marvel.c +++ b/arch/alpha/kernel/err_marvel.c @@ -129,7 +129,7 @@ marvel_print_po7_crrct_sym(u64 crrct_sym) printk("%s Correctable Error Symptoms:\n" - "%s Syndrome: 0x%lx\n", + "%s Syndrome: 0x%llx\n", err_print_prefix, err_print_prefix, EXTRACT(crrct_sym, IO7__PO7_CRRCT_SYM__SYN)); marvel_print_err_cyc(EXTRACT(crrct_sym, IO7__PO7_CRRCT_SYM__ERR_CYC)); @@ -186,7 +186,7 @@ marvel_print_po7_uncrr_sym(u64 uncrr_sym, u64 valid_mask) uncrr_sym &= valid_mask; if (EXTRACT(valid_mask, IO7__PO7_UNCRR_SYM__SYN)) - printk("%s Syndrome: 0x%lx\n", + printk("%s Syndrome: 0x%llx\n", err_print_prefix, EXTRACT(uncrr_sym, IO7__PO7_UNCRR_SYM__SYN)); @@ -307,7 +307,7 @@ marvel_print_po7_ugbge_sym(u64 ugbge_sym) sprintf(opcode_str, "BlkIO"); break; default: - sprintf(opcode_str, "0x%lx\n", + sprintf(opcode_str, "0x%llx\n", EXTRACT(ugbge_sym, IO7__PO7_UGBGE_SYM__UPH_OPCODE)); break; } @@ -321,7 +321,7 @@ marvel_print_po7_ugbge_sym(u64 ugbge_sym) opcode_str); if (0xC5 != EXTRACT(ugbge_sym, IO7__PO7_UGBGE_SYM__UPH_OPCODE)) - printk("%s Packet Offset 0x%08lx\n", + printk("%s Packet Offset 0x%08llx\n", err_print_prefix, EXTRACT(ugbge_sym, IO7__PO7_UGBGE_SYM__UPH_PKT_OFF)); } @@ -480,8 +480,8 @@ marvel_print_po7_err_sum(struct ev7_pal_io_subpacket *io) printk("%s Lost Error\n", err_print_prefix); printk("%s Failing Packet:\n" - "%s Cycle 1: %016lx\n" - "%s Cycle 2: %016lx\n", + "%s Cycle 1: %016llx\n" + "%s Cycle 2: %016llx\n", err_print_prefix, err_print_prefix, io->po7_err_pkt0, err_print_prefix, io->po7_err_pkt1); @@ -515,9 +515,9 @@ marvel_print_pox_tlb_err(u64 tlb_err) if (!(tlb_err & IO7__POX_TLBERR__ERR_VALID)) return; - printk("%s TLB Error on index 0x%lx:\n" + printk("%s TLB Error on index 0x%llx:\n" "%s - %s\n" - "%s - Addr: 0x%016lx\n", + "%s - Addr: 0x%016llx\n", err_print_prefix, EXTRACT(tlb_err, IO7__POX_TLBERR__ERR_TLB_PTR), err_print_prefix, @@ -579,7 +579,7 @@ marvel_print_pox_spl_cmplt(u64 spl_cmplt) sprintf(message, "Uncorrectable Split Write Data Error"); break; default: - sprintf(message, "%08lx\n", + sprintf(message, "%08llx\n", EXTRACT(spl_cmplt, IO7__POX_SPLCMPLT__MESSAGE)); break; } @@ -620,9 +620,9 @@ marvel_print_pox_trans_sum(u64 trans_sum) return; printk("%s Transaction Summary:\n" - "%s Command: 0x%lx - %s\n" - "%s Address: 0x%016lx%s\n" - "%s PCI-X Master Slot: 0x%lx\n", + "%s Command: 0x%llx - %s\n" + "%s Address: 0x%016llx%s\n" + "%s PCI-X Master Slot: 0x%llx\n", err_print_prefix, err_print_prefix, EXTRACT(trans_sum, IO7__POX_TRANSUM__PCIX_CMD), @@ -964,12 +964,12 @@ marvel_process_io_error(struct ev7_lf_subpackets *lf_subpackets, int print) #if 0 printk("%s PORT 7 ERROR:\n" - "%s PO7_ERROR_SUM: %016lx\n" - "%s PO7_UNCRR_SYM: %016lx\n" - "%s PO7_CRRCT_SYM: %016lx\n" - "%s PO7_UGBGE_SYM: %016lx\n" - "%s PO7_ERR_PKT0: %016lx\n" - "%s PO7_ERR_PKT1: %016lx\n", + "%s PO7_ERROR_SUM: %016llx\n" + "%s PO7_UNCRR_SYM: %016llx\n" + "%s PO7_CRRCT_SYM: %016llx\n" + "%s PO7_UGBGE_SYM: %016llx\n" + "%s PO7_ERR_PKT0: %016llx\n" + "%s PO7_ERR_PKT1: %016llx\n", err_print_prefix, err_print_prefix, io->po7_error_sum, err_print_prefix, io->po7_uncrr_sym, @@ -987,12 +987,12 @@ marvel_process_io_error(struct ev7_lf_subpackets *lf_subpackets, int print) if (!MARVEL_IO_ERR_VALID(io->ports[i].pox_err_sum)) continue; - printk("%s PID %u PORT %d POx_ERR_SUM: %016lx\n", + printk("%s PID %u PORT %d POx_ERR_SUM: %016llx\n", err_print_prefix, lf_subpackets->io_pid, i, io->ports[i].pox_err_sum); marvel_print_pox_err(io->ports[i].pox_err_sum, &io->ports[i]); - printk("%s [ POx_FIRST_ERR: %016lx ]\n", + printk("%s [ POx_FIRST_ERR: %016llx ]\n", err_print_prefix, io->ports[i].pox_first_err); marvel_print_pox_err(io->ports[i].pox_first_err, &io->ports[i]); diff --git a/arch/alpha/kernel/err_titan.c b/arch/alpha/kernel/err_titan.c index 257449ed15ef..c7e28a88d6e3 100644 --- a/arch/alpha/kernel/err_titan.c +++ b/arch/alpha/kernel/err_titan.c @@ -107,12 +107,12 @@ titan_parse_p_serror(int which, u64 serror, int print) if (!print) return status; - printk("%s PChip %d SERROR: %016lx\n", + printk("%s PChip %d SERROR: %016llx\n", err_print_prefix, which, serror); if (serror & TITAN__PCHIP_SERROR__ECCMASK) { printk("%s %sorrectable ECC Error:\n" " Source: %-6s Command: %-8s Syndrome: 0x%08x\n" - " Address: 0x%lx\n", + " Address: 0x%llx\n", err_print_prefix, (serror & TITAN__PCHIP_SERROR__UECC) ? "Unc" : "C", serror_src[EXTRACT(serror, TITAN__PCHIP_SERROR__SRC)], @@ -223,7 +223,7 @@ titan_parse_p_perror(int which, int port, u64 perror, int print) if (!print) return status; - printk("%s PChip %d %cPERROR: %016lx\n", + printk("%s PChip %d %cPERROR: %016llx\n", err_print_prefix, which, port ? 'A' : 'G', perror); if (perror & TITAN__PCHIP_PERROR__IPTPW) @@ -316,7 +316,7 @@ titan_parse_p_agperror(int which, u64 agperror, int print) addr = EXTRACT(agperror, TITAN__PCHIP_AGPERROR__ADDR) << 3; len = EXTRACT(agperror, TITAN__PCHIP_AGPERROR__LEN); - printk("%s PChip %d AGPERROR: %016lx\n", err_print_prefix, + printk("%s PChip %d AGPERROR: %016llx\n", err_print_prefix, which, agperror); if (agperror & TITAN__PCHIP_AGPERROR__NOWINDOW) printk("%s No Window\n", err_print_prefix); @@ -597,16 +597,16 @@ privateer_process_680_frame(struct el_common *mchk_header, int print) return status; /* TODO - decode instead of just dumping... */ - printk("%s Summary Flags: %016lx\n" - " CChip DIRx: %016lx\n" - " System Management IR: %016lx\n" - " CPU IR: %016lx\n" - " Power Supply IR: %016lx\n" - " LM78 Fault Status: %016lx\n" - " System Doors: %016lx\n" - " Temperature Warning: %016lx\n" - " Fan Control: %016lx\n" - " Fatal Power Down Code: %016lx\n", + printk("%s Summary Flags: %016llx\n" + " CChip DIRx: %016llx\n" + " System Management IR: %016llx\n" + " CPU IR: %016llx\n" + " Power Supply IR: %016llx\n" + " LM78 Fault Status: %016llx\n" + " System Doors: %016llx\n" + " Temperature Warning: %016llx\n" + " Fan Control: %016llx\n" + " Fatal Power Down Code: %016llx\n", err_print_prefix, emchk->summary, emchk->c_dirx, diff --git a/arch/alpha/kernel/pci.c b/arch/alpha/kernel/pci.c index a3b938811400..a91ba28999b5 100644 --- a/arch/alpha/kernel/pci.c +++ b/arch/alpha/kernel/pci.c @@ -168,7 +168,7 @@ pcibios_align_resource(void *data, struct resource *res, */ /* Align to multiple of size of minimum base. */ - alignto = max(0x1000UL, align); + alignto = max_t(resource_size_t, 0x1000, align); start = ALIGN(start, alignto); if (hose->sparse_mem_base && size <= 7 * 16*MB) { if (((start / (16*MB)) & 0x7) == 0) { diff --git a/arch/alpha/kernel/pci_iommu.c b/arch/alpha/kernel/pci_iommu.c index b9094da05d7a..bfb880af959d 100644 --- a/arch/alpha/kernel/pci_iommu.c +++ b/arch/alpha/kernel/pci_iommu.c @@ -247,7 +247,7 @@ pci_map_single_1(struct pci_dev *pdev, void *cpu_addr, size_t size, && paddr + size <= __direct_map_size) { ret = paddr + __direct_map_base; - DBGA2("pci_map_single: [%p,%lx] -> direct %lx from %p\n", + DBGA2("pci_map_single: [%p,%zx] -> direct %llx from %p\n", cpu_addr, size, ret, __builtin_return_address(0)); return ret; @@ -258,7 +258,7 @@ pci_map_single_1(struct pci_dev *pdev, void *cpu_addr, size_t size, if (dac_allowed) { ret = paddr + alpha_mv.pci_dac_offset; - DBGA2("pci_map_single: [%p,%lx] -> DAC %lx from %p\n", + DBGA2("pci_map_single: [%p,%zx] -> DAC %llx from %p\n", cpu_addr, size, ret, __builtin_return_address(0)); return ret; @@ -299,7 +299,7 @@ pci_map_single_1(struct pci_dev *pdev, void *cpu_addr, size_t size, ret = arena->dma_base + dma_ofs * PAGE_SIZE; ret += (unsigned long)cpu_addr & ~PAGE_MASK; - DBGA2("pci_map_single: [%p,%lx] np %ld -> sg %lx from %p\n", + DBGA2("pci_map_single: [%p,%zx] np %ld -> sg %llx from %p\n", cpu_addr, size, npages, ret, __builtin_return_address(0)); return ret; @@ -355,14 +355,14 @@ pci_unmap_single(struct pci_dev *pdev, dma_addr_t dma_addr, size_t size, && dma_addr < __direct_map_base + __direct_map_size) { /* Nothing to do. */ - DBGA2("pci_unmap_single: direct [%lx,%lx] from %p\n", + DBGA2("pci_unmap_single: direct [%llx,%zx] from %p\n", dma_addr, size, __builtin_return_address(0)); return; } if (dma_addr > 0xffffffff) { - DBGA2("pci64_unmap_single: DAC [%lx,%lx] from %p\n", + DBGA2("pci64_unmap_single: DAC [%llx,%zx] from %p\n", dma_addr, size, __builtin_return_address(0)); return; } @@ -373,9 +373,9 @@ pci_unmap_single(struct pci_dev *pdev, dma_addr_t dma_addr, size_t size, dma_ofs = (dma_addr - arena->dma_base) >> PAGE_SHIFT; if (dma_ofs * PAGE_SIZE >= arena->size) { - printk(KERN_ERR "Bogus pci_unmap_single: dma_addr %lx " - " base %lx size %x\n", dma_addr, arena->dma_base, - arena->size); + printk(KERN_ERR "Bogus pci_unmap_single: dma_addr %llx " + " base %llx size %x\n", + dma_addr, arena->dma_base, arena->size); return; BUG(); } @@ -394,7 +394,7 @@ pci_unmap_single(struct pci_dev *pdev, dma_addr_t dma_addr, size_t size, spin_unlock_irqrestore(&arena->lock, flags); - DBGA2("pci_unmap_single: sg [%lx,%lx] np %ld from %p\n", + DBGA2("pci_unmap_single: sg [%llx,%zx] np %ld from %p\n", dma_addr, size, npages, __builtin_return_address(0)); } EXPORT_SYMBOL(pci_unmap_single); @@ -444,7 +444,7 @@ try_again: goto try_again; } - DBGA2("pci_alloc_consistent: %lx -> [%p,%x] from %p\n", + DBGA2("pci_alloc_consistent: %zx -> [%p,%llx] from %p\n", size, cpu_addr, *dma_addrp, __builtin_return_address(0)); return cpu_addr; @@ -464,7 +464,7 @@ pci_free_consistent(struct pci_dev *pdev, size_t size, void *cpu_addr, pci_unmap_single(pdev, dma_addr, size, PCI_DMA_BIDIRECTIONAL); free_pages((unsigned long)cpu_addr, get_order(size)); - DBGA2("pci_free_consistent: [%x,%lx] from %p\n", + DBGA2("pci_free_consistent: [%llx,%zx] from %p\n", dma_addr, size, __builtin_return_address(0)); } EXPORT_SYMBOL(pci_free_consistent); @@ -551,7 +551,7 @@ sg_fill(struct device *dev, struct scatterlist *leader, struct scatterlist *end, out->dma_address = paddr + __direct_map_base; out->dma_length = size; - DBGA(" sg_fill: [%p,%lx] -> direct %lx\n", + DBGA(" sg_fill: [%p,%lx] -> direct %llx\n", __va(paddr), size, out->dma_address); return 0; @@ -563,7 +563,7 @@ sg_fill(struct device *dev, struct scatterlist *leader, struct scatterlist *end, out->dma_address = paddr + alpha_mv.pci_dac_offset; out->dma_length = size; - DBGA(" sg_fill: [%p,%lx] -> DAC %lx\n", + DBGA(" sg_fill: [%p,%lx] -> DAC %llx\n", __va(paddr), size, out->dma_address); return 0; @@ -589,7 +589,7 @@ sg_fill(struct device *dev, struct scatterlist *leader, struct scatterlist *end, out->dma_address = arena->dma_base + dma_ofs*PAGE_SIZE + paddr; out->dma_length = size; - DBGA(" sg_fill: [%p,%lx] -> sg %lx np %ld\n", + DBGA(" sg_fill: [%p,%lx] -> sg %llx np %ld\n", __va(paddr), size, out->dma_address, npages); /* All virtually contiguous. We need to find the length of each @@ -752,7 +752,7 @@ pci_unmap_sg(struct pci_dev *pdev, struct scatterlist *sg, int nents, if (addr > 0xffffffff) { /* It's a DAC address -- nothing to do. */ - DBGA(" (%ld) DAC [%lx,%lx]\n", + DBGA(" (%ld) DAC [%llx,%zx]\n", sg - end + nents, addr, size); continue; } @@ -760,12 +760,12 @@ pci_unmap_sg(struct pci_dev *pdev, struct scatterlist *sg, int nents, if (addr >= __direct_map_base && addr < __direct_map_base + __direct_map_size) { /* Nothing to do. */ - DBGA(" (%ld) direct [%lx,%lx]\n", + DBGA(" (%ld) direct [%llx,%zx]\n", sg - end + nents, addr, size); continue; } - DBGA(" (%ld) sg [%lx,%lx]\n", + DBGA(" (%ld) sg [%llx,%zx]\n", sg - end + nents, addr, size); npages = iommu_num_pages(addr, size, PAGE_SIZE); diff --git a/arch/alpha/kernel/proto.h b/arch/alpha/kernel/proto.h index fe14c6747cd6..567f2598d090 100644 --- a/arch/alpha/kernel/proto.h +++ b/arch/alpha/kernel/proto.h @@ -20,7 +20,7 @@ struct pci_controller; extern struct pci_ops apecs_pci_ops; extern void apecs_init_arch(void); extern void apecs_pci_clr_err(void); -extern void apecs_machine_check(u64, u64); +extern void apecs_machine_check(unsigned long vector, unsigned long la_ptr); extern void apecs_pci_tbi(struct pci_controller *, dma_addr_t, dma_addr_t); /* core_cia.c */ @@ -29,7 +29,7 @@ extern void cia_init_pci(void); extern void cia_init_arch(void); extern void pyxis_init_arch(void); extern void cia_kill_arch(int); -extern void cia_machine_check(u64, u64); +extern void cia_machine_check(unsigned long vector, unsigned long la_ptr); extern void cia_pci_tbi(struct pci_controller *, dma_addr_t, dma_addr_t); /* core_irongate.c */ @@ -42,7 +42,7 @@ extern void irongate_machine_check(u64, u64); /* core_lca.c */ extern struct pci_ops lca_pci_ops; extern void lca_init_arch(void); -extern void lca_machine_check(u64, u64); +extern void lca_machine_check(unsigned long vector, unsigned long la_ptr); extern void lca_pci_tbi(struct pci_controller *, dma_addr_t, dma_addr_t); /* core_marvel.c */ @@ -64,7 +64,7 @@ void io7_clear_errors(struct io7 *io7); extern struct pci_ops mcpcia_pci_ops; extern void mcpcia_init_arch(void); extern void mcpcia_init_hoses(void); -extern void mcpcia_machine_check(u64, u64); +extern void mcpcia_machine_check(unsigned long vector, unsigned long la_ptr); extern void mcpcia_pci_tbi(struct pci_controller *, dma_addr_t, dma_addr_t); /* core_polaris.c */ @@ -72,14 +72,14 @@ extern struct pci_ops polaris_pci_ops; extern int polaris_read_config_dword(struct pci_dev *, int, u32 *); extern int polaris_write_config_dword(struct pci_dev *, int, u32); extern void polaris_init_arch(void); -extern void polaris_machine_check(u64, u64); +extern void polaris_machine_check(unsigned long vector, unsigned long la_ptr); #define polaris_pci_tbi ((void *)0) /* core_t2.c */ extern struct pci_ops t2_pci_ops; extern void t2_init_arch(void); extern void t2_kill_arch(int); -extern void t2_machine_check(u64, u64); +extern void t2_machine_check(unsigned long vector, unsigned long la_ptr); extern void t2_pci_tbi(struct pci_controller *, dma_addr_t, dma_addr_t); /* core_titan.c */ @@ -94,14 +94,14 @@ extern struct _alpha_agp_info *titan_agp_info(void); extern struct pci_ops tsunami_pci_ops; extern void tsunami_init_arch(void); extern void tsunami_kill_arch(int); -extern void tsunami_machine_check(u64, u64); +extern void tsunami_machine_check(unsigned long vector, unsigned long la_ptr); extern void tsunami_pci_tbi(struct pci_controller *, dma_addr_t, dma_addr_t); /* core_wildfire.c */ extern struct pci_ops wildfire_pci_ops; extern void wildfire_init_arch(void); extern void wildfire_kill_arch(int); -extern void wildfire_machine_check(u64, u64); +extern void wildfire_machine_check(unsigned long vector, unsigned long la_ptr); extern void wildfire_pci_tbi(struct pci_controller *, dma_addr_t, dma_addr_t); extern int wildfire_pa_to_nid(unsigned long); extern int wildfire_cpuid_to_nid(int); diff --git a/arch/alpha/kernel/setup.c b/arch/alpha/kernel/setup.c index 02bee6983ce2..80df86cd746b 100644 --- a/arch/alpha/kernel/setup.c +++ b/arch/alpha/kernel/setup.c @@ -1255,7 +1255,7 @@ show_cpuinfo(struct seq_file *f, void *slot) platform_string(), nr_processors); #ifdef CONFIG_SMP - seq_printf(f, "cpus active\t\t: %d\n" + seq_printf(f, "cpus active\t\t: %u\n" "cpu active mask\t\t: %016lx\n", num_online_cpus(), cpus_addr(cpu_possible_map)[0]); #endif diff --git a/arch/alpha/kernel/smc37c669.c b/arch/alpha/kernel/smc37c669.c index fd467b207f0f..bca5bda90cde 100644 --- a/arch/alpha/kernel/smc37c669.c +++ b/arch/alpha/kernel/smc37c669.c @@ -2542,8 +2542,8 @@ void __init SMC669_Init ( int index ) SMC37c669_display_device_info( ); #endif local_irq_restore(flags); - printk( "SMC37c669 Super I/O Controller found @ 0x%lx\n", - (unsigned long) SMC_base ); + printk( "SMC37c669 Super I/O Controller found @ 0x%p\n", + SMC_base ); } else { local_irq_restore(flags); diff --git a/arch/alpha/kernel/sys_jensen.c b/arch/alpha/kernel/sys_jensen.c index e2516f9a8967..2b5caf3d9b15 100644 --- a/arch/alpha/kernel/sys_jensen.c +++ b/arch/alpha/kernel/sys_jensen.c @@ -244,12 +244,11 @@ jensen_init_arch(void) } static void -jensen_machine_check (u64 vector, u64 la) +jensen_machine_check(unsigned long vector, unsigned long la) { printk(KERN_CRIT "Machine check\n"); } - /* * The System Vector */ diff --git a/arch/alpha/kernel/sys_sable.c b/arch/alpha/kernel/sys_sable.c index d232e42be018..9e263256a42d 100644 --- a/arch/alpha/kernel/sys_sable.c +++ b/arch/alpha/kernel/sys_sable.c @@ -453,7 +453,7 @@ sable_lynx_enable_irq(unsigned int irq) sable_lynx_irq_swizzle->update_irq_hw(bit, mask); spin_unlock(&sable_lynx_irq_lock); #if 0 - printk("%s: mask 0x%lx bit 0x%x irq 0x%x\n", + printk("%s: mask 0x%lx bit 0x%lx irq 0x%x\n", __func__, mask, bit, irq); #endif } @@ -469,7 +469,7 @@ sable_lynx_disable_irq(unsigned int irq) sable_lynx_irq_swizzle->update_irq_hw(bit, mask); spin_unlock(&sable_lynx_irq_lock); #if 0 - printk("%s: mask 0x%lx bit 0x%x irq 0x%x\n", + printk("%s: mask 0x%lx bit 0x%lx irq 0x%x\n", __func__, mask, bit, irq); #endif } diff --git a/arch/alpha/kernel/traps.c b/arch/alpha/kernel/traps.c index cefc5a355ef9..6ee7655b7568 100644 --- a/arch/alpha/kernel/traps.c +++ b/arch/alpha/kernel/traps.c @@ -623,7 +623,7 @@ do_entUna(void * va, unsigned long opcode, unsigned long reg, } lock_kernel(); - printk("Bad unaligned kernel access at %016lx: %p %lx %ld\n", + printk("Bad unaligned kernel access at %016lx: %p %lx %lu\n", pc, va, opcode, reg); do_exit(SIGSEGV); From a8af78982ff4c0b3731527b0217d286a343a3089 Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Tue, 31 Mar 2009 15:23:37 -0700 Subject: [PATCH 346/478] pm: rework includes, remove arch ifdefs Make the following header file changes: - remove arch ifdefs and asm/suspend.h from linux/suspend.h - add asm/suspend.h to disk.c (for arch_prepare_suspend()) - add linux/io.h to swsusp.c (for ioremap()) - x86 32/64 bit compile fixes Signed-off-by: Magnus Damm Cc: Paul Mundt Acked-by: "Rafael J. Wysocki" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/asm-offsets_32.c | 1 + arch/x86/kernel/asm-offsets_64.c | 1 + arch/x86/power/cpu_32.c | 1 + arch/x86/power/cpu_64.c | 1 + arch/x86/power/hibernate_64.c | 1 + include/linux/suspend.h | 3 --- kernel/power/disk.c | 1 + kernel/power/swsusp.c | 1 + 8 files changed, 7 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c index fbf2f33e3080..5a6aa1c1162f 100644 --- a/arch/x86/kernel/asm-offsets_32.c +++ b/arch/x86/kernel/asm-offsets_32.c @@ -18,6 +18,7 @@ #include #include #include +#include #include diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index 8793ab33e2c1..e72f062fb4b5 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -16,6 +16,7 @@ #include #include #include +#include #include diff --git a/arch/x86/power/cpu_32.c b/arch/x86/power/cpu_32.c index 274d06082f48..ce702c5b3a2c 100644 --- a/arch/x86/power/cpu_32.c +++ b/arch/x86/power/cpu_32.c @@ -12,6 +12,7 @@ #include #include #include +#include static struct saved_context saved_context; diff --git a/arch/x86/power/cpu_64.c b/arch/x86/power/cpu_64.c index e3b6cf70d62c..5343540f2607 100644 --- a/arch/x86/power/cpu_64.c +++ b/arch/x86/power/cpu_64.c @@ -15,6 +15,7 @@ #include #include #include +#include static void fix_processor_context(void); diff --git a/arch/x86/power/hibernate_64.c b/arch/x86/power/hibernate_64.c index 6dd000dd7933..65fdc86e923f 100644 --- a/arch/x86/power/hibernate_64.c +++ b/arch/x86/power/hibernate_64.c @@ -14,6 +14,7 @@ #include #include #include +#include /* References to section boundaries */ extern const void __nosave_begin, __nosave_end; diff --git a/include/linux/suspend.h b/include/linux/suspend.h index c7d9bb1832ba..3e3a4364cbff 100644 --- a/include/linux/suspend.h +++ b/include/linux/suspend.h @@ -1,9 +1,6 @@ #ifndef _LINUX_SUSPEND_H #define _LINUX_SUSPEND_H -#if defined(CONFIG_X86) || defined(CONFIG_FRV) || defined(CONFIG_PPC32) || defined(CONFIG_PPC64) -#include -#endif #include #include #include diff --git a/kernel/power/disk.c b/kernel/power/disk.c index e886d1332a10..f3db382c2b2d 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c @@ -22,6 +22,7 @@ #include #include #include +#include #include "power.h" diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c index 1ee6636414b2..78c35047586d 100644 --- a/kernel/power/swsusp.c +++ b/kernel/power/swsusp.c @@ -51,6 +51,7 @@ #include #include #include +#include #include "power.h" From bf9ed57d35d64dac5d5651478b5530a89b20ea1e Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Tue, 31 Mar 2009 15:23:38 -0700 Subject: [PATCH 347/478] pm: cleanup includes Remove unused/duplicate cruft from asm/suspend.h: - x86_32: remove unused acpi code - powerpc: remove duplicate prototypes, see linux/suspend.h Signed-off-by: Magnus Damm Cc: Paul Mundt Acked-by: "Rafael J. Wysocki" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/include/asm/suspend.h | 3 --- arch/x86/include/asm/suspend_32.h | 24 ------------------------ 2 files changed, 27 deletions(-) diff --git a/arch/powerpc/include/asm/suspend.h b/arch/powerpc/include/asm/suspend.h index cbf2c9404c37..c6efc3466aa6 100644 --- a/arch/powerpc/include/asm/suspend.h +++ b/arch/powerpc/include/asm/suspend.h @@ -3,7 +3,4 @@ static inline int arch_prepare_suspend(void) { return 0; } -void save_processor_state(void); -void restore_processor_state(void); - #endif /* __ASM_POWERPC_SUSPEND_H */ diff --git a/arch/x86/include/asm/suspend_32.h b/arch/x86/include/asm/suspend_32.h index a5074bd0f8be..48dcfa62ea07 100644 --- a/arch/x86/include/asm/suspend_32.h +++ b/arch/x86/include/asm/suspend_32.h @@ -24,28 +24,4 @@ struct saved_context { unsigned long return_address; } __attribute__((packed)); -#ifdef CONFIG_ACPI -extern unsigned long saved_eip; -extern unsigned long saved_esp; -extern unsigned long saved_ebp; -extern unsigned long saved_ebx; -extern unsigned long saved_esi; -extern unsigned long saved_edi; - -static inline void acpi_save_register_state(unsigned long return_point) -{ - saved_eip = return_point; - asm volatile("movl %%esp,%0" : "=m" (saved_esp)); - asm volatile("movl %%ebp,%0" : "=m" (saved_ebp)); - asm volatile("movl %%ebx,%0" : "=m" (saved_ebx)); - asm volatile("movl %%edi,%0" : "=m" (saved_edi)); - asm volatile("movl %%esi,%0" : "=m" (saved_esi)); -} - -#define acpi_restore_register_state() do {} while (0) - -/* routines for saving/restoring kernel state */ -extern int acpi_save_state_mem(void); -#endif - #endif /* _ASM_X86_SUSPEND_32_H */ From 792dd4fc317e1f94a6af111a0979c1c0d8c14453 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 31 Mar 2009 15:23:39 -0700 Subject: [PATCH 348/478] ubd: stop defintining MAJOR_NR MAJOR_NR isn't needed anymore since very early 2.5 kernels. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Christoph Hellwig Cc: Jeff Dike Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/um/drivers/ubd_kern.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/arch/um/drivers/ubd_kern.c b/arch/um/drivers/ubd_kern.c index 0a868118cf06..d42f826a8ab9 100644 --- a/arch/um/drivers/ubd_kern.c +++ b/arch/um/drivers/ubd_kern.c @@ -17,7 +17,6 @@ * James McMechan */ -#define MAJOR_NR UBD_MAJOR #define UBD_SHIFT 4 #include "linux/kernel.h" @@ -115,7 +114,7 @@ static struct block_device_operations ubd_blops = { }; /* Protected by ubd_lock */ -static int fake_major = MAJOR_NR; +static int fake_major = UBD_MAJOR; static struct gendisk *ubd_gendisk[MAX_DEV]; static struct gendisk *fake_gendisk[MAX_DEV]; @@ -299,7 +298,7 @@ static int ubd_setup_common(char *str, int *index_out, char **error_out) } mutex_lock(&ubd_lock); - if(fake_major != MAJOR_NR){ + if (fake_major != UBD_MAJOR) { *error_out = "Can't assign a fake major twice"; goto out1; } @@ -818,13 +817,13 @@ static int ubd_disk_register(int major, u64 size, int unit, disk->first_minor = unit << UBD_SHIFT; disk->fops = &ubd_blops; set_capacity(disk, size / 512); - if(major == MAJOR_NR) + if (major == UBD_MAJOR) sprintf(disk->disk_name, "ubd%c", 'a' + unit); else sprintf(disk->disk_name, "ubd_fake%d", unit); /* sysfs register (not for ide fake devices) */ - if (major == MAJOR_NR) { + if (major == UBD_MAJOR) { ubd_devs[unit].pdev.id = unit; ubd_devs[unit].pdev.name = DRIVER_NAME; ubd_devs[unit].pdev.dev.release = ubd_device_release; @@ -871,13 +870,13 @@ static int ubd_add(int n, char **error_out) ubd_dev->queue->queuedata = ubd_dev; blk_queue_max_hw_segments(ubd_dev->queue, MAX_SG); - err = ubd_disk_register(MAJOR_NR, ubd_dev->size, n, &ubd_gendisk[n]); + err = ubd_disk_register(UBD_MAJOR, ubd_dev->size, n, &ubd_gendisk[n]); if(err){ *error_out = "Failed to register device"; goto out_cleanup; } - if(fake_major != MAJOR_NR) + if (fake_major != UBD_MAJOR) ubd_disk_register(fake_major, ubd_dev->size, n, &fake_gendisk[n]); @@ -1059,10 +1058,10 @@ static int __init ubd_init(void) char *error; int i, err; - if (register_blkdev(MAJOR_NR, "ubd")) + if (register_blkdev(UBD_MAJOR, "ubd")) return -1; - if (fake_major != MAJOR_NR) { + if (fake_major != UBD_MAJOR) { char name[sizeof("ubd_nnn\0")]; snprintf(name, sizeof(name), "ubd_%d", fake_major); From dc71768742b39bca298e9ca6c91e575cd4b140e6 Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Tue, 31 Mar 2009 15:23:40 -0700 Subject: [PATCH 349/478] uml: don't use a too long string literal uml uses a concatenated string literal to store the contents of .config, but .config file content is varaible, it can be very long. Use an array of string literals instead. Signed-off-by: WANG Cong Cc: Jeff Dike Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/um/kernel/Makefile | 6 +++--- arch/um/kernel/config.c.in | 8 ++++++-- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/arch/um/kernel/Makefile b/arch/um/kernel/Makefile index 499e5e95e609..388ec0a3ea9b 100644 --- a/arch/um/kernel/Makefile +++ b/arch/um/kernel/Makefile @@ -28,7 +28,7 @@ $(obj)/config.tmp: $(objtree)/.config FORCE $(call if_changed,quote1) quiet_cmd_quote1 = QUOTE $@ - cmd_quote1 = sed -e 's/"/\\"/g' -e 's/^/"/' -e 's/$$/\\n"/' \ + cmd_quote1 = sed -e 's/"/\\"/g' -e 's/^/"/' -e 's/$$/\\n",/' \ $< > $@ $(obj)/config.c: $(src)/config.c.in $(obj)/config.tmp FORCE @@ -36,9 +36,9 @@ $(obj)/config.c: $(src)/config.c.in $(obj)/config.tmp FORCE quiet_cmd_quote2 = QUOTE $@ cmd_quote2 = sed -e '/CONFIG/{' \ - -e 's/"CONFIG"\;/""/' \ + -e 's/"CONFIG"//' \ -e 'r $(obj)/config.tmp' \ -e 'a \' \ - -e '""\;' \ + -e '""' \ -e '}' \ $< > $@ diff --git a/arch/um/kernel/config.c.in b/arch/um/kernel/config.c.in index c062cbfe386e..bee154d1ba75 100644 --- a/arch/um/kernel/config.c.in +++ b/arch/um/kernel/config.c.in @@ -7,11 +7,15 @@ #include #include "init.h" -static __initdata char *config = "CONFIG"; +static __initdata const char *config[] = { +"CONFIG" +}; static int __init print_config(char *line, int *add) { - printf("%s", config); + int i; + for (i = 0; i < sizeof(config)/sizeof(config[0]); i++) + printf("%s", config[i]); exit(0); } From 5062910a06ee979002edbf58ab65481c81242df4 Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Tue, 31 Mar 2009 15:23:41 -0700 Subject: [PATCH 350/478] uml: improve error messages These error messages are from check_sysemu(), not check_ptrace(). Signed-off-by: WANG Cong Cc: Jeff Dike Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/um/os-Linux/start_up.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/um/os-Linux/start_up.c b/arch/um/os-Linux/start_up.c index 183db26d01bf..02ee9adff54a 100644 --- a/arch/um/os-Linux/start_up.c +++ b/arch/um/os-Linux/start_up.c @@ -244,7 +244,7 @@ static void __init check_sysemu(void) if ((ptrace(PTRACE_OLDSETOPTIONS, pid, 0, (void *) PTRACE_O_TRACESYSGOOD) < 0)) - fatal_perror("check_ptrace: PTRACE_OLDSETOPTIONS failed"); + fatal_perror("check_sysemu: PTRACE_OLDSETOPTIONS failed"); while (1) { count++; @@ -252,12 +252,12 @@ static void __init check_sysemu(void) goto fail; CATCH_EINTR(n = waitpid(pid, &status, WUNTRACED)); if (n < 0) - fatal_perror("check_ptrace : wait failed"); + fatal_perror("check_sysemu: wait failed"); if (WIFSTOPPED(status) && (WSTOPSIG(status) == (SIGTRAP|0x80))) { if (!count) { - non_fatal("check_ptrace : SYSEMU_SINGLESTEP " + non_fatal("check_sysemu: SYSEMU_SINGLESTEP " "doesn't singlestep"); goto fail; } @@ -271,7 +271,7 @@ static void __init check_sysemu(void) else if (WIFSTOPPED(status) && (WSTOPSIG(status) == SIGTRAP)) count++; else { - non_fatal("check_ptrace : expected SIGTRAP or " + non_fatal("check_sysemu: expected SIGTRAP or " "(SIGTRAP | 0x80), got status = %d\n", status); goto fail; From 65bd6a9bc7be3f5841dad12f77ce4b3210bd82c5 Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Tue, 31 Mar 2009 15:23:41 -0700 Subject: [PATCH 351/478] uml: remove useless comments These comments are useless now, remove them. Signed-off-by: WANG Cong Cc: Jeff Dike Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/um/drivers/pcap_user.h | 10 ---------- arch/um/drivers/port.h | 10 ---------- arch/um/drivers/ssl.h | 10 ---------- arch/um/drivers/stdio_console.h | 10 ---------- arch/um/drivers/xterm.h | 10 ---------- arch/um/include/asm/irq_vectors.h | 10 ---------- arch/um/include/asm/mmu.h | 10 ---------- arch/um/include/asm/pda.h | 10 ---------- arch/um/include/asm/pgalloc.h | 10 ---------- arch/um/include/asm/pgtable-3level.h | 10 ---------- arch/um/include/shared/frame_kern.h | 10 ---------- arch/um/include/shared/initrd.h | 10 ---------- arch/um/include/shared/irq_kern.h | 10 ---------- arch/um/include/shared/mem_kern.h | 10 ---------- arch/um/include/shared/ubd_user.h | 10 ---------- arch/um/kernel/config.c.in | 10 ---------- arch/um/sys-i386/asm/archparam.h | 10 ---------- arch/um/sys-i386/shared/sysdep/checksum.h | 10 ---------- arch/um/sys-ia64/sysdep/ptrace.h | 10 ---------- arch/um/sys-ia64/sysdep/sigcontext.h | 10 ---------- arch/um/sys-ia64/sysdep/syscalls.h | 10 ---------- arch/um/sys-ppc/miscthings.c | 11 ----------- arch/um/sys-ppc/ptrace.c | 10 ---------- arch/um/sys-ppc/ptrace_user.c | 10 ---------- arch/um/sys-ppc/shared/sysdep/ptrace.h | 10 ---------- arch/um/sys-ppc/shared/sysdep/sigcontext.h | 10 ---------- arch/um/sys-ppc/shared/sysdep/syscalls.h | 10 ---------- arch/um/sys-ppc/sigcontext.c | 10 ---------- arch/um/sys-x86_64/asm/archparam.h | 10 ---------- arch/um/sys-x86_64/asm/module.h | 10 ---------- arch/um/sys-x86_64/mem.c | 9 --------- 31 files changed, 310 deletions(-) diff --git a/arch/um/drivers/pcap_user.h b/arch/um/drivers/pcap_user.h index 96b80b565eeb..d8ba6153f912 100644 --- a/arch/um/drivers/pcap_user.h +++ b/arch/um/drivers/pcap_user.h @@ -19,13 +19,3 @@ extern const struct net_user_info pcap_user_info; extern int pcap_user_read(int fd, void *buf, int len, struct pcap_data *pri); -/* - * Overrides for Emacs so that we follow Linus's tabbing style. - * Emacs will notice this stuff at the end of the file and automatically - * adjust the settings for this buffer only. This must remain at the end - * of the file. - * --------------------------------------------------------------------------- - * Local variables: - * c-file-style: "linux" - * End: - */ diff --git a/arch/um/drivers/port.h b/arch/um/drivers/port.h index 9117609a575d..372a80c0556a 100644 --- a/arch/um/drivers/port.h +++ b/arch/um/drivers/port.h @@ -18,13 +18,3 @@ extern void port_remove_dev(void *d); #endif -/* - * Overrides for Emacs so that we follow Linus's tabbing style. - * Emacs will notice this stuff at the end of the file and automatically - * adjust the settings for this buffer only. This must remain at the end - * of the file. - * --------------------------------------------------------------------------- - * Local variables: - * c-file-style: "linux" - * End: - */ diff --git a/arch/um/drivers/ssl.h b/arch/um/drivers/ssl.h index 98412aa66607..314d17725ce6 100644 --- a/arch/um/drivers/ssl.h +++ b/arch/um/drivers/ssl.h @@ -11,13 +11,3 @@ extern void ssl_receive_char(int line, char ch); #endif -/* - * Overrides for Emacs so that we follow Linus's tabbing style. - * Emacs will notice this stuff at the end of the file and automatically - * adjust the settings for this buffer only. This must remain at the end - * of the file. - * --------------------------------------------------------------------------- - * Local variables: - * c-file-style: "linux" - * End: - */ diff --git a/arch/um/drivers/stdio_console.h b/arch/um/drivers/stdio_console.h index 505a3d5bea5e..6d8275f71fd4 100644 --- a/arch/um/drivers/stdio_console.h +++ b/arch/um/drivers/stdio_console.h @@ -9,13 +9,3 @@ extern void save_console_flags(void); #endif -/* - * Overrides for Emacs so that we follow Linus's tabbing style. - * Emacs will notice this stuff at the end of the file and automatically - * adjust the settings for this buffer only. This must remain at the end - * of the file. - * --------------------------------------------------------------------------- - * Local variables: - * c-file-style: "linux" - * End: - */ diff --git a/arch/um/drivers/xterm.h b/arch/um/drivers/xterm.h index f33a6e77b186..56b9c4aba423 100644 --- a/arch/um/drivers/xterm.h +++ b/arch/um/drivers/xterm.h @@ -10,13 +10,3 @@ extern int xterm_fd(int socket, int *pid_out); #endif -/* - * Overrides for Emacs so that we follow Linus's tabbing style. - * Emacs will notice this stuff at the end of the file and automatically - * adjust the settings for this buffer only. This must remain at the end - * of the file. - * --------------------------------------------------------------------------- - * Local variables: - * c-file-style: "linux" - * End: - */ diff --git a/arch/um/include/asm/irq_vectors.h b/arch/um/include/asm/irq_vectors.h index 62ddba6fc733..272a81e0ce14 100644 --- a/arch/um/include/asm/irq_vectors.h +++ b/arch/um/include/asm/irq_vectors.h @@ -8,13 +8,3 @@ #endif -/* - * Overrides for Emacs so that we follow Linus's tabbing style. - * Emacs will notice this stuff at the end of the file and automatically - * adjust the settings for this buffer only. This must remain at the end - * of the file. - * --------------------------------------------------------------------------- - * Local variables: - * c-file-style: "linux" - * End: - */ diff --git a/arch/um/include/asm/mmu.h b/arch/um/include/asm/mmu.h index 2cf35c21d694..cf259de51531 100644 --- a/arch/um/include/asm/mmu.h +++ b/arch/um/include/asm/mmu.h @@ -10,13 +10,3 @@ #endif -/* - * Overrides for Emacs so that we follow Linus's tabbing style. - * Emacs will notice this stuff at the end of the file and automatically - * adjust the settings for this buffer only. This must remain at the end - * of the file. - * --------------------------------------------------------------------------- - * Local variables: - * c-file-style: "linux" - * End: - */ diff --git a/arch/um/include/asm/pda.h b/arch/um/include/asm/pda.h index 0d8bf33ffd42..ddcd774fc2a0 100644 --- a/arch/um/include/asm/pda.h +++ b/arch/um/include/asm/pda.h @@ -19,13 +19,3 @@ extern struct foo me; #endif -/* - * Overrides for Emacs so that we follow Linus's tabbing style. - * Emacs will notice this stuff at the end of the file and automatically - * adjust the settings for this buffer only. This must remain at the end - * of the file. - * --------------------------------------------------------------------------- - * Local variables: - * c-file-style: "linux" - * End: - */ diff --git a/arch/um/include/asm/pgalloc.h b/arch/um/include/asm/pgalloc.h index 9062a6e72241..718984359f8c 100644 --- a/arch/um/include/asm/pgalloc.h +++ b/arch/um/include/asm/pgalloc.h @@ -60,13 +60,3 @@ static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd) #endif -/* - * Overrides for Emacs so that we follow Linus's tabbing style. - * Emacs will notice this stuff at the end of the file and automatically - * adjust the settings for this buffer only. This must remain at the end - * of the file. - * --------------------------------------------------------------------------- - * Local variables: - * c-file-style: "linux" - * End: - */ diff --git a/arch/um/include/asm/pgtable-3level.h b/arch/um/include/asm/pgtable-3level.h index 0446f456b428..084de4a9fc70 100644 --- a/arch/um/include/asm/pgtable-3level.h +++ b/arch/um/include/asm/pgtable-3level.h @@ -134,13 +134,3 @@ static inline pmd_t pfn_pmd(pfn_t page_nr, pgprot_t pgprot) #endif -/* - * Overrides for Emacs so that we follow Linus's tabbing style. - * Emacs will notice this stuff at the end of the file and automatically - * adjust the settings for this buffer only. This must remain at the end - * of the file. - * --------------------------------------------------------------------------- - * Local variables: - * c-file-style: "linux" - * End: - */ diff --git a/arch/um/include/shared/frame_kern.h b/arch/um/include/shared/frame_kern.h index ce9514f57211..76078490c258 100644 --- a/arch/um/include/shared/frame_kern.h +++ b/arch/um/include/shared/frame_kern.h @@ -20,13 +20,3 @@ extern int setup_signal_stack_si(unsigned long stack_top, int sig, #endif -/* - * Overrides for Emacs so that we follow Linus's tabbing style. - * Emacs will notice this stuff at the end of the file and automatically - * adjust the settings for this buffer only. This must remain at the end - * of the file. - * --------------------------------------------------------------------------- - * Local variables: - * c-file-style: "linux" - * End: - */ diff --git a/arch/um/include/shared/initrd.h b/arch/um/include/shared/initrd.h index 439b9a814985..22673bcc273d 100644 --- a/arch/um/include/shared/initrd.h +++ b/arch/um/include/shared/initrd.h @@ -10,13 +10,3 @@ extern int load_initrd(char *filename, void *buf, int size); #endif -/* - * Overrides for Emacs so that we follow Linus's tabbing style. - * Emacs will notice this stuff at the end of the file and automatically - * adjust the settings for this buffer only. This must remain at the end - * of the file. - * --------------------------------------------------------------------------- - * Local variables: - * c-file-style: "linux" - * End: - */ diff --git a/arch/um/include/shared/irq_kern.h b/arch/um/include/shared/irq_kern.h index fba3895274f9..b05d22f3d84e 100644 --- a/arch/um/include/shared/irq_kern.h +++ b/arch/um/include/shared/irq_kern.h @@ -16,13 +16,3 @@ extern int um_request_irq(unsigned int irq, int fd, int type, #endif -/* - * Overrides for Emacs so that we follow Linus's tabbing style. - * Emacs will notice this stuff at the end of the file and automatically - * adjust the settings for this buffer only. This must remain at the end - * of the file. - * --------------------------------------------------------------------------- - * Local variables: - * c-file-style: "linux" - * End: - */ diff --git a/arch/um/include/shared/mem_kern.h b/arch/um/include/shared/mem_kern.h index cb7e196d366b..69be0fd0ce4b 100644 --- a/arch/um/include/shared/mem_kern.h +++ b/arch/um/include/shared/mem_kern.h @@ -18,13 +18,3 @@ extern void register_remapper(struct remapper *info); #endif -/* - * Overrides for Emacs so that we follow Linus's tabbing style. - * Emacs will notice this stuff at the end of the file and automatically - * adjust the settings for this buffer only. This must remain at the end - * of the file. - * --------------------------------------------------------------------------- - * Local variables: - * c-file-style: "linux" - * End: - */ diff --git a/arch/um/include/shared/ubd_user.h b/arch/um/include/shared/ubd_user.h index bb66517f0739..3845051f1b10 100644 --- a/arch/um/include/shared/ubd_user.h +++ b/arch/um/include/shared/ubd_user.h @@ -14,13 +14,3 @@ extern int kernel_fd; #endif -/* - * Overrides for Emacs so that we follow Linus's tabbing style. - * Emacs will notice this stuff at the end of the file and automatically - * adjust the settings for this buffer only. This must remain at the end - * of the file. - * --------------------------------------------------------------------------- - * Local variables: - * c-file-style: "linux" - * End: - */ diff --git a/arch/um/kernel/config.c.in b/arch/um/kernel/config.c.in index bee154d1ba75..b7a43feafde7 100644 --- a/arch/um/kernel/config.c.in +++ b/arch/um/kernel/config.c.in @@ -24,13 +24,3 @@ __uml_setup("--showconfig", print_config, " Prints the config file that this UML binary was generated from.\n\n" ); -/* - * Overrides for Emacs so that we follow Linus's tabbing style. - * Emacs will notice this stuff at the end of the file and automatically - * adjust the settings for this buffer only. This must remain at the end - * of the file. - * --------------------------------------------------------------------------- - * Local variables: - * c-file-style: "linux" - * End: - */ diff --git a/arch/um/sys-i386/asm/archparam.h b/arch/um/sys-i386/asm/archparam.h index 93fd723344e5..2a18a884ca1b 100644 --- a/arch/um/sys-i386/asm/archparam.h +++ b/arch/um/sys-i386/asm/archparam.h @@ -14,13 +14,3 @@ #endif -/* - * Overrides for Emacs so that we follow Linus's tabbing style. - * Emacs will notice this stuff at the end of the file and automatically - * adjust the settings for this buffer only. This must remain at the end - * of the file. - * --------------------------------------------------------------------------- - * Local variables: - * c-file-style: "linux" - * End: - */ diff --git a/arch/um/sys-i386/shared/sysdep/checksum.h b/arch/um/sys-i386/shared/sysdep/checksum.h index 0cb4645cbeb8..ed47445f3905 100644 --- a/arch/um/sys-i386/shared/sysdep/checksum.h +++ b/arch/um/sys-i386/shared/sysdep/checksum.h @@ -199,13 +199,3 @@ static __inline__ __wsum csum_and_copy_to_user(const void *src, #endif -/* - * Overrides for Emacs so that we follow Linus's tabbing style. - * Emacs will notice this stuff at the end of the file and automatically - * adjust the settings for this buffer only. This must remain at the end - * of the file. - * --------------------------------------------------------------------------- - * Local variables: - * c-file-style: "linux" - * End: - */ diff --git a/arch/um/sys-ia64/sysdep/ptrace.h b/arch/um/sys-ia64/sysdep/ptrace.h index 42dd8fb6f2f9..0f0f4e6fd334 100644 --- a/arch/um/sys-ia64/sysdep/ptrace.h +++ b/arch/um/sys-ia64/sysdep/ptrace.h @@ -14,13 +14,3 @@ struct sys_pt_regs { #endif -/* - * Overrides for Emacs so that we follow Linus's tabbing style. - * Emacs will notice this stuff at the end of the file and automatically - * adjust the settings for this buffer only. This must remain at the end - * of the file. - * --------------------------------------------------------------------------- - * Local variables: - * c-file-style: "linux" - * End: - */ diff --git a/arch/um/sys-ia64/sysdep/sigcontext.h b/arch/um/sys-ia64/sysdep/sigcontext.h index f15fb25260ba..76b43161e779 100644 --- a/arch/um/sys-ia64/sysdep/sigcontext.h +++ b/arch/um/sys-ia64/sysdep/sigcontext.h @@ -8,13 +8,3 @@ #endif -/* - * Overrides for Emacs so that we follow Linus's tabbing style. - * Emacs will notice this stuff at the end of the file and automatically - * adjust the settings for this buffer only. This must remain at the end - * of the file. - * --------------------------------------------------------------------------- - * Local variables: - * c-file-style: "linux" - * End: - */ diff --git a/arch/um/sys-ia64/sysdep/syscalls.h b/arch/um/sys-ia64/sysdep/syscalls.h index 4a1f46ef1ebc..5f6700c41558 100644 --- a/arch/um/sys-ia64/sysdep/syscalls.h +++ b/arch/um/sys-ia64/sysdep/syscalls.h @@ -8,13 +8,3 @@ #endif -/* - * Overrides for Emacs so that we follow Linus's tabbing style. - * Emacs will notice this stuff at the end of the file and automatically - * adjust the settings for this buffer only. This must remain at the end - * of the file. - * --------------------------------------------------------------------------- - * Local variables: - * c-file-style: "linux" - * End: - */ diff --git a/arch/um/sys-ppc/miscthings.c b/arch/um/sys-ppc/miscthings.c index 373061c50129..1c11aed9c719 100644 --- a/arch/um/sys-ppc/miscthings.c +++ b/arch/um/sys-ppc/miscthings.c @@ -40,14 +40,3 @@ void shove_aux_table(unsigned long sp) } /* END stuff taken from arch/ppc/kernel/process.c */ - -/* - * Overrides for Emacs so that we follow Linus's tabbing style. - * Emacs will notice this stuff at the end of the file and automatically - * adjust the settings for this buffer only. This must remain at the end - * of the file. - * --------------------------------------------------------------------------- - * Local variables: - * c-file-style: "linux" - * End: - */ diff --git a/arch/um/sys-ppc/ptrace.c b/arch/um/sys-ppc/ptrace.c index 8e71b47f2b8e..66ef155248f1 100644 --- a/arch/um/sys-ppc/ptrace.c +++ b/arch/um/sys-ppc/ptrace.c @@ -56,13 +56,3 @@ int peek_user(struct task_struct *child, long addr, long data) return put_user(tmp, (unsigned long *) data); } -/* - * Overrides for Emacs so that we follow Linus's tabbing style. - * Emacs will notice this stuff at the end of the file and automatically - * adjust the settings for this buffer only. This must remain at the end - * of the file. - * --------------------------------------------------------------------------- - * Local variables: - * c-file-style: "linux" - * End: - */ diff --git a/arch/um/sys-ppc/ptrace_user.c b/arch/um/sys-ppc/ptrace_user.c index ff0b9c077a13..224d2403c37b 100644 --- a/arch/um/sys-ppc/ptrace_user.c +++ b/arch/um/sys-ppc/ptrace_user.c @@ -27,13 +27,3 @@ int ptrace_setregs(long pid, unsigned long *regs_in) } return 0; } -/* - * Overrides for Emacs so that we follow Linus's tabbing style. - * Emacs will notice this stuff at the end of the file and automatically - * adjust the settings for this buffer only. This must remain at the end - * of the file. - * --------------------------------------------------------------------------- - * Local variables: - * c-file-style: "linux" - * End: - */ diff --git a/arch/um/sys-ppc/shared/sysdep/ptrace.h b/arch/um/sys-ppc/shared/sysdep/ptrace.h index df2397dba3e5..0e3230e937e1 100644 --- a/arch/um/sys-ppc/shared/sysdep/ptrace.h +++ b/arch/um/sys-ppc/shared/sysdep/ptrace.h @@ -91,13 +91,3 @@ extern void shove_aux_table(unsigned long sp); #endif -/* - * Overrides for Emacs so that we follow Linus's tabbing style. - * Emacs will notice this stuff at the end of the file and automatically - * adjust the settings for this buffer only. This must remain at the end - * of the file. - * --------------------------------------------------------------------------- - * Local variables: - * c-file-style: "linux" - * End: - */ diff --git a/arch/um/sys-ppc/shared/sysdep/sigcontext.h b/arch/um/sys-ppc/shared/sysdep/sigcontext.h index f20d965de9c7..b7286f0a1e00 100644 --- a/arch/um/sys-ppc/shared/sysdep/sigcontext.h +++ b/arch/um/sys-ppc/shared/sysdep/sigcontext.h @@ -50,13 +50,3 @@ #endif -/* - * Overrides for Emacs so that we follow Linus's tabbing style. - * Emacs will notice this stuff at the end of the file and automatically - * adjust the settings for this buffer only. This must remain at the end - * of the file. - * --------------------------------------------------------------------------- - * Local variables: - * c-file-style: "linux" - * End: - */ diff --git a/arch/um/sys-ppc/shared/sysdep/syscalls.h b/arch/um/sys-ppc/shared/sysdep/syscalls.h index 679df351e19b..1ff81552251c 100644 --- a/arch/um/sys-ppc/shared/sysdep/syscalls.h +++ b/arch/um/sys-ppc/shared/sysdep/syscalls.h @@ -41,13 +41,3 @@ int old_mmap(unsigned long addr, unsigned long len, #define LAST_ARCH_SYSCALL __NR_fadvise64 -/* - * Overrides for Emacs so that we follow Linus's tabbing style. - * Emacs will notice this stuff at the end of the file and automatically - * adjust the settings for this buffer only. This must remain at the end - * of the file. - * --------------------------------------------------------------------------- - * Local variables: - * c-file-style: "linux" - * End: - */ diff --git a/arch/um/sys-ppc/sigcontext.c b/arch/um/sys-ppc/sigcontext.c index 4bdc15c89edd..40694d0f3d15 100644 --- a/arch/um/sys-ppc/sigcontext.c +++ b/arch/um/sys-ppc/sigcontext.c @@ -2,13 +2,3 @@ #include "asm/sigcontext.h" #include "sysdep/ptrace.h" -/* - * Overrides for Emacs so that we follow Linus's tabbing style. - * Emacs will notice this stuff at the end of the file and automatically - * adjust the settings for this buffer only. This must remain at the end - * of the file. - * --------------------------------------------------------------------------- - * Local variables: - * c-file-style: "linux" - * End: - */ diff --git a/arch/um/sys-x86_64/asm/archparam.h b/arch/um/sys-x86_64/asm/archparam.h index 270ed9586b68..6c083663b8d9 100644 --- a/arch/um/sys-x86_64/asm/archparam.h +++ b/arch/um/sys-x86_64/asm/archparam.h @@ -14,13 +14,3 @@ #endif -/* - * Overrides for Emacs so that we follow Linus's tabbing style. - * Emacs will notice this stuff at the end of the file and automatically - * adjust the settings for this buffer only. This must remain at the end - * of the file. - * --------------------------------------------------------------------------- - * Local variables: - * c-file-style: "linux" - * End: - */ diff --git a/arch/um/sys-x86_64/asm/module.h b/arch/um/sys-x86_64/asm/module.h index 35b5491d3e96..8eb79c2d07d5 100644 --- a/arch/um/sys-x86_64/asm/module.h +++ b/arch/um/sys-x86_64/asm/module.h @@ -18,13 +18,3 @@ struct mod_arch_specific #endif -/* - * Overrides for Emacs so that we follow Linus's tabbing style. - * Emacs will notice this stuff at the end of the file and automatically - * adjust the settings for this buffer only. This must remain at the end - * of the file. - * --------------------------------------------------------------------------- - * Local variables: - * c-file-style: "linux" - * End: - */ diff --git a/arch/um/sys-x86_64/mem.c b/arch/um/sys-x86_64/mem.c index 3f59a0a4f156..3f8df8abf347 100644 --- a/arch/um/sys-x86_64/mem.c +++ b/arch/um/sys-x86_64/mem.c @@ -14,12 +14,3 @@ unsigned long vm_data_default_flags = __VM_DATA_DEFAULT_FLAGS; unsigned long vm_data_default_flags32 = __VM_DATA_DEFAULT_FLAGS; unsigned long vm_force_exec32 = PROT_EXEC; -/* Overrides for Emacs so that we follow Linus's tabbing style. - * Emacs will notice this stuff at the end of the file and automatically - * adjust the settings for this buffer only. This must remain at the end - * of the file. - * --------------------------------------------------------------------------- - * Local variables: - * c-file-style: "linux" - * End: - */ From 53d6660836f233df66490707365ab177e5fb2bb4 Mon Sep 17 00:00:00 2001 From: "J. R. Okajima" Date: Tue, 31 Mar 2009 15:23:43 -0700 Subject: [PATCH 352/478] loop: add ioctl to resize a loop device Add the ability to 'resize' the loop device on the fly. One practical application is a loop file with XFS filesystem, already mounted: You can easily enlarge the file (append some bytes) and then call ioctl(fd, LOOP_SET_CAPACITY, new); The loop driver will learn about the new size and you can use xfs_growfs later on, which will allow you to use full capacity of the loop file without the need to unmount. Test app: #include #include #include #include #include #include #include #include #include #include #include #define _GNU_SOURCE #include char *me; void usage(FILE *f) { fprintf(f, "%s [options] loop_dev [backend_file]\n" "-s, --set new_size_in_bytes\n" "\twhen backend_file is given, " "it will be expanded too while keeping the original contents\n", me); } struct option opts[] = { { .name = "set", .has_arg = 1, .flag = NULL, .val = 's' }, { .name = "help", .has_arg = 0, .flag = NULL, .val = 'h' } }; void err_size(char *name, __u64 old) { fprintf(stderr, "size must be larger than current %s (%llu)\n", name, old); } int main(int argc, char *argv[]) { int fd, err, c, i, bfd; ssize_t ssz; size_t sz; __u64 old, new, append; char a[BUFSIZ]; struct stat st; FILE *out; char *backend, *dev; err = EINVAL; out = stderr; me = argv[0]; new = 0; while ((c = getopt_long(argc, argv, "s:h", opts, &i)) != -1) { switch (c) { case 's': errno = 0; new = strtoull(optarg, NULL, 0); if (errno) { err = errno; perror(argv[i]); goto out; } break; case 'h': err = 0; out = stdout; goto err; default: perror(argv[i]); goto err; } } if (optind < argc) dev = argv[optind++]; else goto err; fd = open(dev, O_RDONLY); if (fd < 0) { err = errno; perror(dev); goto out; } err = ioctl(fd, BLKGETSIZE64, &old); if (err) { err = errno; perror("ioctl BLKGETSIZE64"); goto out; } if (!new) { printf("%llu\n", old); goto out; } if (new < old) { err = EINVAL; err_size(dev, old); goto out; } if (optind < argc) { backend = argv[optind++]; bfd = open(backend, O_WRONLY|O_APPEND); if (bfd < 0) { err = errno; perror(backend); goto out; } err = fstat(bfd, &st); if (err) { err = errno; perror(backend); goto out; } if (new < st.st_size) { err = EINVAL; err_size(backend, st.st_size); goto out; } append = new - st.st_size; sz = sizeof(a); while (append > 0) { if (append < sz) sz = append; ssz = write(bfd, a, sz); if (ssz != sz) { err = errno; perror(backend); goto out; } append -= sz; } err = fsync(bfd); if (err) { err = errno; perror(backend); goto out; } } err = ioctl(fd, LOOP_SET_CAPACITY, new); if (err) { err = errno; perror("ioctl LOOP_SET_CAPACITY"); } goto out; err: usage(out); out: return err; } Signed-off-by: J. R. Okajima Signed-off-by: Tomas Matejicek Cc: Cc: Karel Zak Cc: Jens Axboe Cc: Al Viro Cc: Christoph Hellwig Cc: Akinobu Mita Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/loop.c | 30 ++++++++++++++++++++++++++++++ include/linux/loop.h | 1 + 2 files changed, 31 insertions(+) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 2621ed2ce6d2..40b17d3b55a1 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -1192,6 +1192,30 @@ loop_get_status64(struct loop_device *lo, struct loop_info64 __user *arg) { return err; } +static int loop_set_capacity(struct loop_device *lo, struct block_device *bdev) +{ + int err; + sector_t sec; + loff_t sz; + + err = -ENXIO; + if (unlikely(lo->lo_state != Lo_bound)) + goto out; + err = figure_loop_size(lo); + if (unlikely(err)) + goto out; + sec = get_capacity(lo->lo_disk); + /* the width of sector_t may be narrow for bit-shift */ + sz = sec; + sz <<= 9; + mutex_lock(&bdev->bd_mutex); + bd_set_size(bdev, sz); + mutex_unlock(&bdev->bd_mutex); + + out: + return err; +} + static int lo_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg) { @@ -1224,6 +1248,11 @@ static int lo_ioctl(struct block_device *bdev, fmode_t mode, case LOOP_GET_STATUS64: err = loop_get_status64(lo, (struct loop_info64 __user *) arg); break; + case LOOP_SET_CAPACITY: + err = -EPERM; + if ((mode & FMODE_WRITE) || capable(CAP_SYS_ADMIN)) + err = loop_set_capacity(lo, bdev); + break; default: err = lo->ioctl ? lo->ioctl(lo, cmd, arg) : -EINVAL; } @@ -1371,6 +1400,7 @@ static int lo_compat_ioctl(struct block_device *bdev, fmode_t mode, lo, (struct compat_loop_info __user *) arg); mutex_unlock(&lo->lo_ctl_mutex); break; + case LOOP_SET_CAPACITY: case LOOP_CLR_FD: case LOOP_GET_STATUS64: case LOOP_SET_STATUS64: diff --git a/include/linux/loop.h b/include/linux/loop.h index 6ffd6db5bb0d..40725447f5e0 100644 --- a/include/linux/loop.h +++ b/include/linux/loop.h @@ -160,5 +160,6 @@ int loop_unregister_transfer(int number); #define LOOP_SET_STATUS64 0x4C04 #define LOOP_GET_STATUS64 0x4C05 #define LOOP_CHANGE_FD 0x4C06 +#define LOOP_SET_CAPACITY 0x4C07 #endif From 55a63998b8967615a15e2211ba0ff3a84a565824 Mon Sep 17 00:00:00 2001 From: Wolfram Strepp Date: Tue, 31 Mar 2009 15:23:45 -0700 Subject: [PATCH 353/478] lib/rbtree.c: optimize rb_erase() Tfour 4 redundant if-conditions in function __rb_erase_color() in lib/rbtree.c are removed. In pseudo-source-code, the structure of the code is as follows: if ((!A || B) && (!C || D)) { . . . } else { if (!C || D) {//if this is true, it implies: (A == true) && (B == false) if (A) {//hence this always evaluates to 'true'... . } . //at this point, C always becomes true, because of: __rb_rotate_right/left(); //and: other = parent->rb_right/left; } . . if (C) {//...and this too ! . } } Signed-off-by: Wolfram Strepp Acked-by: Peter Zijlstra Cc: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/rbtree.c | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/lib/rbtree.c b/lib/rbtree.c index 9956b99649f0..f653659e0bc1 100644 --- a/lib/rbtree.c +++ b/lib/rbtree.c @@ -163,17 +163,14 @@ static void __rb_erase_color(struct rb_node *node, struct rb_node *parent, { if (!other->rb_right || rb_is_black(other->rb_right)) { - struct rb_node *o_left; - if ((o_left = other->rb_left)) - rb_set_black(o_left); + rb_set_black(other->rb_left); rb_set_red(other); __rb_rotate_right(other, root); other = parent->rb_right; } rb_set_color(other, rb_color(parent)); rb_set_black(parent); - if (other->rb_right) - rb_set_black(other->rb_right); + rb_set_black(other->rb_right); __rb_rotate_left(parent, root); node = root->rb_node; break; @@ -200,17 +197,14 @@ static void __rb_erase_color(struct rb_node *node, struct rb_node *parent, { if (!other->rb_left || rb_is_black(other->rb_left)) { - register struct rb_node *o_right; - if ((o_right = other->rb_right)) - rb_set_black(o_right); + rb_set_black(other->rb_right); rb_set_red(other); __rb_rotate_left(other, root); other = parent->rb_left; } rb_set_color(other, rb_color(parent)); rb_set_black(parent); - if (other->rb_left) - rb_set_black(other->rb_left); + rb_set_black(other->rb_left); __rb_rotate_right(parent, root); node = root->rb_node; break; From c2d7543851849a6923680cdd7e1047ed1a84a1c5 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Tue, 31 Mar 2009 15:23:46 -0700 Subject: [PATCH 354/478] filesystem freeze: allow SysRq emergency thaw to thaw frozen filesystems Now that the filesystem freeze operation has been elevated to the VFS, and is just an ioctl away, some sort of safety net for unintentionally frozen root filesystems may be in order. The timeout thaw originally proposed did not get merged, but perhaps something like this would be useful in emergencies. For example, freeze /path/to/mountpoint may freeze your root filesystem if you forgot that you had that unmounted. I chose 'j' as the last remaining character other than 'h' which is sort of reserved for help (because help is generated on any unknown character). I've tested this on a non-root fs with multiple (nested) freezers, as well as on a system rendered unresponsive due to a frozen root fs. [randy.dunlap@oracle.com: emergency thaw only if CONFIG_BLOCK enabled] Signed-off-by: Eric Sandeen Cc: Takashi Sato Signed-off-by: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/sysrq.txt | 5 +++++ drivers/char/sysrq.c | 19 ++++++++++++++++++- fs/buffer.c | 33 +++++++++++++++++++++++++++++++++ include/linux/fs.h | 1 + 4 files changed, 57 insertions(+), 1 deletion(-) diff --git a/Documentation/sysrq.txt b/Documentation/sysrq.txt index 9e592c718afb..afa2946892da 100644 --- a/Documentation/sysrq.txt +++ b/Documentation/sysrq.txt @@ -81,6 +81,8 @@ On all - write a character to /proc/sysrq-trigger. e.g.: 'i' - Send a SIGKILL to all processes, except for init. +'j' - Forcibly "Just thaw it" - filesystems frozen by the FIFREEZE ioctl. + 'k' - Secure Access Key (SAK) Kills all programs on the current virtual console. NOTE: See important comments below in SAK section. @@ -160,6 +162,9 @@ t'E'rm and k'I'll are useful if you have some sort of runaway process you are unable to kill any other way, especially if it's spawning other processes. +"'J'ust thaw it" is useful if your system becomes unresponsive due to a frozen +(probably root) filesystem via the FIFREEZE ioctl. + * Sometimes SysRq seems to get 'stuck' after using it, what can I do? ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ That happens to me, also. I've found that tapping shift, alt, and control diff --git a/drivers/char/sysrq.c b/drivers/char/sysrq.c index 33a9351c896d..5afe7316c72e 100644 --- a/drivers/char/sysrq.c +++ b/drivers/char/sysrq.c @@ -346,6 +346,19 @@ static struct sysrq_key_op sysrq_moom_op = { .enable_mask = SYSRQ_ENABLE_SIGNAL, }; +#ifdef CONFIG_BLOCK +static void sysrq_handle_thaw(int key, struct tty_struct *tty) +{ + emergency_thaw_all(); +} +static struct sysrq_key_op sysrq_thaw_op = { + .handler = sysrq_handle_thaw, + .help_msg = "thaw-filesystems(J)", + .action_msg = "Emergency Thaw of all frozen filesystems", + .enable_mask = SYSRQ_ENABLE_SIGNAL, +}; +#endif + static void sysrq_handle_kill(int key, struct tty_struct *tty) { send_sig_all(SIGKILL); @@ -396,9 +409,13 @@ static struct sysrq_key_op *sysrq_key_table[36] = { &sysrq_moom_op, /* f */ /* g: May be registered by ppc for kgdb */ NULL, /* g */ - NULL, /* h */ + NULL, /* h - reserved for help */ &sysrq_kill_op, /* i */ +#ifdef CONFIG_BLOCK + &sysrq_thaw_op, /* j */ +#else NULL, /* j */ +#endif &sysrq_SAK_op, /* k */ #ifdef CONFIG_SMP &sysrq_showallcpus_op, /* l */ diff --git a/fs/buffer.c b/fs/buffer.c index c77b848c3d43..f5f8b15a6e40 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -547,6 +547,39 @@ repeat: return err; } +void do_thaw_all(unsigned long unused) +{ + struct super_block *sb; + char b[BDEVNAME_SIZE]; + + spin_lock(&sb_lock); +restart: + list_for_each_entry(sb, &super_blocks, s_list) { + sb->s_count++; + spin_unlock(&sb_lock); + down_read(&sb->s_umount); + while (sb->s_bdev && !thaw_bdev(sb->s_bdev, sb)) + printk(KERN_WARNING "Emergency Thaw on %s\n", + bdevname(sb->s_bdev, b)); + up_read(&sb->s_umount); + spin_lock(&sb_lock); + if (__put_super_and_need_restart(sb)) + goto restart; + } + spin_unlock(&sb_lock); + printk(KERN_WARNING "Emergency Thaw complete\n"); +} + +/** + * emergency_thaw_all -- forcibly thaw every frozen filesystem + * + * Used for emergency unfreeze of all filesystems via SysRq + */ +void emergency_thaw_all(void) +{ + pdflush_operation(do_thaw_all, 0); +} + /** * sync_mapping_buffers - write out & wait upon a mapping's "associated" buffers * @mapping: the mapping which wants those buffers written diff --git a/include/linux/fs.h b/include/linux/fs.h index 87e7bfc5ebd7..61211ad823fe 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1878,6 +1878,7 @@ extern struct block_device *open_by_devnum(dev_t, fmode_t); extern void invalidate_bdev(struct block_device *); extern int sync_blockdev(struct block_device *bdev); extern struct super_block *freeze_bdev(struct block_device *); +extern void emergency_thaw_all(void); extern int thaw_bdev(struct block_device *bdev, struct super_block *sb); extern int fsync_bdev(struct block_device *); extern int fsync_super(struct super_block *); From fcd5e16286024360340eda61da4e2f5ec152b087 Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Tue, 31 Mar 2009 15:23:47 -0700 Subject: [PATCH 355/478] remove unused include/asm-generic/dma-mapping.h Signed-off-by: FUJITA Tomonori Cc: James Bottomley Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-generic/dma-mapping.h | 308 ------------------------------ 1 file changed, 308 deletions(-) delete mode 100644 include/asm-generic/dma-mapping.h diff --git a/include/asm-generic/dma-mapping.h b/include/asm-generic/dma-mapping.h deleted file mode 100644 index 189486c3f92e..000000000000 --- a/include/asm-generic/dma-mapping.h +++ /dev/null @@ -1,308 +0,0 @@ -/* Copyright (C) 2002 by James.Bottomley@HansenPartnership.com - * - * Implements the generic device dma API via the existing pci_ one - * for unconverted architectures - */ - -#ifndef _ASM_GENERIC_DMA_MAPPING_H -#define _ASM_GENERIC_DMA_MAPPING_H - - -#ifdef CONFIG_PCI - -/* we implement the API below in terms of the existing PCI one, - * so include it */ -#include -/* need struct page definitions */ -#include - -static inline int -dma_supported(struct device *dev, u64 mask) -{ - BUG_ON(dev->bus != &pci_bus_type); - - return pci_dma_supported(to_pci_dev(dev), mask); -} - -static inline int -dma_set_mask(struct device *dev, u64 dma_mask) -{ - BUG_ON(dev->bus != &pci_bus_type); - - return pci_set_dma_mask(to_pci_dev(dev), dma_mask); -} - -static inline void * -dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, - gfp_t flag) -{ - BUG_ON(dev->bus != &pci_bus_type); - - return pci_alloc_consistent(to_pci_dev(dev), size, dma_handle); -} - -static inline void -dma_free_coherent(struct device *dev, size_t size, void *cpu_addr, - dma_addr_t dma_handle) -{ - BUG_ON(dev->bus != &pci_bus_type); - - pci_free_consistent(to_pci_dev(dev), size, cpu_addr, dma_handle); -} - -static inline dma_addr_t -dma_map_single(struct device *dev, void *cpu_addr, size_t size, - enum dma_data_direction direction) -{ - BUG_ON(dev->bus != &pci_bus_type); - - return pci_map_single(to_pci_dev(dev), cpu_addr, size, (int)direction); -} - -static inline void -dma_unmap_single(struct device *dev, dma_addr_t dma_addr, size_t size, - enum dma_data_direction direction) -{ - BUG_ON(dev->bus != &pci_bus_type); - - pci_unmap_single(to_pci_dev(dev), dma_addr, size, (int)direction); -} - -static inline dma_addr_t -dma_map_page(struct device *dev, struct page *page, - unsigned long offset, size_t size, - enum dma_data_direction direction) -{ - BUG_ON(dev->bus != &pci_bus_type); - - return pci_map_page(to_pci_dev(dev), page, offset, size, (int)direction); -} - -static inline void -dma_unmap_page(struct device *dev, dma_addr_t dma_address, size_t size, - enum dma_data_direction direction) -{ - BUG_ON(dev->bus != &pci_bus_type); - - pci_unmap_page(to_pci_dev(dev), dma_address, size, (int)direction); -} - -static inline int -dma_map_sg(struct device *dev, struct scatterlist *sg, int nents, - enum dma_data_direction direction) -{ - BUG_ON(dev->bus != &pci_bus_type); - - return pci_map_sg(to_pci_dev(dev), sg, nents, (int)direction); -} - -static inline void -dma_unmap_sg(struct device *dev, struct scatterlist *sg, int nhwentries, - enum dma_data_direction direction) -{ - BUG_ON(dev->bus != &pci_bus_type); - - pci_unmap_sg(to_pci_dev(dev), sg, nhwentries, (int)direction); -} - -static inline void -dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle, size_t size, - enum dma_data_direction direction) -{ - BUG_ON(dev->bus != &pci_bus_type); - - pci_dma_sync_single_for_cpu(to_pci_dev(dev), dma_handle, - size, (int)direction); -} - -static inline void -dma_sync_single_for_device(struct device *dev, dma_addr_t dma_handle, size_t size, - enum dma_data_direction direction) -{ - BUG_ON(dev->bus != &pci_bus_type); - - pci_dma_sync_single_for_device(to_pci_dev(dev), dma_handle, - size, (int)direction); -} - -static inline void -dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, int nelems, - enum dma_data_direction direction) -{ - BUG_ON(dev->bus != &pci_bus_type); - - pci_dma_sync_sg_for_cpu(to_pci_dev(dev), sg, nelems, (int)direction); -} - -static inline void -dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, int nelems, - enum dma_data_direction direction) -{ - BUG_ON(dev->bus != &pci_bus_type); - - pci_dma_sync_sg_for_device(to_pci_dev(dev), sg, nelems, (int)direction); -} - -static inline int -dma_mapping_error(struct device *dev, dma_addr_t dma_addr) -{ - return pci_dma_mapping_error(to_pci_dev(dev), dma_addr); -} - - -#else - -static inline int -dma_supported(struct device *dev, u64 mask) -{ - return 0; -} - -static inline int -dma_set_mask(struct device *dev, u64 dma_mask) -{ - BUG(); - return 0; -} - -static inline void * -dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, - gfp_t flag) -{ - BUG(); - return NULL; -} - -static inline void -dma_free_coherent(struct device *dev, size_t size, void *cpu_addr, - dma_addr_t dma_handle) -{ - BUG(); -} - -static inline dma_addr_t -dma_map_single(struct device *dev, void *cpu_addr, size_t size, - enum dma_data_direction direction) -{ - BUG(); - return 0; -} - -static inline void -dma_unmap_single(struct device *dev, dma_addr_t dma_addr, size_t size, - enum dma_data_direction direction) -{ - BUG(); -} - -static inline dma_addr_t -dma_map_page(struct device *dev, struct page *page, - unsigned long offset, size_t size, - enum dma_data_direction direction) -{ - BUG(); - return 0; -} - -static inline void -dma_unmap_page(struct device *dev, dma_addr_t dma_address, size_t size, - enum dma_data_direction direction) -{ - BUG(); -} - -static inline int -dma_map_sg(struct device *dev, struct scatterlist *sg, int nents, - enum dma_data_direction direction) -{ - BUG(); - return 0; -} - -static inline void -dma_unmap_sg(struct device *dev, struct scatterlist *sg, int nhwentries, - enum dma_data_direction direction) -{ - BUG(); -} - -static inline void -dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle, size_t size, - enum dma_data_direction direction) -{ - BUG(); -} - -static inline void -dma_sync_single_for_device(struct device *dev, dma_addr_t dma_handle, size_t size, - enum dma_data_direction direction) -{ - BUG(); -} - -static inline void -dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, int nelems, - enum dma_data_direction direction) -{ - BUG(); -} - -static inline void -dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, int nelems, - enum dma_data_direction direction) -{ - BUG(); -} - -static inline int -dma_error(dma_addr_t dma_addr) -{ - return 0; -} - -#endif - -/* Now for the API extensions over the pci_ one */ - -#define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f) -#define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h) -#define dma_is_consistent(d, h) (1) - -static inline int -dma_get_cache_alignment(void) -{ - /* no easy way to get cache size on all processors, so return - * the maximum possible, to be safe */ - return (1 << INTERNODE_CACHE_SHIFT); -} - -static inline void -dma_sync_single_range_for_cpu(struct device *dev, dma_addr_t dma_handle, - unsigned long offset, size_t size, - enum dma_data_direction direction) -{ - /* just sync everything, that's all the pci API can do */ - dma_sync_single_for_cpu(dev, dma_handle, offset+size, direction); -} - -static inline void -dma_sync_single_range_for_device(struct device *dev, dma_addr_t dma_handle, - unsigned long offset, size_t size, - enum dma_data_direction direction) -{ - /* just sync everything, that's all the pci API can do */ - dma_sync_single_for_device(dev, dma_handle, offset+size, direction); -} - -static inline void -dma_cache_sync(struct device *dev, void *vaddr, size_t size, - enum dma_data_direction direction) -{ - /* could define this in terms of the dma_cache ... operations, - * but if you get this on a platform, you should convert the platform - * to using the generic device DMA API */ - BUG(); -} - -#endif - From c0aa24ba8962bd1db98938fb2f3773f870896036 Mon Sep 17 00:00:00 2001 From: H Hartley Sweeten Date: Tue, 31 Mar 2009 15:23:48 -0700 Subject: [PATCH 356/478] auxdisplay: remove PARPORT dependency Remove PARPORT dependency for Auxiliary Display support. This is not needed since the dependency for the KS0108 driver is PARPORT_PC. Signed-off-by: H Hartley Sweeten Cc: Miguel Ojeda Sandonis Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/auxdisplay/Kconfig | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/auxdisplay/Kconfig b/drivers/auxdisplay/Kconfig index 14b9d5f4c203..c07e725ea93d 100644 --- a/drivers/auxdisplay/Kconfig +++ b/drivers/auxdisplay/Kconfig @@ -6,7 +6,6 @@ # menuconfig AUXDISPLAY - depends on PARPORT bool "Auxiliary Display support" ---help--- Say Y here to get to see options for auxiliary display drivers. @@ -14,7 +13,7 @@ menuconfig AUXDISPLAY If you say N, all options in this submenu will be skipped and disabled. -if AUXDISPLAY && PARPORT +if AUXDISPLAY config KS0108 tristate "KS0108 LCD Controller" From e0f7ad5f4f5056b20914a35d31abcf29036ca364 Mon Sep 17 00:00:00 2001 From: Michael Buesch Date: Tue, 31 Mar 2009 15:23:49 -0700 Subject: [PATCH 357/478] bcm47xx: fix GPIO API return codes The GPIO API is supposed to return 0 or a negative error code, but the SSB GPIO functions return the bitmask of the GPIO register. Fix this by ignoring the bitmask and always returning 0. The SSB GPIO functions can't fail. Signed-off-by: Michael Buesch Cc: Ralf Baechle Cc: David Brownell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/mips/include/asm/mach-bcm47xx/gpio.h | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/arch/mips/include/asm/mach-bcm47xx/gpio.h b/arch/mips/include/asm/mach-bcm47xx/gpio.h index d8ff4cd89ab5..1784fde2e28f 100644 --- a/arch/mips/include/asm/mach-bcm47xx/gpio.h +++ b/arch/mips/include/asm/mach-bcm47xx/gpio.h @@ -31,24 +31,28 @@ static inline void gpio_set_value(unsigned gpio, int value) static inline int gpio_direction_input(unsigned gpio) { - return ssb_gpio_outen(&ssb_bcm47xx, 1 << gpio, 0); + ssb_gpio_outen(&ssb_bcm47xx, 1 << gpio, 0); + return 0; } static inline int gpio_direction_output(unsigned gpio, int value) { - return ssb_gpio_outen(&ssb_bcm47xx, 1 << gpio, 1 << gpio); + ssb_gpio_outen(&ssb_bcm47xx, 1 << gpio, 1 << gpio); + return 0; } -static int gpio_intmask(unsigned gpio, int value) +static inline int gpio_intmask(unsigned gpio, int value) { - return ssb_gpio_intmask(&ssb_bcm47xx, 1 << gpio, - value ? 1 << gpio : 0); + ssb_gpio_intmask(&ssb_bcm47xx, 1 << gpio, + value ? 1 << gpio : 0); + return 0; } -static int gpio_polarity(unsigned gpio, int value) +static inline int gpio_polarity(unsigned gpio, int value) { - return ssb_gpio_polarity(&ssb_bcm47xx, 1 << gpio, - value ? 1 << gpio : 0); + ssb_gpio_polarity(&ssb_bcm47xx, 1 << gpio, + value ? 1 << gpio : 0); + return 0; } From acdd052a285a7b4cc7da4fa7d34ef9fd0a67b2f8 Mon Sep 17 00:00:00 2001 From: Hannes Eder Date: Tue, 31 Mar 2009 15:23:50 -0700 Subject: [PATCH 358/478] init/main.c: fix sparse warnings: context imbalance Impact: Attribute function 'init_post' with __releases(...). Fix these sparse warnings: init/main.c:805:21: warning: context imbalance in 'init_post' - unexpected unlock init/main.c:899:9: warning: context imbalance in 'kernel_init' - wrong count at exit Signed-off-by: Hannes Eder Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- init/main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/init/main.c b/init/main.c index d6b388fbffa6..07c8658ffca5 100644 --- a/init/main.c +++ b/init/main.c @@ -793,6 +793,7 @@ static void run_init_process(char *init_filename) * makes it inline to init() and it becomes part of init.text section */ static noinline int init_post(void) + __releases(kernel_lock) { /* need to finish all async __init code before freeing the memory */ async_synchronize_full(); From 311d07611e8b354cc1ee6546e4c574c01111adc8 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Tue, 31 Mar 2009 15:23:51 -0700 Subject: [PATCH 359/478] introduce pr_cont() macro We cover all log-levels by pr_... macros except KERN_CONT one. Add it for convenience. Signed-off-by: Cyrill Gorcunov Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Harvey Harrison Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kernel.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/linux/kernel.h b/include/linux/kernel.h index f81d80f47dcb..e720b0da7751 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -353,6 +353,8 @@ static inline char *pack_hex_byte(char *buf, u8 byte) printk(KERN_NOTICE pr_fmt(fmt), ##__VA_ARGS__) #define pr_info(fmt, ...) \ printk(KERN_INFO pr_fmt(fmt), ##__VA_ARGS__) +#define pr_cont(fmt, ...) \ + printk(KERN_CONT fmt, ##__VA_ARGS__) /* If you are writing a driver, please use dev_dbg instead */ #if defined(DEBUG) From 63cd885426872254e82dac2d9e13ea4f720c21dc Mon Sep 17 00:00:00 2001 From: Harvey Harrison Date: Tue, 31 Mar 2009 15:23:52 -0700 Subject: [PATCH 360/478] ntfs: remove private wrapper of endian helpers The base versions handle constant folding now and are shorter than these private wrappers, use them directly. Signed-off-by: Harvey Harrison Cc: Anton Altaparmakov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ntfs/dir.c | 4 +- fs/ntfs/inode.c | 3 +- fs/ntfs/layout.h | 321 ++++++++++++++++++++++------------------------ fs/ntfs/logfile.h | 6 +- fs/ntfs/mft.c | 2 +- fs/ntfs/super.c | 50 ++++---- fs/ntfs/usnjrnl.h | 48 +++---- 7 files changed, 211 insertions(+), 223 deletions(-) diff --git a/fs/ntfs/dir.c b/fs/ntfs/dir.c index 34314b33dbd4..5a9e34475e37 100644 --- a/fs/ntfs/dir.c +++ b/fs/ntfs/dir.c @@ -32,8 +32,8 @@ /** * The little endian Unicode string $I30 as a global constant. */ -ntfschar I30[5] = { const_cpu_to_le16('$'), const_cpu_to_le16('I'), - const_cpu_to_le16('3'), const_cpu_to_le16('0'), 0 }; +ntfschar I30[5] = { cpu_to_le16('$'), cpu_to_le16('I'), + cpu_to_le16('3'), cpu_to_le16('0'), 0 }; /** * ntfs_lookup_inode_by_name - find an inode in a directory given its name diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c index 86bef156cf0a..82c5085559c6 100644 --- a/fs/ntfs/inode.c +++ b/fs/ntfs/inode.c @@ -1975,8 +1975,7 @@ int ntfs_read_inode_mount(struct inode *vi) goto em_put_err_out; next_al_entry = (ATTR_LIST_ENTRY*)((u8*)al_entry + le16_to_cpu(al_entry->length)); - if (le32_to_cpu(al_entry->type) > - const_le32_to_cpu(AT_DATA)) + if (le32_to_cpu(al_entry->type) > le32_to_cpu(AT_DATA)) goto em_put_err_out; if (AT_DATA != al_entry->type) continue; diff --git a/fs/ntfs/layout.h b/fs/ntfs/layout.h index 1e383328eceb..50931b1ce4b9 100644 --- a/fs/ntfs/layout.h +++ b/fs/ntfs/layout.h @@ -31,19 +31,8 @@ #include "types.h" -/* - * Constant endianness conversion defines. - */ -#define const_le16_to_cpu(x) __constant_le16_to_cpu(x) -#define const_le32_to_cpu(x) __constant_le32_to_cpu(x) -#define const_le64_to_cpu(x) __constant_le64_to_cpu(x) - -#define const_cpu_to_le16(x) __constant_cpu_to_le16(x) -#define const_cpu_to_le32(x) __constant_cpu_to_le32(x) -#define const_cpu_to_le64(x) __constant_cpu_to_le64(x) - /* The NTFS oem_id "NTFS " */ -#define magicNTFS const_cpu_to_le64(0x202020205346544eULL) +#define magicNTFS cpu_to_le64(0x202020205346544eULL) /* * Location of bootsector on partition: @@ -114,25 +103,25 @@ typedef struct { */ enum { /* Found in $MFT/$DATA. */ - magic_FILE = const_cpu_to_le32(0x454c4946), /* Mft entry. */ - magic_INDX = const_cpu_to_le32(0x58444e49), /* Index buffer. */ - magic_HOLE = const_cpu_to_le32(0x454c4f48), /* ? (NTFS 3.0+?) */ + magic_FILE = cpu_to_le32(0x454c4946), /* Mft entry. */ + magic_INDX = cpu_to_le32(0x58444e49), /* Index buffer. */ + magic_HOLE = cpu_to_le32(0x454c4f48), /* ? (NTFS 3.0+?) */ /* Found in $LogFile/$DATA. */ - magic_RSTR = const_cpu_to_le32(0x52545352), /* Restart page. */ - magic_RCRD = const_cpu_to_le32(0x44524352), /* Log record page. */ + magic_RSTR = cpu_to_le32(0x52545352), /* Restart page. */ + magic_RCRD = cpu_to_le32(0x44524352), /* Log record page. */ /* Found in $LogFile/$DATA. (May be found in $MFT/$DATA, also?) */ - magic_CHKD = const_cpu_to_le32(0x444b4843), /* Modified by chkdsk. */ + magic_CHKD = cpu_to_le32(0x444b4843), /* Modified by chkdsk. */ /* Found in all ntfs record containing records. */ - magic_BAAD = const_cpu_to_le32(0x44414142), /* Failed multi sector + magic_BAAD = cpu_to_le32(0x44414142), /* Failed multi sector transfer was detected. */ /* * Found in $LogFile/$DATA when a page is full of 0xff bytes and is * thus not initialized. Page must be initialized before using it. */ - magic_empty = const_cpu_to_le32(0xffffffff) /* Record is empty. */ + magic_empty = cpu_to_le32(0xffffffff) /* Record is empty. */ }; typedef le32 NTFS_RECORD_TYPE; @@ -258,8 +247,8 @@ typedef enum { * information about the mft record in which they are present. */ enum { - MFT_RECORD_IN_USE = const_cpu_to_le16(0x0001), - MFT_RECORD_IS_DIRECTORY = const_cpu_to_le16(0x0002), + MFT_RECORD_IN_USE = cpu_to_le16(0x0001), + MFT_RECORD_IS_DIRECTORY = cpu_to_le16(0x0002), } __attribute__ ((__packed__)); typedef le16 MFT_RECORD_FLAGS; @@ -309,7 +298,7 @@ typedef le16 MFT_RECORD_FLAGS; * Note: The _LE versions will return a CPU endian formatted value! */ #define MFT_REF_MASK_CPU 0x0000ffffffffffffULL -#define MFT_REF_MASK_LE const_cpu_to_le64(MFT_REF_MASK_CPU) +#define MFT_REF_MASK_LE cpu_to_le64(MFT_REF_MASK_CPU) typedef u64 MFT_REF; typedef le64 leMFT_REF; @@ -477,25 +466,25 @@ typedef struct { * a revealing choice of symbol I do not know what is... (-; */ enum { - AT_UNUSED = const_cpu_to_le32( 0), - AT_STANDARD_INFORMATION = const_cpu_to_le32( 0x10), - AT_ATTRIBUTE_LIST = const_cpu_to_le32( 0x20), - AT_FILE_NAME = const_cpu_to_le32( 0x30), - AT_OBJECT_ID = const_cpu_to_le32( 0x40), - AT_SECURITY_DESCRIPTOR = const_cpu_to_le32( 0x50), - AT_VOLUME_NAME = const_cpu_to_le32( 0x60), - AT_VOLUME_INFORMATION = const_cpu_to_le32( 0x70), - AT_DATA = const_cpu_to_le32( 0x80), - AT_INDEX_ROOT = const_cpu_to_le32( 0x90), - AT_INDEX_ALLOCATION = const_cpu_to_le32( 0xa0), - AT_BITMAP = const_cpu_to_le32( 0xb0), - AT_REPARSE_POINT = const_cpu_to_le32( 0xc0), - AT_EA_INFORMATION = const_cpu_to_le32( 0xd0), - AT_EA = const_cpu_to_le32( 0xe0), - AT_PROPERTY_SET = const_cpu_to_le32( 0xf0), - AT_LOGGED_UTILITY_STREAM = const_cpu_to_le32( 0x100), - AT_FIRST_USER_DEFINED_ATTRIBUTE = const_cpu_to_le32( 0x1000), - AT_END = const_cpu_to_le32(0xffffffff) + AT_UNUSED = cpu_to_le32( 0), + AT_STANDARD_INFORMATION = cpu_to_le32( 0x10), + AT_ATTRIBUTE_LIST = cpu_to_le32( 0x20), + AT_FILE_NAME = cpu_to_le32( 0x30), + AT_OBJECT_ID = cpu_to_le32( 0x40), + AT_SECURITY_DESCRIPTOR = cpu_to_le32( 0x50), + AT_VOLUME_NAME = cpu_to_le32( 0x60), + AT_VOLUME_INFORMATION = cpu_to_le32( 0x70), + AT_DATA = cpu_to_le32( 0x80), + AT_INDEX_ROOT = cpu_to_le32( 0x90), + AT_INDEX_ALLOCATION = cpu_to_le32( 0xa0), + AT_BITMAP = cpu_to_le32( 0xb0), + AT_REPARSE_POINT = cpu_to_le32( 0xc0), + AT_EA_INFORMATION = cpu_to_le32( 0xd0), + AT_EA = cpu_to_le32( 0xe0), + AT_PROPERTY_SET = cpu_to_le32( 0xf0), + AT_LOGGED_UTILITY_STREAM = cpu_to_le32( 0x100), + AT_FIRST_USER_DEFINED_ATTRIBUTE = cpu_to_le32( 0x1000), + AT_END = cpu_to_le32(0xffffffff) }; typedef le32 ATTR_TYPE; @@ -539,13 +528,13 @@ typedef le32 ATTR_TYPE; * equal then the second le32 values would be compared, etc. */ enum { - COLLATION_BINARY = const_cpu_to_le32(0x00), - COLLATION_FILE_NAME = const_cpu_to_le32(0x01), - COLLATION_UNICODE_STRING = const_cpu_to_le32(0x02), - COLLATION_NTOFS_ULONG = const_cpu_to_le32(0x10), - COLLATION_NTOFS_SID = const_cpu_to_le32(0x11), - COLLATION_NTOFS_SECURITY_HASH = const_cpu_to_le32(0x12), - COLLATION_NTOFS_ULONGS = const_cpu_to_le32(0x13), + COLLATION_BINARY = cpu_to_le32(0x00), + COLLATION_FILE_NAME = cpu_to_le32(0x01), + COLLATION_UNICODE_STRING = cpu_to_le32(0x02), + COLLATION_NTOFS_ULONG = cpu_to_le32(0x10), + COLLATION_NTOFS_SID = cpu_to_le32(0x11), + COLLATION_NTOFS_SECURITY_HASH = cpu_to_le32(0x12), + COLLATION_NTOFS_ULONGS = cpu_to_le32(0x13), }; typedef le32 COLLATION_RULE; @@ -559,25 +548,25 @@ typedef le32 COLLATION_RULE; * NT4. */ enum { - ATTR_DEF_INDEXABLE = const_cpu_to_le32(0x02), /* Attribute can be + ATTR_DEF_INDEXABLE = cpu_to_le32(0x02), /* Attribute can be indexed. */ - ATTR_DEF_MULTIPLE = const_cpu_to_le32(0x04), /* Attribute type + ATTR_DEF_MULTIPLE = cpu_to_le32(0x04), /* Attribute type can be present multiple times in the mft records of an inode. */ - ATTR_DEF_NOT_ZERO = const_cpu_to_le32(0x08), /* Attribute value + ATTR_DEF_NOT_ZERO = cpu_to_le32(0x08), /* Attribute value must contain at least one non-zero byte. */ - ATTR_DEF_INDEXED_UNIQUE = const_cpu_to_le32(0x10), /* Attribute must be + ATTR_DEF_INDEXED_UNIQUE = cpu_to_le32(0x10), /* Attribute must be indexed and the attribute value must be unique for the attribute type in all of the mft records of an inode. */ - ATTR_DEF_NAMED_UNIQUE = const_cpu_to_le32(0x20), /* Attribute must be + ATTR_DEF_NAMED_UNIQUE = cpu_to_le32(0x20), /* Attribute must be named and the name must be unique for the attribute type in all of the mft records of an inode. */ - ATTR_DEF_RESIDENT = const_cpu_to_le32(0x40), /* Attribute must be + ATTR_DEF_RESIDENT = cpu_to_le32(0x40), /* Attribute must be resident. */ - ATTR_DEF_ALWAYS_LOG = const_cpu_to_le32(0x80), /* Always log + ATTR_DEF_ALWAYS_LOG = cpu_to_le32(0x80), /* Always log modifications to this attribute, regardless of whether it is resident or non-resident. Without this, only log @@ -614,12 +603,12 @@ typedef struct { * Attribute flags (16-bit). */ enum { - ATTR_IS_COMPRESSED = const_cpu_to_le16(0x0001), - ATTR_COMPRESSION_MASK = const_cpu_to_le16(0x00ff), /* Compression method + ATTR_IS_COMPRESSED = cpu_to_le16(0x0001), + ATTR_COMPRESSION_MASK = cpu_to_le16(0x00ff), /* Compression method mask. Also, first illegal value. */ - ATTR_IS_ENCRYPTED = const_cpu_to_le16(0x4000), - ATTR_IS_SPARSE = const_cpu_to_le16(0x8000), + ATTR_IS_ENCRYPTED = cpu_to_le16(0x4000), + ATTR_IS_SPARSE = cpu_to_le16(0x8000), } __attribute__ ((__packed__)); typedef le16 ATTR_FLAGS; @@ -811,32 +800,32 @@ typedef ATTR_RECORD ATTR_REC; * flags appear in all of the above. */ enum { - FILE_ATTR_READONLY = const_cpu_to_le32(0x00000001), - FILE_ATTR_HIDDEN = const_cpu_to_le32(0x00000002), - FILE_ATTR_SYSTEM = const_cpu_to_le32(0x00000004), - /* Old DOS volid. Unused in NT. = const_cpu_to_le32(0x00000008), */ + FILE_ATTR_READONLY = cpu_to_le32(0x00000001), + FILE_ATTR_HIDDEN = cpu_to_le32(0x00000002), + FILE_ATTR_SYSTEM = cpu_to_le32(0x00000004), + /* Old DOS volid. Unused in NT. = cpu_to_le32(0x00000008), */ - FILE_ATTR_DIRECTORY = const_cpu_to_le32(0x00000010), + FILE_ATTR_DIRECTORY = cpu_to_le32(0x00000010), /* Note, FILE_ATTR_DIRECTORY is not considered valid in NT. It is reserved for the DOS SUBDIRECTORY flag. */ - FILE_ATTR_ARCHIVE = const_cpu_to_le32(0x00000020), - FILE_ATTR_DEVICE = const_cpu_to_le32(0x00000040), - FILE_ATTR_NORMAL = const_cpu_to_le32(0x00000080), + FILE_ATTR_ARCHIVE = cpu_to_le32(0x00000020), + FILE_ATTR_DEVICE = cpu_to_le32(0x00000040), + FILE_ATTR_NORMAL = cpu_to_le32(0x00000080), - FILE_ATTR_TEMPORARY = const_cpu_to_le32(0x00000100), - FILE_ATTR_SPARSE_FILE = const_cpu_to_le32(0x00000200), - FILE_ATTR_REPARSE_POINT = const_cpu_to_le32(0x00000400), - FILE_ATTR_COMPRESSED = const_cpu_to_le32(0x00000800), + FILE_ATTR_TEMPORARY = cpu_to_le32(0x00000100), + FILE_ATTR_SPARSE_FILE = cpu_to_le32(0x00000200), + FILE_ATTR_REPARSE_POINT = cpu_to_le32(0x00000400), + FILE_ATTR_COMPRESSED = cpu_to_le32(0x00000800), - FILE_ATTR_OFFLINE = const_cpu_to_le32(0x00001000), - FILE_ATTR_NOT_CONTENT_INDEXED = const_cpu_to_le32(0x00002000), - FILE_ATTR_ENCRYPTED = const_cpu_to_le32(0x00004000), + FILE_ATTR_OFFLINE = cpu_to_le32(0x00001000), + FILE_ATTR_NOT_CONTENT_INDEXED = cpu_to_le32(0x00002000), + FILE_ATTR_ENCRYPTED = cpu_to_le32(0x00004000), - FILE_ATTR_VALID_FLAGS = const_cpu_to_le32(0x00007fb7), + FILE_ATTR_VALID_FLAGS = cpu_to_le32(0x00007fb7), /* Note, FILE_ATTR_VALID_FLAGS masks out the old DOS VolId and the FILE_ATTR_DEVICE and preserves everything else. This mask is used to obtain all flags that are valid for reading. */ - FILE_ATTR_VALID_SET_FLAGS = const_cpu_to_le32(0x000031a7), + FILE_ATTR_VALID_SET_FLAGS = cpu_to_le32(0x000031a7), /* Note, FILE_ATTR_VALID_SET_FLAGS masks out the old DOS VolId, the F_A_DEVICE, F_A_DIRECTORY, F_A_SPARSE_FILE, F_A_REPARSE_POINT, F_A_COMPRESSED, and F_A_ENCRYPTED and preserves the rest. This mask @@ -846,11 +835,11 @@ enum { * FILENAME_ATTR attributes but not in the STANDARD_INFORMATION * attribute of an mft record. */ - FILE_ATTR_DUP_FILE_NAME_INDEX_PRESENT = const_cpu_to_le32(0x10000000), + FILE_ATTR_DUP_FILE_NAME_INDEX_PRESENT = cpu_to_le32(0x10000000), /* Note, this is a copy of the corresponding bit from the mft record, telling us whether this is a directory or not, i.e. whether it has an index root attribute or not. */ - FILE_ATTR_DUP_VIEW_INDEX_PRESENT = const_cpu_to_le32(0x20000000), + FILE_ATTR_DUP_VIEW_INDEX_PRESENT = cpu_to_le32(0x20000000), /* Note, this is a copy of the corresponding bit from the mft record, telling us whether this file has a view index present (eg. object id index, quota index, one of the security indexes or the encrypting @@ -1446,42 +1435,42 @@ enum { /* Specific rights for files and directories are as follows: */ /* Right to read data from the file. (FILE) */ - FILE_READ_DATA = const_cpu_to_le32(0x00000001), + FILE_READ_DATA = cpu_to_le32(0x00000001), /* Right to list contents of a directory. (DIRECTORY) */ - FILE_LIST_DIRECTORY = const_cpu_to_le32(0x00000001), + FILE_LIST_DIRECTORY = cpu_to_le32(0x00000001), /* Right to write data to the file. (FILE) */ - FILE_WRITE_DATA = const_cpu_to_le32(0x00000002), + FILE_WRITE_DATA = cpu_to_le32(0x00000002), /* Right to create a file in the directory. (DIRECTORY) */ - FILE_ADD_FILE = const_cpu_to_le32(0x00000002), + FILE_ADD_FILE = cpu_to_le32(0x00000002), /* Right to append data to the file. (FILE) */ - FILE_APPEND_DATA = const_cpu_to_le32(0x00000004), + FILE_APPEND_DATA = cpu_to_le32(0x00000004), /* Right to create a subdirectory. (DIRECTORY) */ - FILE_ADD_SUBDIRECTORY = const_cpu_to_le32(0x00000004), + FILE_ADD_SUBDIRECTORY = cpu_to_le32(0x00000004), /* Right to read extended attributes. (FILE/DIRECTORY) */ - FILE_READ_EA = const_cpu_to_le32(0x00000008), + FILE_READ_EA = cpu_to_le32(0x00000008), /* Right to write extended attributes. (FILE/DIRECTORY) */ - FILE_WRITE_EA = const_cpu_to_le32(0x00000010), + FILE_WRITE_EA = cpu_to_le32(0x00000010), /* Right to execute a file. (FILE) */ - FILE_EXECUTE = const_cpu_to_le32(0x00000020), + FILE_EXECUTE = cpu_to_le32(0x00000020), /* Right to traverse the directory. (DIRECTORY) */ - FILE_TRAVERSE = const_cpu_to_le32(0x00000020), + FILE_TRAVERSE = cpu_to_le32(0x00000020), /* * Right to delete a directory and all the files it contains (its * children), even if the files are read-only. (DIRECTORY) */ - FILE_DELETE_CHILD = const_cpu_to_le32(0x00000040), + FILE_DELETE_CHILD = cpu_to_le32(0x00000040), /* Right to read file attributes. (FILE/DIRECTORY) */ - FILE_READ_ATTRIBUTES = const_cpu_to_le32(0x00000080), + FILE_READ_ATTRIBUTES = cpu_to_le32(0x00000080), /* Right to change file attributes. (FILE/DIRECTORY) */ - FILE_WRITE_ATTRIBUTES = const_cpu_to_le32(0x00000100), + FILE_WRITE_ATTRIBUTES = cpu_to_le32(0x00000100), /* * The standard rights (bits 16 to 23). These are independent of the @@ -1489,27 +1478,27 @@ enum { */ /* Right to delete the object. */ - DELETE = const_cpu_to_le32(0x00010000), + DELETE = cpu_to_le32(0x00010000), /* * Right to read the information in the object's security descriptor, * not including the information in the SACL, i.e. right to read the * security descriptor and owner. */ - READ_CONTROL = const_cpu_to_le32(0x00020000), + READ_CONTROL = cpu_to_le32(0x00020000), /* Right to modify the DACL in the object's security descriptor. */ - WRITE_DAC = const_cpu_to_le32(0x00040000), + WRITE_DAC = cpu_to_le32(0x00040000), /* Right to change the owner in the object's security descriptor. */ - WRITE_OWNER = const_cpu_to_le32(0x00080000), + WRITE_OWNER = cpu_to_le32(0x00080000), /* * Right to use the object for synchronization. Enables a process to * wait until the object is in the signalled state. Some object types * do not support this access right. */ - SYNCHRONIZE = const_cpu_to_le32(0x00100000), + SYNCHRONIZE = cpu_to_le32(0x00100000), /* * The following STANDARD_RIGHTS_* are combinations of the above for @@ -1517,25 +1506,25 @@ enum { */ /* These are currently defined to READ_CONTROL. */ - STANDARD_RIGHTS_READ = const_cpu_to_le32(0x00020000), - STANDARD_RIGHTS_WRITE = const_cpu_to_le32(0x00020000), - STANDARD_RIGHTS_EXECUTE = const_cpu_to_le32(0x00020000), + STANDARD_RIGHTS_READ = cpu_to_le32(0x00020000), + STANDARD_RIGHTS_WRITE = cpu_to_le32(0x00020000), + STANDARD_RIGHTS_EXECUTE = cpu_to_le32(0x00020000), /* Combines DELETE, READ_CONTROL, WRITE_DAC, and WRITE_OWNER access. */ - STANDARD_RIGHTS_REQUIRED = const_cpu_to_le32(0x000f0000), + STANDARD_RIGHTS_REQUIRED = cpu_to_le32(0x000f0000), /* * Combines DELETE, READ_CONTROL, WRITE_DAC, WRITE_OWNER, and * SYNCHRONIZE access. */ - STANDARD_RIGHTS_ALL = const_cpu_to_le32(0x001f0000), + STANDARD_RIGHTS_ALL = cpu_to_le32(0x001f0000), /* * The access system ACL and maximum allowed access types (bits 24 to * 25, bits 26 to 27 are reserved). */ - ACCESS_SYSTEM_SECURITY = const_cpu_to_le32(0x01000000), - MAXIMUM_ALLOWED = const_cpu_to_le32(0x02000000), + ACCESS_SYSTEM_SECURITY = cpu_to_le32(0x01000000), + MAXIMUM_ALLOWED = cpu_to_le32(0x02000000), /* * The generic rights (bits 28 to 31). These map onto the standard and @@ -1543,10 +1532,10 @@ enum { */ /* Read, write, and execute access. */ - GENERIC_ALL = const_cpu_to_le32(0x10000000), + GENERIC_ALL = cpu_to_le32(0x10000000), /* Execute access. */ - GENERIC_EXECUTE = const_cpu_to_le32(0x20000000), + GENERIC_EXECUTE = cpu_to_le32(0x20000000), /* * Write access. For files, this maps onto: @@ -1555,7 +1544,7 @@ enum { * For directories, the mapping has the same numerical value. See * above for the descriptions of the rights granted. */ - GENERIC_WRITE = const_cpu_to_le32(0x40000000), + GENERIC_WRITE = cpu_to_le32(0x40000000), /* * Read access. For files, this maps onto: @@ -1564,7 +1553,7 @@ enum { * For directories, the mapping has the same numberical value. See * above for the descriptions of the rights granted. */ - GENERIC_READ = const_cpu_to_le32(0x80000000), + GENERIC_READ = cpu_to_le32(0x80000000), }; typedef le32 ACCESS_MASK; @@ -1604,8 +1593,8 @@ typedef struct { * The object ACE flags (32-bit). */ enum { - ACE_OBJECT_TYPE_PRESENT = const_cpu_to_le32(1), - ACE_INHERITED_OBJECT_TYPE_PRESENT = const_cpu_to_le32(2), + ACE_OBJECT_TYPE_PRESENT = cpu_to_le32(1), + ACE_INHERITED_OBJECT_TYPE_PRESENT = cpu_to_le32(2), }; typedef le32 OBJECT_ACE_FLAGS; @@ -1706,23 +1695,23 @@ typedef enum { * expressed as offsets from the beginning of the security descriptor. */ enum { - SE_OWNER_DEFAULTED = const_cpu_to_le16(0x0001), - SE_GROUP_DEFAULTED = const_cpu_to_le16(0x0002), - SE_DACL_PRESENT = const_cpu_to_le16(0x0004), - SE_DACL_DEFAULTED = const_cpu_to_le16(0x0008), + SE_OWNER_DEFAULTED = cpu_to_le16(0x0001), + SE_GROUP_DEFAULTED = cpu_to_le16(0x0002), + SE_DACL_PRESENT = cpu_to_le16(0x0004), + SE_DACL_DEFAULTED = cpu_to_le16(0x0008), - SE_SACL_PRESENT = const_cpu_to_le16(0x0010), - SE_SACL_DEFAULTED = const_cpu_to_le16(0x0020), + SE_SACL_PRESENT = cpu_to_le16(0x0010), + SE_SACL_DEFAULTED = cpu_to_le16(0x0020), - SE_DACL_AUTO_INHERIT_REQ = const_cpu_to_le16(0x0100), - SE_SACL_AUTO_INHERIT_REQ = const_cpu_to_le16(0x0200), - SE_DACL_AUTO_INHERITED = const_cpu_to_le16(0x0400), - SE_SACL_AUTO_INHERITED = const_cpu_to_le16(0x0800), + SE_DACL_AUTO_INHERIT_REQ = cpu_to_le16(0x0100), + SE_SACL_AUTO_INHERIT_REQ = cpu_to_le16(0x0200), + SE_DACL_AUTO_INHERITED = cpu_to_le16(0x0400), + SE_SACL_AUTO_INHERITED = cpu_to_le16(0x0800), - SE_DACL_PROTECTED = const_cpu_to_le16(0x1000), - SE_SACL_PROTECTED = const_cpu_to_le16(0x2000), - SE_RM_CONTROL_VALID = const_cpu_to_le16(0x4000), - SE_SELF_RELATIVE = const_cpu_to_le16(0x8000) + SE_DACL_PROTECTED = cpu_to_le16(0x1000), + SE_SACL_PROTECTED = cpu_to_le16(0x2000), + SE_RM_CONTROL_VALID = cpu_to_le16(0x4000), + SE_SELF_RELATIVE = cpu_to_le16(0x8000) } __attribute__ ((__packed__)); typedef le16 SECURITY_DESCRIPTOR_CONTROL; @@ -1910,21 +1899,21 @@ typedef struct { * Possible flags for the volume (16-bit). */ enum { - VOLUME_IS_DIRTY = const_cpu_to_le16(0x0001), - VOLUME_RESIZE_LOG_FILE = const_cpu_to_le16(0x0002), - VOLUME_UPGRADE_ON_MOUNT = const_cpu_to_le16(0x0004), - VOLUME_MOUNTED_ON_NT4 = const_cpu_to_le16(0x0008), + VOLUME_IS_DIRTY = cpu_to_le16(0x0001), + VOLUME_RESIZE_LOG_FILE = cpu_to_le16(0x0002), + VOLUME_UPGRADE_ON_MOUNT = cpu_to_le16(0x0004), + VOLUME_MOUNTED_ON_NT4 = cpu_to_le16(0x0008), - VOLUME_DELETE_USN_UNDERWAY = const_cpu_to_le16(0x0010), - VOLUME_REPAIR_OBJECT_ID = const_cpu_to_le16(0x0020), + VOLUME_DELETE_USN_UNDERWAY = cpu_to_le16(0x0010), + VOLUME_REPAIR_OBJECT_ID = cpu_to_le16(0x0020), - VOLUME_CHKDSK_UNDERWAY = const_cpu_to_le16(0x4000), - VOLUME_MODIFIED_BY_CHKDSK = const_cpu_to_le16(0x8000), + VOLUME_CHKDSK_UNDERWAY = cpu_to_le16(0x4000), + VOLUME_MODIFIED_BY_CHKDSK = cpu_to_le16(0x8000), - VOLUME_FLAGS_MASK = const_cpu_to_le16(0xc03f), + VOLUME_FLAGS_MASK = cpu_to_le16(0xc03f), /* To make our life easier when checking if we must mount read-only. */ - VOLUME_MUST_MOUNT_RO_MASK = const_cpu_to_le16(0xc027), + VOLUME_MUST_MOUNT_RO_MASK = cpu_to_le16(0xc027), } __attribute__ ((__packed__)); typedef le16 VOLUME_FLAGS; @@ -2109,26 +2098,26 @@ typedef struct { * The user quota flags. Names explain meaning. */ enum { - QUOTA_FLAG_DEFAULT_LIMITS = const_cpu_to_le32(0x00000001), - QUOTA_FLAG_LIMIT_REACHED = const_cpu_to_le32(0x00000002), - QUOTA_FLAG_ID_DELETED = const_cpu_to_le32(0x00000004), + QUOTA_FLAG_DEFAULT_LIMITS = cpu_to_le32(0x00000001), + QUOTA_FLAG_LIMIT_REACHED = cpu_to_le32(0x00000002), + QUOTA_FLAG_ID_DELETED = cpu_to_le32(0x00000004), - QUOTA_FLAG_USER_MASK = const_cpu_to_le32(0x00000007), + QUOTA_FLAG_USER_MASK = cpu_to_le32(0x00000007), /* This is a bit mask for the user quota flags. */ /* * These flags are only present in the quota defaults index entry, i.e. * in the entry where owner_id = QUOTA_DEFAULTS_ID. */ - QUOTA_FLAG_TRACKING_ENABLED = const_cpu_to_le32(0x00000010), - QUOTA_FLAG_ENFORCEMENT_ENABLED = const_cpu_to_le32(0x00000020), - QUOTA_FLAG_TRACKING_REQUESTED = const_cpu_to_le32(0x00000040), - QUOTA_FLAG_LOG_THRESHOLD = const_cpu_to_le32(0x00000080), + QUOTA_FLAG_TRACKING_ENABLED = cpu_to_le32(0x00000010), + QUOTA_FLAG_ENFORCEMENT_ENABLED = cpu_to_le32(0x00000020), + QUOTA_FLAG_TRACKING_REQUESTED = cpu_to_le32(0x00000040), + QUOTA_FLAG_LOG_THRESHOLD = cpu_to_le32(0x00000080), - QUOTA_FLAG_LOG_LIMIT = const_cpu_to_le32(0x00000100), - QUOTA_FLAG_OUT_OF_DATE = const_cpu_to_le32(0x00000200), - QUOTA_FLAG_CORRUPT = const_cpu_to_le32(0x00000400), - QUOTA_FLAG_PENDING_DELETES = const_cpu_to_le32(0x00000800), + QUOTA_FLAG_LOG_LIMIT = cpu_to_le32(0x00000100), + QUOTA_FLAG_OUT_OF_DATE = cpu_to_le32(0x00000200), + QUOTA_FLAG_CORRUPT = cpu_to_le32(0x00000400), + QUOTA_FLAG_PENDING_DELETES = cpu_to_le32(0x00000800), }; typedef le32 QUOTA_FLAGS; @@ -2172,9 +2161,9 @@ typedef struct { * Predefined owner_id values (32-bit). */ enum { - QUOTA_INVALID_ID = const_cpu_to_le32(0x00000000), - QUOTA_DEFAULTS_ID = const_cpu_to_le32(0x00000001), - QUOTA_FIRST_USER_ID = const_cpu_to_le32(0x00000100), + QUOTA_INVALID_ID = cpu_to_le32(0x00000000), + QUOTA_DEFAULTS_ID = cpu_to_le32(0x00000001), + QUOTA_FIRST_USER_ID = cpu_to_le32(0x00000100), }; /* @@ -2189,14 +2178,14 @@ typedef enum { * Index entry flags (16-bit). */ enum { - INDEX_ENTRY_NODE = const_cpu_to_le16(1), /* This entry contains a + INDEX_ENTRY_NODE = cpu_to_le16(1), /* This entry contains a sub-node, i.e. a reference to an index block in form of a virtual cluster number (see below). */ - INDEX_ENTRY_END = const_cpu_to_le16(2), /* This signifies the last + INDEX_ENTRY_END = cpu_to_le16(2), /* This signifies the last entry in an index block. The index entry does not represent a file but it can point to a sub-node. */ - INDEX_ENTRY_SPACE_FILLER = const_cpu_to_le16(0xffff), /* gcc: Force + INDEX_ENTRY_SPACE_FILLER = cpu_to_le16(0xffff), /* gcc: Force enum bit width to 16-bit. */ } __attribute__ ((__packed__)); @@ -2334,26 +2323,26 @@ typedef struct { * These are the predefined reparse point tags: */ enum { - IO_REPARSE_TAG_IS_ALIAS = const_cpu_to_le32(0x20000000), - IO_REPARSE_TAG_IS_HIGH_LATENCY = const_cpu_to_le32(0x40000000), - IO_REPARSE_TAG_IS_MICROSOFT = const_cpu_to_le32(0x80000000), + IO_REPARSE_TAG_IS_ALIAS = cpu_to_le32(0x20000000), + IO_REPARSE_TAG_IS_HIGH_LATENCY = cpu_to_le32(0x40000000), + IO_REPARSE_TAG_IS_MICROSOFT = cpu_to_le32(0x80000000), - IO_REPARSE_TAG_RESERVED_ZERO = const_cpu_to_le32(0x00000000), - IO_REPARSE_TAG_RESERVED_ONE = const_cpu_to_le32(0x00000001), - IO_REPARSE_TAG_RESERVED_RANGE = const_cpu_to_le32(0x00000001), + IO_REPARSE_TAG_RESERVED_ZERO = cpu_to_le32(0x00000000), + IO_REPARSE_TAG_RESERVED_ONE = cpu_to_le32(0x00000001), + IO_REPARSE_TAG_RESERVED_RANGE = cpu_to_le32(0x00000001), - IO_REPARSE_TAG_NSS = const_cpu_to_le32(0x68000005), - IO_REPARSE_TAG_NSS_RECOVER = const_cpu_to_le32(0x68000006), - IO_REPARSE_TAG_SIS = const_cpu_to_le32(0x68000007), - IO_REPARSE_TAG_DFS = const_cpu_to_le32(0x68000008), + IO_REPARSE_TAG_NSS = cpu_to_le32(0x68000005), + IO_REPARSE_TAG_NSS_RECOVER = cpu_to_le32(0x68000006), + IO_REPARSE_TAG_SIS = cpu_to_le32(0x68000007), + IO_REPARSE_TAG_DFS = cpu_to_le32(0x68000008), - IO_REPARSE_TAG_MOUNT_POINT = const_cpu_to_le32(0x88000003), + IO_REPARSE_TAG_MOUNT_POINT = cpu_to_le32(0x88000003), - IO_REPARSE_TAG_HSM = const_cpu_to_le32(0xa8000004), + IO_REPARSE_TAG_HSM = cpu_to_le32(0xa8000004), - IO_REPARSE_TAG_SYMBOLIC_LINK = const_cpu_to_le32(0xe8000000), + IO_REPARSE_TAG_SYMBOLIC_LINK = cpu_to_le32(0xe8000000), - IO_REPARSE_TAG_VALID_VALUES = const_cpu_to_le32(0xe000ffff), + IO_REPARSE_TAG_VALID_VALUES = cpu_to_le32(0xe000ffff), }; /* diff --git a/fs/ntfs/logfile.h b/fs/ntfs/logfile.h index 9468e1c45ae3..b5a6f08bd35c 100644 --- a/fs/ntfs/logfile.h +++ b/fs/ntfs/logfile.h @@ -104,7 +104,7 @@ typedef struct { * in this particular client array. Also inside the client records themselves, * this means that there are no client records preceding or following this one. */ -#define LOGFILE_NO_CLIENT const_cpu_to_le16(0xffff) +#define LOGFILE_NO_CLIENT cpu_to_le16(0xffff) #define LOGFILE_NO_CLIENT_CPU 0xffff /* @@ -112,8 +112,8 @@ typedef struct { * information about the log file in which they are present. */ enum { - RESTART_VOLUME_IS_CLEAN = const_cpu_to_le16(0x0002), - RESTART_SPACE_FILLER = const_cpu_to_le16(0xffff), /* gcc: Force enum bit width to 16. */ + RESTART_VOLUME_IS_CLEAN = cpu_to_le16(0x0002), + RESTART_SPACE_FILLER = cpu_to_le16(0xffff), /* gcc: Force enum bit width to 16. */ } __attribute__ ((__packed__)); typedef le16 RESTART_AREA_FLAGS; diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c index 17d32ca6bc35..23bf68453d7d 100644 --- a/fs/ntfs/mft.c +++ b/fs/ntfs/mft.c @@ -2839,7 +2839,7 @@ int ntfs_extent_mft_record_free(ntfs_inode *ni, MFT_RECORD *m) */ /* Mark the mft record as not in use. */ - m->flags &= const_cpu_to_le16(~const_le16_to_cpu(MFT_RECORD_IN_USE)); + m->flags &= ~MFT_RECORD_IN_USE; /* Increment the sequence number, skipping zero, if it is not zero. */ old_seq_no = m->sequence_number; diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c index 4a46743b5077..f76951dcd4a6 100644 --- a/fs/ntfs/super.c +++ b/fs/ntfs/super.c @@ -618,7 +618,7 @@ static bool is_boot_sector_ntfs(const struct super_block *sb, * many BIOSes will refuse to boot from a bootsector if the magic is * incorrect, so we emit a warning. */ - if (!silent && b->end_of_sector_marker != const_cpu_to_le16(0xaa55)) + if (!silent && b->end_of_sector_marker != cpu_to_le16(0xaa55)) ntfs_warning(sb, "Invalid end of sector marker."); return true; not_ntfs: @@ -1242,13 +1242,13 @@ static int check_windows_hibernation_status(ntfs_volume *vol) u32 *kaddr, *kend; ntfs_name *name = NULL; int ret = 1; - static const ntfschar hiberfil[13] = { const_cpu_to_le16('h'), - const_cpu_to_le16('i'), const_cpu_to_le16('b'), - const_cpu_to_le16('e'), const_cpu_to_le16('r'), - const_cpu_to_le16('f'), const_cpu_to_le16('i'), - const_cpu_to_le16('l'), const_cpu_to_le16('.'), - const_cpu_to_le16('s'), const_cpu_to_le16('y'), - const_cpu_to_le16('s'), 0 }; + static const ntfschar hiberfil[13] = { cpu_to_le16('h'), + cpu_to_le16('i'), cpu_to_le16('b'), + cpu_to_le16('e'), cpu_to_le16('r'), + cpu_to_le16('f'), cpu_to_le16('i'), + cpu_to_le16('l'), cpu_to_le16('.'), + cpu_to_le16('s'), cpu_to_le16('y'), + cpu_to_le16('s'), 0 }; ntfs_debug("Entering."); /* @@ -1296,7 +1296,7 @@ static int check_windows_hibernation_status(ntfs_volume *vol) goto iput_out; } kaddr = (u32*)page_address(page); - if (*(le32*)kaddr == const_cpu_to_le32(0x72626968)/*'hibr'*/) { + if (*(le32*)kaddr == cpu_to_le32(0x72626968)/*'hibr'*/) { ntfs_debug("Magic \"hibr\" found in hiberfil.sys. Windows is " "hibernated on the volume. This is the " "system volume."); @@ -1337,12 +1337,12 @@ static bool load_and_init_quota(ntfs_volume *vol) MFT_REF mref; struct inode *tmp_ino; ntfs_name *name = NULL; - static const ntfschar Quota[7] = { const_cpu_to_le16('$'), - const_cpu_to_le16('Q'), const_cpu_to_le16('u'), - const_cpu_to_le16('o'), const_cpu_to_le16('t'), - const_cpu_to_le16('a'), 0 }; - static ntfschar Q[3] = { const_cpu_to_le16('$'), - const_cpu_to_le16('Q'), 0 }; + static const ntfschar Quota[7] = { cpu_to_le16('$'), + cpu_to_le16('Q'), cpu_to_le16('u'), + cpu_to_le16('o'), cpu_to_le16('t'), + cpu_to_le16('a'), 0 }; + static ntfschar Q[3] = { cpu_to_le16('$'), + cpu_to_le16('Q'), 0 }; ntfs_debug("Entering."); /* @@ -1416,16 +1416,16 @@ static bool load_and_init_usnjrnl(ntfs_volume *vol) struct page *page; ntfs_name *name = NULL; USN_HEADER *uh; - static const ntfschar UsnJrnl[9] = { const_cpu_to_le16('$'), - const_cpu_to_le16('U'), const_cpu_to_le16('s'), - const_cpu_to_le16('n'), const_cpu_to_le16('J'), - const_cpu_to_le16('r'), const_cpu_to_le16('n'), - const_cpu_to_le16('l'), 0 }; - static ntfschar Max[5] = { const_cpu_to_le16('$'), - const_cpu_to_le16('M'), const_cpu_to_le16('a'), - const_cpu_to_le16('x'), 0 }; - static ntfschar J[3] = { const_cpu_to_le16('$'), - const_cpu_to_le16('J'), 0 }; + static const ntfschar UsnJrnl[9] = { cpu_to_le16('$'), + cpu_to_le16('U'), cpu_to_le16('s'), + cpu_to_le16('n'), cpu_to_le16('J'), + cpu_to_le16('r'), cpu_to_le16('n'), + cpu_to_le16('l'), 0 }; + static ntfschar Max[5] = { cpu_to_le16('$'), + cpu_to_le16('M'), cpu_to_le16('a'), + cpu_to_le16('x'), 0 }; + static ntfschar J[3] = { cpu_to_le16('$'), + cpu_to_le16('J'), 0 }; ntfs_debug("Entering."); /* diff --git a/fs/ntfs/usnjrnl.h b/fs/ntfs/usnjrnl.h index 4087fbdac327..00d8e6bd7c36 100644 --- a/fs/ntfs/usnjrnl.h +++ b/fs/ntfs/usnjrnl.h @@ -116,27 +116,27 @@ typedef struct { * documentation: http://www.linux-ntfs.org/ */ enum { - USN_REASON_DATA_OVERWRITE = const_cpu_to_le32(0x00000001), - USN_REASON_DATA_EXTEND = const_cpu_to_le32(0x00000002), - USN_REASON_DATA_TRUNCATION = const_cpu_to_le32(0x00000004), - USN_REASON_NAMED_DATA_OVERWRITE = const_cpu_to_le32(0x00000010), - USN_REASON_NAMED_DATA_EXTEND = const_cpu_to_le32(0x00000020), - USN_REASON_NAMED_DATA_TRUNCATION= const_cpu_to_le32(0x00000040), - USN_REASON_FILE_CREATE = const_cpu_to_le32(0x00000100), - USN_REASON_FILE_DELETE = const_cpu_to_le32(0x00000200), - USN_REASON_EA_CHANGE = const_cpu_to_le32(0x00000400), - USN_REASON_SECURITY_CHANGE = const_cpu_to_le32(0x00000800), - USN_REASON_RENAME_OLD_NAME = const_cpu_to_le32(0x00001000), - USN_REASON_RENAME_NEW_NAME = const_cpu_to_le32(0x00002000), - USN_REASON_INDEXABLE_CHANGE = const_cpu_to_le32(0x00004000), - USN_REASON_BASIC_INFO_CHANGE = const_cpu_to_le32(0x00008000), - USN_REASON_HARD_LINK_CHANGE = const_cpu_to_le32(0x00010000), - USN_REASON_COMPRESSION_CHANGE = const_cpu_to_le32(0x00020000), - USN_REASON_ENCRYPTION_CHANGE = const_cpu_to_le32(0x00040000), - USN_REASON_OBJECT_ID_CHANGE = const_cpu_to_le32(0x00080000), - USN_REASON_REPARSE_POINT_CHANGE = const_cpu_to_le32(0x00100000), - USN_REASON_STREAM_CHANGE = const_cpu_to_le32(0x00200000), - USN_REASON_CLOSE = const_cpu_to_le32(0x80000000), + USN_REASON_DATA_OVERWRITE = cpu_to_le32(0x00000001), + USN_REASON_DATA_EXTEND = cpu_to_le32(0x00000002), + USN_REASON_DATA_TRUNCATION = cpu_to_le32(0x00000004), + USN_REASON_NAMED_DATA_OVERWRITE = cpu_to_le32(0x00000010), + USN_REASON_NAMED_DATA_EXTEND = cpu_to_le32(0x00000020), + USN_REASON_NAMED_DATA_TRUNCATION= cpu_to_le32(0x00000040), + USN_REASON_FILE_CREATE = cpu_to_le32(0x00000100), + USN_REASON_FILE_DELETE = cpu_to_le32(0x00000200), + USN_REASON_EA_CHANGE = cpu_to_le32(0x00000400), + USN_REASON_SECURITY_CHANGE = cpu_to_le32(0x00000800), + USN_REASON_RENAME_OLD_NAME = cpu_to_le32(0x00001000), + USN_REASON_RENAME_NEW_NAME = cpu_to_le32(0x00002000), + USN_REASON_INDEXABLE_CHANGE = cpu_to_le32(0x00004000), + USN_REASON_BASIC_INFO_CHANGE = cpu_to_le32(0x00008000), + USN_REASON_HARD_LINK_CHANGE = cpu_to_le32(0x00010000), + USN_REASON_COMPRESSION_CHANGE = cpu_to_le32(0x00020000), + USN_REASON_ENCRYPTION_CHANGE = cpu_to_le32(0x00040000), + USN_REASON_OBJECT_ID_CHANGE = cpu_to_le32(0x00080000), + USN_REASON_REPARSE_POINT_CHANGE = cpu_to_le32(0x00100000), + USN_REASON_STREAM_CHANGE = cpu_to_le32(0x00200000), + USN_REASON_CLOSE = cpu_to_le32(0x80000000), }; typedef le32 USN_REASON_FLAGS; @@ -148,9 +148,9 @@ typedef le32 USN_REASON_FLAGS; * http://www.linux-ntfs.org/ */ enum { - USN_SOURCE_DATA_MANAGEMENT = const_cpu_to_le32(0x00000001), - USN_SOURCE_AUXILIARY_DATA = const_cpu_to_le32(0x00000002), - USN_SOURCE_REPLICATION_MANAGEMENT = const_cpu_to_le32(0x00000004), + USN_SOURCE_DATA_MANAGEMENT = cpu_to_le32(0x00000001), + USN_SOURCE_AUXILIARY_DATA = cpu_to_le32(0x00000002), + USN_SOURCE_REPLICATION_MANAGEMENT = cpu_to_le32(0x00000004), }; typedef le32 USN_SOURCE_INFO_FLAGS; From 891f7d73ea30f925596b90bcf21020bfc5d90f3f Mon Sep 17 00:00:00 2001 From: David Altobelli Date: Tue, 31 Mar 2009 15:23:53 -0700 Subject: [PATCH 361/478] hpilo: reduce frequency of IO operations Change hpilo open and close logic to spin for 10usec between checking device, rather than every usec. Because the loop is coded to take up to 10ms, it seemed prudent to increase the interval between polling the device, to reduce the load on the system and allow more other work to happen. Signed-off-by: David Altobelli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/misc/hpilo.c | 6 +++--- drivers/misc/hpilo.h | 6 +++++- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/drivers/misc/hpilo.c b/drivers/misc/hpilo.c index cf991850f01b..880ccf39e23b 100644 --- a/drivers/misc/hpilo.c +++ b/drivers/misc/hpilo.c @@ -209,7 +209,7 @@ static void ilo_ccb_close(struct pci_dev *pdev, struct ccb_data *data) /* give iLO some time to process stop request */ for (retries = MAX_WAIT; retries > 0; retries--) { doorbell_set(driver_ccb); - udelay(1); + udelay(WAIT_TIME); if (!(ioread32(&device_ccb->send_ctrl) & (1 << CTRL_BITPOS_A)) && !(ioread32(&device_ccb->recv_ctrl) & (1 << CTRL_BITPOS_A))) @@ -312,7 +312,7 @@ static int ilo_ccb_open(struct ilo_hwinfo *hw, struct ccb_data *data, int slot) for (i = MAX_WAIT; i > 0; i--) { if (ilo_pkt_dequeue(hw, driver_ccb, SENDQ, &pkt_id, NULL, NULL)) break; - udelay(1); + udelay(WAIT_TIME); } if (i) { @@ -759,7 +759,7 @@ static void __exit ilo_exit(void) class_destroy(ilo_class); } -MODULE_VERSION("1.0"); +MODULE_VERSION("1.1"); MODULE_ALIAS(ILO_NAME); MODULE_DESCRIPTION(ILO_NAME); MODULE_AUTHOR("David Altobelli "); diff --git a/drivers/misc/hpilo.h b/drivers/misc/hpilo.h index b64a20ef07e3..03a14c82aad9 100644 --- a/drivers/misc/hpilo.h +++ b/drivers/misc/hpilo.h @@ -19,8 +19,12 @@ #define MAX_ILO_DEV 1 /* max number of files */ #define MAX_OPEN (MAX_CCB * MAX_ILO_DEV) +/* total wait time in usec */ +#define MAX_WAIT_TIME 10000 +/* per spin wait time in usec */ +#define WAIT_TIME 10 /* spin counter for open/close delay */ -#define MAX_WAIT 10000 +#define MAX_WAIT (MAX_WAIT_TIME / WAIT_TIME) /* * Per device, used to track global memory allocations. From 3cdbbeebb77348176bd6a03fd86e11bc281c529e Mon Sep 17 00:00:00 2001 From: Daniel Mack Date: Tue, 31 Mar 2009 15:23:53 -0700 Subject: [PATCH 362/478] drivers/misc/isl29003.c: driver for the ISL29003 ambient light sensor Add a driver for Intersil's ISL29003 ambient light sensor device plus some documentation. Inspired by tsl2550.c, a driver for a similar device. It is put in drivers/misc for now until the industrial I/O framework gets merged. Signed-off-by: Daniel Mack Acked-by: Jonathan Cameron Cc: Jean Delvare Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/misc-devices/isl29003 | 62 ++++ drivers/misc/Kconfig | 10 + drivers/misc/Makefile | 1 + drivers/misc/isl29003.c | 470 ++++++++++++++++++++++++++++ 4 files changed, 543 insertions(+) create mode 100644 Documentation/misc-devices/isl29003 create mode 100644 drivers/misc/isl29003.c diff --git a/Documentation/misc-devices/isl29003 b/Documentation/misc-devices/isl29003 new file mode 100644 index 000000000000..c4ff5f38e010 --- /dev/null +++ b/Documentation/misc-devices/isl29003 @@ -0,0 +1,62 @@ +Kernel driver isl29003 +===================== + +Supported chips: +* Intersil ISL29003 +Prefix: 'isl29003' +Addresses scanned: none +Datasheet: +http://www.intersil.com/data/fn/fn7464.pdf + +Author: Daniel Mack + + +Description +----------- +The ISL29003 is an integrated light sensor with a 16-bit integrating type +ADC, I2C user programmable lux range select for optimized counts/lux, and +I2C multi-function control and monitoring capabilities. The internal ADC +provides 16-bit resolution while rejecting 50Hz and 60Hz flicker caused by +artificial light sources. + +The driver allows to set the lux range, the bit resolution, the operational +mode (see below) and the power state of device and can read the current lux +value, of course. + + +Detection +--------- + +The ISL29003 does not have an ID register which could be used to identify +it, so the detection routine will just try to read from the configured I2C +addess and consider the device to be present as soon as it ACKs the +transfer. + + +Sysfs entries +------------- + +range: + 0: 0 lux to 1000 lux (default) + 1: 0 lux to 4000 lux + 2: 0 lux to 16,000 lux + 3: 0 lux to 64,000 lux + +resolution: + 0: 2^16 cycles (default) + 1: 2^12 cycles + 2: 2^8 cycles + 3: 2^4 cycles + +mode: + 0: diode1's current (unsigned 16bit) (default) + 1: diode1's current (unsigned 16bit) + 2: difference between diodes (l1 - l2, signed 15bit) + +power_state: + 0: device is disabled (default) + 1: device is enabled + +lux (read only): + returns the value from the last sensor reading + diff --git a/drivers/misc/Kconfig b/drivers/misc/Kconfig index 1c484084ed4f..5f3bff434621 100644 --- a/drivers/misc/Kconfig +++ b/drivers/misc/Kconfig @@ -223,6 +223,16 @@ config DELL_LAPTOP This driver adds support for rfkill and backlight control to Dell laptops. +config ISL29003 + tristate "Intersil ISL29003 ambient light sensor" + depends on I2C && SYSFS + help + If you say yes here you get support for the Intersil ISL29003 + ambient light sensor. + + This driver can also be built as a module. If so, the module + will be called isl29003. + source "drivers/misc/c2port/Kconfig" source "drivers/misc/eeprom/Kconfig" diff --git a/drivers/misc/Makefile b/drivers/misc/Makefile index bc1199830554..7871f05dcb9b 100644 --- a/drivers/misc/Makefile +++ b/drivers/misc/Makefile @@ -18,5 +18,6 @@ obj-$(CONFIG_KGDB_TESTS) += kgdbts.o obj-$(CONFIG_SGI_XP) += sgi-xp/ obj-$(CONFIG_SGI_GRU) += sgi-gru/ obj-$(CONFIG_HP_ILO) += hpilo.o +obj-$(CONFIG_ISL29003) += isl29003.o obj-$(CONFIG_C2PORT) += c2port/ obj-y += eeprom/ diff --git a/drivers/misc/isl29003.c b/drivers/misc/isl29003.c new file mode 100644 index 000000000000..2e2a5923d4c2 --- /dev/null +++ b/drivers/misc/isl29003.c @@ -0,0 +1,470 @@ +/* + * isl29003.c - Linux kernel module for + * Intersil ISL29003 ambient light sensor + * + * See file:Documentation/misc-devices/isl29003 + * + * Copyright (c) 2009 Daniel Mack + * + * Based on code written by + * Rodolfo Giometti + * Eurotech S.p.A. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#include +#include +#include +#include +#include +#include + +#define ISL29003_DRV_NAME "isl29003" +#define DRIVER_VERSION "1.0" + +#define ISL29003_REG_COMMAND 0x00 +#define ISL29003_ADC_ENABLED (1 << 7) +#define ISL29003_ADC_PD (1 << 6) +#define ISL29003_TIMING_INT (1 << 5) +#define ISL29003_MODE_SHIFT (2) +#define ISL29003_MODE_MASK (0x3 << ISL29003_MODE_SHIFT) +#define ISL29003_RES_SHIFT (0) +#define ISL29003_RES_MASK (0x3 << ISL29003_RES_SHIFT) + +#define ISL29003_REG_CONTROL 0x01 +#define ISL29003_INT_FLG (1 << 5) +#define ISL29003_RANGE_SHIFT (2) +#define ISL29003_RANGE_MASK (0x3 << ISL29003_RANGE_SHIFT) +#define ISL29003_INT_PERSISTS_SHIFT (0) +#define ISL29003_INT_PERSISTS_MASK (0xf << ISL29003_INT_PERSISTS_SHIFT) + +#define ISL29003_REG_IRQ_THRESH_HI 0x02 +#define ISL29003_REG_IRQ_THRESH_LO 0x03 +#define ISL29003_REG_LSB_SENSOR 0x04 +#define ISL29003_REG_MSB_SENSOR 0x05 +#define ISL29003_REG_LSB_TIMER 0x06 +#define ISL29003_REG_MSB_TIMER 0x07 + +#define ISL29003_NUM_CACHABLE_REGS 4 + +struct isl29003_data { + struct i2c_client *client; + struct mutex lock; + u8 reg_cache[ISL29003_NUM_CACHABLE_REGS]; +}; + +static int gain_range[] = { + 1000, 4000, 16000, 64000 +}; + +/* + * register access helpers + */ + +static int __isl29003_read_reg(struct i2c_client *client, + u32 reg, u8 mask, u8 shift) +{ + struct isl29003_data *data = i2c_get_clientdata(client); + return (data->reg_cache[reg] & mask) >> shift; +} + +static int __isl29003_write_reg(struct i2c_client *client, + u32 reg, u8 mask, u8 shift, u8 val) +{ + struct isl29003_data *data = i2c_get_clientdata(client); + int ret = 0; + u8 tmp; + + if (reg >= ISL29003_NUM_CACHABLE_REGS) + return -EINVAL; + + mutex_lock(&data->lock); + + tmp = data->reg_cache[reg]; + tmp &= ~mask; + tmp |= val << shift; + + ret = i2c_smbus_write_byte_data(client, reg, tmp); + if (!ret) + data->reg_cache[reg] = tmp; + + mutex_unlock(&data->lock); + return ret; +} + +/* + * internally used functions + */ + +/* range */ +static int isl29003_get_range(struct i2c_client *client) +{ + return __isl29003_read_reg(client, ISL29003_REG_CONTROL, + ISL29003_RANGE_MASK, ISL29003_RANGE_SHIFT); +} + +static int isl29003_set_range(struct i2c_client *client, int range) +{ + return __isl29003_write_reg(client, ISL29003_REG_CONTROL, + ISL29003_RANGE_MASK, ISL29003_RANGE_SHIFT, range); +} + +/* resolution */ +static int isl29003_get_resolution(struct i2c_client *client) +{ + return __isl29003_read_reg(client, ISL29003_REG_COMMAND, + ISL29003_RES_MASK, ISL29003_RES_SHIFT); +} + +static int isl29003_set_resolution(struct i2c_client *client, int res) +{ + return __isl29003_write_reg(client, ISL29003_REG_COMMAND, + ISL29003_RES_MASK, ISL29003_RES_SHIFT, res); +} + +/* mode */ +static int isl29003_get_mode(struct i2c_client *client) +{ + return __isl29003_read_reg(client, ISL29003_REG_COMMAND, + ISL29003_RES_MASK, ISL29003_RES_SHIFT); +} + +static int isl29003_set_mode(struct i2c_client *client, int mode) +{ + return __isl29003_write_reg(client, ISL29003_REG_COMMAND, + ISL29003_RES_MASK, ISL29003_RES_SHIFT, mode); +} + +/* power_state */ +static int isl29003_set_power_state(struct i2c_client *client, int state) +{ + return __isl29003_write_reg(client, ISL29003_REG_COMMAND, + ISL29003_ADC_ENABLED | ISL29003_ADC_PD, 0, + state ? ISL29003_ADC_ENABLED : ISL29003_ADC_PD); +} + +static int isl29003_get_power_state(struct i2c_client *client) +{ + struct isl29003_data *data = i2c_get_clientdata(client); + u8 cmdreg = data->reg_cache[ISL29003_REG_COMMAND]; + return ~cmdreg & ISL29003_ADC_PD; +} + +static int isl29003_get_adc_value(struct i2c_client *client) +{ + struct isl29003_data *data = i2c_get_clientdata(client); + int lsb, msb, range, bitdepth; + + mutex_lock(&data->lock); + lsb = i2c_smbus_read_byte_data(client, ISL29003_REG_LSB_SENSOR); + + if (lsb < 0) { + mutex_unlock(&data->lock); + return lsb; + } + + msb = i2c_smbus_read_byte_data(client, ISL29003_REG_MSB_SENSOR); + mutex_unlock(&data->lock); + + if (msb < 0) + return msb; + + range = isl29003_get_range(client); + bitdepth = (4 - isl29003_get_resolution(client)) * 4; + return (((msb << 8) | lsb) * gain_range[range]) >> bitdepth; +} + +/* + * sysfs layer + */ + +/* range */ +static ssize_t isl29003_show_range(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct i2c_client *client = to_i2c_client(dev); + return sprintf(buf, "%i\n", isl29003_get_range(client)); +} + +static ssize_t isl29003_store_range(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct i2c_client *client = to_i2c_client(dev); + unsigned long val; + int ret; + + if ((strict_strtoul(buf, 10, &val) < 0) || (val > 3)) + return -EINVAL; + + ret = isl29003_set_range(client, val); + if (ret < 0) + return ret; + + return count; +} + +static DEVICE_ATTR(range, S_IWUSR | S_IRUGO, + isl29003_show_range, isl29003_store_range); + + +/* resolution */ +static ssize_t isl29003_show_resolution(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct i2c_client *client = to_i2c_client(dev); + return sprintf(buf, "%d\n", isl29003_get_resolution(client)); +} + +static ssize_t isl29003_store_resolution(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct i2c_client *client = to_i2c_client(dev); + unsigned long val; + int ret; + + if ((strict_strtoul(buf, 10, &val) < 0) || (val > 3)) + return -EINVAL; + + ret = isl29003_set_resolution(client, val); + if (ret < 0) + return ret; + + return count; +} + +static DEVICE_ATTR(resolution, S_IWUSR | S_IRUGO, + isl29003_show_resolution, isl29003_store_resolution); + +/* mode */ +static ssize_t isl29003_show_mode(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct i2c_client *client = to_i2c_client(dev); + return sprintf(buf, "%d\n", isl29003_get_mode(client)); +} + +static ssize_t isl29003_store_mode(struct device *dev, + struct device_attribute *attr, const char *buf, size_t count) +{ + struct i2c_client *client = to_i2c_client(dev); + unsigned long val; + int ret; + + if ((strict_strtoul(buf, 10, &val) < 0) || (val > 2)) + return -EINVAL; + + ret = isl29003_set_mode(client, val); + if (ret < 0) + return ret; + + return count; +} + +static DEVICE_ATTR(mode, S_IWUSR | S_IRUGO, + isl29003_show_mode, isl29003_store_mode); + + +/* power state */ +static ssize_t isl29003_show_power_state(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct i2c_client *client = to_i2c_client(dev); + return sprintf(buf, "%d\n", isl29003_get_power_state(client)); +} + +static ssize_t isl29003_store_power_state(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct i2c_client *client = to_i2c_client(dev); + unsigned long val; + int ret; + + if ((strict_strtoul(buf, 10, &val) < 0) || (val > 1)) + return -EINVAL; + + ret = isl29003_set_power_state(client, val); + return ret ? ret : count; +} + +static DEVICE_ATTR(power_state, S_IWUSR | S_IRUGO, + isl29003_show_power_state, isl29003_store_power_state); + + +/* lux */ +static ssize_t isl29003_show_lux(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct i2c_client *client = to_i2c_client(dev); + + /* No LUX data if not operational */ + if (!isl29003_get_power_state(client)) + return -EBUSY; + + return sprintf(buf, "%d\n", isl29003_get_adc_value(client)); +} + +static DEVICE_ATTR(lux, S_IRUGO, isl29003_show_lux, NULL); + +static struct attribute *isl29003_attributes[] = { + &dev_attr_range.attr, + &dev_attr_resolution.attr, + &dev_attr_mode.attr, + &dev_attr_power_state.attr, + &dev_attr_lux.attr, + NULL +}; + +static const struct attribute_group isl29003_attr_group = { + .attrs = isl29003_attributes, +}; + +static int isl29003_init_client(struct i2c_client *client) +{ + struct isl29003_data *data = i2c_get_clientdata(client); + int i; + + /* read all the registers once to fill the cache. + * if one of the reads fails, we consider the init failed */ + for (i = 0; i < ARRAY_SIZE(data->reg_cache); i++) { + int v = i2c_smbus_read_byte_data(client, i); + if (v < 0) + return -ENODEV; + + data->reg_cache[i] = v; + } + + /* set defaults */ + isl29003_set_range(client, 0); + isl29003_set_resolution(client, 0); + isl29003_set_mode(client, 0); + isl29003_set_power_state(client, 0); + + return 0; +} + +/* + * I2C layer + */ + +static int __devinit isl29003_probe(struct i2c_client *client, + const struct i2c_device_id *id) +{ + struct i2c_adapter *adapter = to_i2c_adapter(client->dev.parent); + struct isl29003_data *data; + int err = 0; + + if (!i2c_check_functionality(adapter, I2C_FUNC_SMBUS_BYTE)) + return -EIO; + + data = kzalloc(sizeof(struct isl29003_data), GFP_KERNEL); + if (!data) + return -ENOMEM; + + data->client = client; + i2c_set_clientdata(client, data); + mutex_init(&data->lock); + + /* initialize the ISL29003 chip */ + err = isl29003_init_client(client); + if (err) + goto exit_kfree; + + /* register sysfs hooks */ + err = sysfs_create_group(&client->dev.kobj, &isl29003_attr_group); + if (err) + goto exit_kfree; + + dev_info(&client->dev, "driver version %s enabled\n", DRIVER_VERSION); + return 0; + +exit_kfree: + kfree(data); + return err; +} + +static int __devexit isl29003_remove(struct i2c_client *client) +{ + sysfs_remove_group(&client->dev.kobj, &isl29003_attr_group); + isl29003_set_power_state(client, 0); + kfree(i2c_get_clientdata(client)); + return 0; +} + +#ifdef CONFIG_PM +static int isl29003_suspend(struct i2c_client *client, pm_message_t mesg) +{ + return isl29003_set_power_state(client, 0); +} + +static int isl29003_resume(struct i2c_client *client) +{ + int i; + struct isl29003_data *data = i2c_get_clientdata(client); + + /* restore registers from cache */ + for (i = 0; i < ARRAY_SIZE(data->reg_cache); i++) + if (!i2c_smbus_write_byte_data(client, i, data->reg_cache[i])) + return -EIO; + + return 0; +} + +#else +#define isl29003_suspend NULL +#define isl29003_resume NULL +#endif /* CONFIG_PM */ + +static const struct i2c_device_id isl29003_id[] = { + { "isl29003", 0 }, + {} +}; +MODULE_DEVICE_TABLE(i2c, isl29003_id); + +static struct i2c_driver isl29003_driver = { + .driver = { + .name = ISL29003_DRV_NAME, + .owner = THIS_MODULE, + }, + .suspend = isl29003_suspend, + .resume = isl29003_resume, + .probe = isl29003_probe, + .remove = __devexit_p(isl29003_remove), + .id_table = isl29003_id, +}; + +static int __init isl29003_init(void) +{ + return i2c_add_driver(&isl29003_driver); +} + +static void __exit isl29003_exit(void) +{ + i2c_del_driver(&isl29003_driver); +} + +MODULE_AUTHOR("Daniel Mack "); +MODULE_DESCRIPTION("ISL29003 ambient light sensor driver"); +MODULE_LICENSE("GPL v2"); +MODULE_VERSION(DRIVER_VERSION); + +module_init(isl29003_init); +module_exit(isl29003_exit); + From 5071f97ec6d74f006072de0ce89b67c8792fe5a1 Mon Sep 17 00:00:00 2001 From: Davide Libenzi Date: Tue, 31 Mar 2009 15:24:10 -0700 Subject: [PATCH 363/478] epoll: fix epoll's own poll Fix a bug inside the epoll's f_op->poll() code, that returns POLLIN even though there are no actual ready monitored fds. The bug shows up if you add an epoll fd inside another fd container (poll, select, epoll). The problem is that callback-based wake ups used by epoll does not carry (patches will follow, to fix this) any information about the events that actually happened. So the callback code, since it can't call the file* ->poll() inside the callback, chains the file* into a ready-list. So, suppose you added an fd with EPOLLOUT only, and some data shows up on the fd, the file* mapped by the fd will be added into the ready-list (via wakeup callback). During normal epoll_wait() use, this condition is sorted out at the time we're actually able to call the file*'s f_op->poll(). Inside the old epoll's f_op->poll() though, only a quick check !list_empty(ready-list) was performed, and this could have led to reporting POLLIN even though no ready fds would show up at a following epoll_wait(). In order to correctly report the ready status for an epoll fd, the ready-list must be checked to see if any really available fd+event would be ready in a following epoll_wait(). Operation (calling f_op->poll() from inside f_op->poll()) that, like wake ups, must be handled with care because of the fact that epoll fds can be added to other epoll fds. Test code: /* * epoll_test by Davide Libenzi (Simple code to test epoll internals) * Copyright (C) 2008 Davide Libenzi * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA * * Davide Libenzi * */ #include #include #include #include #include #include #include #include #include #include #include #define EPWAIT_TIMEO (1 * 1000) #ifndef POLLRDHUP #define POLLRDHUP 0x2000 #endif #define EPOLL_MAX_CHAIN 100L #define EPOLL_TF_LOOP (1 << 0) struct epoll_test_cfg { long size; long flags; }; static int xepoll_create(int n) { int epfd; if ((epfd = epoll_create(n)) == -1) { perror("epoll_create"); exit(2); } return epfd; } static void xepoll_ctl(int epfd, int cmd, int fd, struct epoll_event *evt) { if (epoll_ctl(epfd, cmd, fd, evt) < 0) { perror("epoll_ctl"); exit(3); } } static void xpipe(int *fds) { if (pipe(fds)) { perror("pipe"); exit(4); } } static pid_t xfork(void) { pid_t pid; if ((pid = fork()) == (pid_t) -1) { perror("pipe"); exit(5); } return pid; } static int run_forked_proc(int (*proc)(void *), void *data) { int status; pid_t pid; if ((pid = xfork()) == 0) exit((*proc)(data)); if (waitpid(pid, &status, 0) != pid) { perror("waitpid"); return -1; } return WIFEXITED(status) ? WEXITSTATUS(status): -2; } static int check_events(int fd, int timeo) { struct pollfd pfd; fprintf(stdout, "Checking events for fd %d\n", fd); memset(&pfd, 0, sizeof(pfd)); pfd.fd = fd; pfd.events = POLLIN | POLLOUT; if (poll(&pfd, 1, timeo) < 0) { perror("poll()"); return 0; } if (pfd.revents & POLLIN) fprintf(stdout, "\tPOLLIN\n"); if (pfd.revents & POLLOUT) fprintf(stdout, "\tPOLLOUT\n"); if (pfd.revents & POLLERR) fprintf(stdout, "\tPOLLERR\n"); if (pfd.revents & POLLHUP) fprintf(stdout, "\tPOLLHUP\n"); if (pfd.revents & POLLRDHUP) fprintf(stdout, "\tPOLLRDHUP\n"); return pfd.revents; } static int epoll_test_tty(void *data) { int epfd, ifd = fileno(stdin), res; struct epoll_event evt; if (check_events(ifd, 0) != POLLOUT) { fprintf(stderr, "Something is cooking on STDIN (%d)\n", ifd); return 1; } epfd = xepoll_create(1); fprintf(stdout, "Created epoll fd (%d)\n", epfd); memset(&evt, 0, sizeof(evt)); evt.events = EPOLLIN; xepoll_ctl(epfd, EPOLL_CTL_ADD, ifd, &evt); if (check_events(epfd, 0) & POLLIN) { res = epoll_wait(epfd, &evt, 1, 0); if (res == 0) { fprintf(stderr, "Epoll fd (%d) is ready when it shouldn't!\n", epfd); return 2; } } return 0; } static int epoll_wakeup_chain(void *data) { struct epoll_test_cfg *tcfg = data; int i, res, epfd, bfd, nfd, pfds[2]; pid_t pid; struct epoll_event evt; memset(&evt, 0, sizeof(evt)); evt.events = EPOLLIN; epfd = bfd = xepoll_create(1); for (i = 0; i < tcfg->size; i++) { nfd = xepoll_create(1); xepoll_ctl(bfd, EPOLL_CTL_ADD, nfd, &evt); bfd = nfd; } xpipe(pfds); if (tcfg->flags & EPOLL_TF_LOOP) { xepoll_ctl(bfd, EPOLL_CTL_ADD, epfd, &evt); /* * If we're testing for loop, we want that the wakeup * triggered by the write to the pipe done in the child * process, triggers a fake event. So we add the pipe * read size with EPOLLOUT events. This will trigger * an addition to the ready-list, but no real events * will be there. The the epoll kernel code will proceed * in calling f_op->poll() of the epfd, triggering the * loop we want to test. */ evt.events = EPOLLOUT; } xepoll_ctl(bfd, EPOLL_CTL_ADD, pfds[0], &evt); /* * The pipe write must come after the poll(2) call inside * check_events(). This tests the nested wakeup code in * fs/eventpoll.c:ep_poll_safewake() * By having the check_events() (hence poll(2)) happens first, * we have poll wait queue filled up, and the write(2) in the * child will trigger the wakeup chain. */ if ((pid = xfork()) == 0) { sleep(1); write(pfds[1], "w", 1); exit(0); } res = check_events(epfd, 2000) & POLLIN; if (waitpid(pid, NULL, 0) != pid) { perror("waitpid"); return -1; } return res; } static int epoll_poll_chain(void *data) { struct epoll_test_cfg *tcfg = data; int i, res, epfd, bfd, nfd, pfds[2]; pid_t pid; struct epoll_event evt; memset(&evt, 0, sizeof(evt)); evt.events = EPOLLIN; epfd = bfd = xepoll_create(1); for (i = 0; i < tcfg->size; i++) { nfd = xepoll_create(1); xepoll_ctl(bfd, EPOLL_CTL_ADD, nfd, &evt); bfd = nfd; } xpipe(pfds); if (tcfg->flags & EPOLL_TF_LOOP) { xepoll_ctl(bfd, EPOLL_CTL_ADD, epfd, &evt); /* * If we're testing for loop, we want that the wakeup * triggered by the write to the pipe done in the child * process, triggers a fake event. So we add the pipe * read size with EPOLLOUT events. This will trigger * an addition to the ready-list, but no real events * will be there. The the epoll kernel code will proceed * in calling f_op->poll() of the epfd, triggering the * loop we want to test. */ evt.events = EPOLLOUT; } xepoll_ctl(bfd, EPOLL_CTL_ADD, pfds[0], &evt); /* * The pipe write mush come before the poll(2) call inside * check_events(). This tests the nested f_op->poll calls code in * fs/eventpoll.c:ep_eventpoll_poll() * By having the pipe write(2) happen first, we make the kernel * epoll code to load the ready lists, and the following poll(2) * done inside check_events() will test nested poll code in * ep_eventpoll_poll(). */ if ((pid = xfork()) == 0) { write(pfds[1], "w", 1); exit(0); } sleep(1); res = check_events(epfd, 1000) & POLLIN; if (waitpid(pid, NULL, 0) != pid) { perror("waitpid"); return -1; } return res; } int main(int ac, char **av) { int error; struct epoll_test_cfg tcfg; fprintf(stdout, "\n********** Testing TTY events\n"); error = run_forked_proc(epoll_test_tty, NULL); fprintf(stdout, error == 0 ? "********** OK\n": "********** FAIL (%d)\n", error); tcfg.size = 3; tcfg.flags = 0; fprintf(stdout, "\n********** Testing short wakeup chain\n"); error = run_forked_proc(epoll_wakeup_chain, &tcfg); fprintf(stdout, error == POLLIN ? "********** OK\n": "********** FAIL (%d)\n", error); tcfg.size = EPOLL_MAX_CHAIN; tcfg.flags = 0; fprintf(stdout, "\n********** Testing long wakeup chain (HOLD ON)\n"); error = run_forked_proc(epoll_wakeup_chain, &tcfg); fprintf(stdout, error == 0 ? "********** OK\n": "********** FAIL (%d)\n", error); tcfg.size = 3; tcfg.flags = 0; fprintf(stdout, "\n********** Testing short poll chain\n"); error = run_forked_proc(epoll_poll_chain, &tcfg); fprintf(stdout, error == POLLIN ? "********** OK\n": "********** FAIL (%d)\n", error); tcfg.size = EPOLL_MAX_CHAIN; tcfg.flags = 0; fprintf(stdout, "\n********** Testing long poll chain (HOLD ON)\n"); error = run_forked_proc(epoll_poll_chain, &tcfg); fprintf(stdout, error == 0 ? "********** OK\n": "********** FAIL (%d)\n", error); tcfg.size = 3; tcfg.flags = EPOLL_TF_LOOP; fprintf(stdout, "\n********** Testing loopy wakeup chain (HOLD ON)\n"); error = run_forked_proc(epoll_wakeup_chain, &tcfg); fprintf(stdout, error == 0 ? "********** OK\n": "********** FAIL (%d)\n", error); tcfg.size = 3; tcfg.flags = EPOLL_TF_LOOP; fprintf(stdout, "\n********** Testing loopy poll chain (HOLD ON)\n"); error = run_forked_proc(epoll_poll_chain, &tcfg); fprintf(stdout, error == 0 ? "********** OK\n": "********** FAIL (%d)\n", error); return 0; } Signed-off-by: Davide Libenzi Cc: Pavel Pisa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/eventpoll.c | 539 +++++++++++++++++++++++++++++-------------------- 1 file changed, 318 insertions(+), 221 deletions(-) diff --git a/fs/eventpoll.c b/fs/eventpoll.c index c5c424f23fd5..8a23a91e1377 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -1,6 +1,6 @@ /* - * fs/eventpoll.c (Efficent event polling implementation) - * Copyright (C) 2001,...,2007 Davide Libenzi + * fs/eventpoll.c (Efficient event retrieval implementation) + * Copyright (C) 2001,...,2009 Davide Libenzi * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -92,8 +92,8 @@ /* Epoll private bits inside the event mask */ #define EP_PRIVATE_BITS (EPOLLONESHOT | EPOLLET) -/* Maximum number of poll wake up nests we are allowing */ -#define EP_MAX_POLLWAKE_NESTS 4 +/* Maximum number of nesting allowed inside epoll sets */ +#define EP_MAX_NESTS 4 /* Maximum msec timeout value storeable in a long int */ #define EP_MAX_MSTIMEO min(1000ULL * MAX_SCHEDULE_TIMEOUT / HZ, (LONG_MAX - 999ULL) / HZ) @@ -110,24 +110,21 @@ struct epoll_filefd { }; /* - * Node that is linked into the "wake_task_list" member of the "struct poll_safewake". - * It is used to keep track on all tasks that are currently inside the wake_up() code - * to 1) short-circuit the one coming from the same task and same wait queue head - * (loop) 2) allow a maximum number of epoll descriptors inclusion nesting - * 3) let go the ones coming from other tasks. + * Structure used to track possible nested calls, for too deep recursions + * and loop cycles. */ -struct wake_task_node { +struct nested_call_node { struct list_head llink; struct task_struct *task; - wait_queue_head_t *wq; + void *cookie; }; /* - * This is used to implement the safe poll wake up avoiding to reenter - * the poll callback from inside wake_up(). + * This structure is used as collector for nested calls, to check for + * maximum recursion dept and loop cycles. */ -struct poll_safewake { - struct list_head wake_task_list; +struct nested_calls { + struct list_head tasks_call_list; spinlock_t lock; }; @@ -231,6 +228,12 @@ struct ep_pqueue { struct epitem *epi; }; +/* Used by the ep_send_events() function as callback private data */ +struct ep_send_events_data { + int maxevents; + struct epoll_event __user *events; +}; + /* * Configuration options available inside /proc/sys/fs/epoll/ */ @@ -242,8 +245,11 @@ static int max_user_watches __read_mostly; */ static DEFINE_MUTEX(epmutex); -/* Safe wake up implementation */ -static struct poll_safewake psw; +/* Used for safe wake up implementation */ +static struct nested_calls poll_safewake_ncalls; + +/* Used to call file's f_op->poll() under the nested calls boundaries */ +static struct nested_calls poll_readywalk_ncalls; /* Slab cache used to allocate "struct epitem" */ static struct kmem_cache *epi_cache __read_mostly; @@ -312,11 +318,80 @@ static inline int ep_op_has_event(int op) } /* Initialize the poll safe wake up structure */ -static void ep_poll_safewake_init(struct poll_safewake *psw) +static void ep_nested_calls_init(struct nested_calls *ncalls) { + INIT_LIST_HEAD(&ncalls->tasks_call_list); + spin_lock_init(&ncalls->lock); +} - INIT_LIST_HEAD(&psw->wake_task_list); - spin_lock_init(&psw->lock); +/** + * ep_call_nested - Perform a bound (possibly) nested call, by checking + * that the recursion limit is not exceeded, and that + * the same nested call (by the meaning of same cookie) is + * no re-entered. + * + * @ncalls: Pointer to the nested_calls structure to be used for this call. + * @max_nests: Maximum number of allowed nesting calls. + * @nproc: Nested call core function pointer. + * @priv: Opaque data to be passed to the @nproc callback. + * @cookie: Cookie to be used to identify this nested call. + * + * Returns: Returns the code returned by the @nproc callback, or -1 if + * the maximum recursion limit has been exceeded. + */ +static int ep_call_nested(struct nested_calls *ncalls, int max_nests, + int (*nproc)(void *, void *, int), void *priv, + void *cookie) +{ + int error, call_nests = 0; + unsigned long flags; + struct task_struct *this_task = current; + struct list_head *lsthead = &ncalls->tasks_call_list; + struct nested_call_node *tncur; + struct nested_call_node tnode; + + spin_lock_irqsave(&ncalls->lock, flags); + + /* + * Try to see if the current task is already inside this wakeup call. + * We use a list here, since the population inside this set is always + * very much limited. + */ + list_for_each_entry(tncur, lsthead, llink) { + if (tncur->task == this_task && + (tncur->cookie == cookie || ++call_nests > max_nests)) { + /* + * Ops ... loop detected or maximum nest level reached. + * We abort this wake by breaking the cycle itself. + */ + spin_unlock_irqrestore(&ncalls->lock, flags); + + return -1; + } + } + + /* Add the current task and cookie to the list */ + tnode.task = this_task; + tnode.cookie = cookie; + list_add(&tnode.llink, lsthead); + + spin_unlock_irqrestore(&ncalls->lock, flags); + + /* Call the nested function */ + error = (*nproc)(priv, cookie, call_nests); + + /* Remove the current task from the list */ + spin_lock_irqsave(&ncalls->lock, flags); + list_del(&tnode.llink); + spin_unlock_irqrestore(&ncalls->lock, flags); + + return error; +} + +static int ep_poll_wakeup_proc(void *priv, void *cookie, int call_nests) +{ + wake_up_nested((wait_queue_head_t *) cookie, 1 + call_nests); + return 0; } /* @@ -324,52 +399,15 @@ static void ep_poll_safewake_init(struct poll_safewake *psw) * with the new callback'd wake up system, it is possible that the * poll callback is reentered from inside the call to wake_up() done * on the poll wait queue head. The rule is that we cannot reenter the - * wake up code from the same task more than EP_MAX_POLLWAKE_NESTS times, + * wake up code from the same task more than EP_MAX_NESTS times, * and we cannot reenter the same wait queue head at all. This will * enable to have a hierarchy of epoll file descriptor of no more than - * EP_MAX_POLLWAKE_NESTS deep. We need the irq version of the spin lock - * because this one gets called by the poll callback, that in turn is called - * from inside a wake_up(), that might be called from irq context. + * EP_MAX_NESTS deep. */ -static void ep_poll_safewake(struct poll_safewake *psw, wait_queue_head_t *wq) +static void ep_poll_safewake(wait_queue_head_t *wq) { - int wake_nests = 0; - unsigned long flags; - struct task_struct *this_task = current; - struct list_head *lsthead = &psw->wake_task_list; - struct wake_task_node *tncur; - struct wake_task_node tnode; - - spin_lock_irqsave(&psw->lock, flags); - - /* Try to see if the current task is already inside this wakeup call */ - list_for_each_entry(tncur, lsthead, llink) { - - if (tncur->wq == wq || - (tncur->task == this_task && ++wake_nests > EP_MAX_POLLWAKE_NESTS)) { - /* - * Ops ... loop detected or maximum nest level reached. - * We abort this wake by breaking the cycle itself. - */ - spin_unlock_irqrestore(&psw->lock, flags); - return; - } - } - - /* Add the current task to the list */ - tnode.task = this_task; - tnode.wq = wq; - list_add(&tnode.llink, lsthead); - - spin_unlock_irqrestore(&psw->lock, flags); - - /* Do really wake up now */ - wake_up_nested(wq, 1 + wake_nests); - - /* Remove the current task from the list */ - spin_lock_irqsave(&psw->lock, flags); - list_del(&tnode.llink); - spin_unlock_irqrestore(&psw->lock, flags); + ep_call_nested(&poll_safewake_ncalls, EP_MAX_NESTS, + ep_poll_wakeup_proc, NULL, wq); } /* @@ -397,6 +435,104 @@ static void ep_unregister_pollwait(struct eventpoll *ep, struct epitem *epi) } } +/** + * ep_scan_ready_list - Scans the ready list in a way that makes possible for + * the scan code, to call f_op->poll(). Also allows for + * O(NumReady) performance. + * + * @ep: Pointer to the epoll private data structure. + * @sproc: Pointer to the scan callback. + * @priv: Private opaque data passed to the @sproc callback. + * + * Returns: The same integer error code returned by the @sproc callback. + */ +static int ep_scan_ready_list(struct eventpoll *ep, + int (*sproc)(struct eventpoll *, + struct list_head *, void *), + void *priv) +{ + int error, pwake = 0; + unsigned long flags; + struct epitem *epi, *nepi; + struct list_head txlist; + + INIT_LIST_HEAD(&txlist); + + /* + * We need to lock this because we could be hit by + * eventpoll_release_file() and epoll_ctl(EPOLL_CTL_DEL). + */ + mutex_lock(&ep->mtx); + + /* + * Steal the ready list, and re-init the original one to the + * empty list. Also, set ep->ovflist to NULL so that events + * happening while looping w/out locks, are not lost. We cannot + * have the poll callback to queue directly on ep->rdllist, + * because we want the "sproc" callback to be able to do it + * in a lockless way. + */ + spin_lock_irqsave(&ep->lock, flags); + list_splice(&ep->rdllist, &txlist); + INIT_LIST_HEAD(&ep->rdllist); + ep->ovflist = NULL; + spin_unlock_irqrestore(&ep->lock, flags); + + /* + * Now call the callback function. + */ + error = (*sproc)(ep, &txlist, priv); + + spin_lock_irqsave(&ep->lock, flags); + /* + * During the time we spent inside the "sproc" callback, some + * other events might have been queued by the poll callback. + * We re-insert them inside the main ready-list here. + */ + for (nepi = ep->ovflist; (epi = nepi) != NULL; + nepi = epi->next, epi->next = EP_UNACTIVE_PTR) { + /* + * We need to check if the item is already in the list. + * During the "sproc" callback execution time, items are + * queued into ->ovflist but the "txlist" might already + * contain them, and the list_splice() below takes care of them. + */ + if (!ep_is_linked(&epi->rdllink)) + list_add_tail(&epi->rdllink, &ep->rdllist); + } + /* + * We need to set back ep->ovflist to EP_UNACTIVE_PTR, so that after + * releasing the lock, events will be queued in the normal way inside + * ep->rdllist. + */ + ep->ovflist = EP_UNACTIVE_PTR; + + /* + * Quickly re-inject items left on "txlist". + */ + list_splice(&txlist, &ep->rdllist); + + if (!list_empty(&ep->rdllist)) { + /* + * Wake up (if active) both the eventpoll wait list and the ->poll() + * wait list (delayed after we release the lock). + */ + if (waitqueue_active(&ep->wq)) + wake_up_locked(&ep->wq); + if (waitqueue_active(&ep->poll_wait)) + pwake++; + } + spin_unlock_irqrestore(&ep->lock, flags); + + mutex_unlock(&ep->mtx); + + /* We have to call this outside the lock */ + if (pwake) + ep_poll_safewake(&ep->poll_wait); + + return error; +} + /* * Removes a "struct epitem" from the eventpoll RB tree and deallocates * all the associated resources. Must be called with "mtx" held. @@ -447,7 +583,7 @@ static void ep_free(struct eventpoll *ep) /* We need to release all tasks waiting for these file */ if (waitqueue_active(&ep->poll_wait)) - ep_poll_safewake(&psw, &ep->poll_wait); + ep_poll_safewake(&ep->poll_wait); /* * We need to lock this because we could be hit by @@ -496,22 +632,49 @@ static int ep_eventpoll_release(struct inode *inode, struct file *file) return 0; } +static int ep_read_events_proc(struct eventpoll *ep, struct list_head *head, void *priv) +{ + struct epitem *epi, *tmp; + + list_for_each_entry_safe(epi, tmp, head, rdllink) { + if (epi->ffd.file->f_op->poll(epi->ffd.file, NULL) & + epi->event.events) + return POLLIN | POLLRDNORM; + else + /* + * Item has been dropped into the ready list by the poll + * callback, but it's not actually ready, as far as + * caller requested events goes. We can remove it here. + */ + list_del_init(&epi->rdllink); + } + + return 0; +} + +static int ep_poll_readyevents_proc(void *priv, void *cookie, int call_nests) +{ + return ep_scan_ready_list(priv, ep_read_events_proc, NULL); +} + static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait) { - unsigned int pollflags = 0; - unsigned long flags; + int pollflags; struct eventpoll *ep = file->private_data; /* Insert inside our poll wait queue */ poll_wait(file, &ep->poll_wait, wait); - /* Check our condition */ - spin_lock_irqsave(&ep->lock, flags); - if (!list_empty(&ep->rdllist)) - pollflags = POLLIN | POLLRDNORM; - spin_unlock_irqrestore(&ep->lock, flags); + /* + * Proceed to find out if wanted events are really available inside + * the ready list. This need to be done under ep_call_nested() + * supervision, since the call to f_op->poll() done on listed files + * could re-enter here. + */ + pollflags = ep_call_nested(&poll_readywalk_ncalls, EP_MAX_NESTS, + ep_poll_readyevents_proc, ep, ep); - return pollflags; + return pollflags != -1 ? pollflags: 0; } /* File callbacks that implement the eventpoll file behaviour */ @@ -541,7 +704,7 @@ void eventpoll_release_file(struct file *file) * We don't want to get "file->f_lock" because it is not * necessary. It is not necessary because we're in the "struct file" * cleanup path, and this means that noone is using this file anymore. - * So, for example, epoll_ctl() cannot hit here sicne if we reach this + * So, for example, epoll_ctl() cannot hit here since if we reach this * point, the file counter already went to zero and fget() would fail. * The only hit might come from ep_free() but by holding the mutex * will correctly serialize the operation. We do need to acquire @@ -670,12 +833,9 @@ static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *k } /* If this file is already in the ready list we exit soon */ - if (ep_is_linked(&epi->rdllink)) - goto is_linked; + if (!ep_is_linked(&epi->rdllink)) + list_add_tail(&epi->rdllink, &ep->rdllist); - list_add_tail(&epi->rdllink, &ep->rdllist); - -is_linked: /* * Wake up ( if active ) both the eventpoll wait list and the ->poll() * wait list. @@ -690,7 +850,7 @@ out_unlock: /* We have to call this outside the lock */ if (pwake) - ep_poll_safewake(&psw, &ep->poll_wait); + ep_poll_safewake(&ep->poll_wait); return 1; } @@ -712,10 +872,9 @@ static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead, add_wait_queue(whead, &pwq->wait); list_add_tail(&pwq->llink, &epi->pwqlist); epi->nwait++; - } else { + } else /* We have to signal that an error occurred */ epi->nwait = -1; - } } static void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi) @@ -817,7 +976,7 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event, /* We have to call this outside the lock */ if (pwake) - ep_poll_safewake(&psw, &ep->poll_wait); + ep_poll_safewake(&ep->poll_wait); DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_insert(%p, %p, %d)\n", current, ep, tfile, fd)); @@ -891,137 +1050,74 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even /* We have to call this outside the lock */ if (pwake) - ep_poll_safewake(&psw, &ep->poll_wait); + ep_poll_safewake(&ep->poll_wait); return 0; } +static int ep_send_events_proc(struct eventpoll *ep, struct list_head *head, void *priv) +{ + struct ep_send_events_data *esed = priv; + int eventcnt; + unsigned int revents; + struct epitem *epi; + struct epoll_event __user *uevent; + + /* + * We can loop without lock because we are passed a task private list. + * Items cannot vanish during the loop because ep_scan_ready_list() is + * holding "mtx" during this call. + */ + for (eventcnt = 0, uevent = esed->events; + !list_empty(head) && eventcnt < esed->maxevents;) { + epi = list_first_entry(head, struct epitem, rdllink); + + list_del_init(&epi->rdllink); + + revents = epi->ffd.file->f_op->poll(epi->ffd.file, NULL) & + epi->event.events; + + /* + * If the event mask intersect the caller-requested one, + * deliver the event to userspace. Again, ep_scan_ready_list() + * is holding "mtx", so no operations coming from userspace + * can change the item. + */ + if (revents) { + if (__put_user(revents, &uevent->events) || + __put_user(epi->event.data, &uevent->data)) + return eventcnt ? eventcnt: -EFAULT; + eventcnt++; + uevent++; + if (epi->event.events & EPOLLONESHOT) + epi->event.events &= EP_PRIVATE_BITS; + else if (!(epi->event.events & EPOLLET)) + /* + * If this file has been added with Level Trigger + * mode, we need to insert back inside the ready + * list, so that the next call to epoll_wait() + * will check again the events availability. + * At this point, noone can insert into ep->rdllist + * besides us. The epoll_ctl() callers are locked + * out by ep_scan_ready_list() holding "mtx" and + * the poll callback will queue them in ep->ovflist. + */ + list_add_tail(&epi->rdllink, &ep->rdllist); + } + } + + return eventcnt; +} + static int ep_send_events(struct eventpoll *ep, struct epoll_event __user *events, int maxevents) { - int eventcnt, error = -EFAULT, pwake = 0; - unsigned int revents; - unsigned long flags; - struct epitem *epi, *nepi; - struct list_head txlist; + struct ep_send_events_data esed; - INIT_LIST_HEAD(&txlist); + esed.maxevents = maxevents; + esed.events = events; - /* - * We need to lock this because we could be hit by - * eventpoll_release_file() and epoll_ctl(EPOLL_CTL_DEL). - */ - mutex_lock(&ep->mtx); - - /* - * Steal the ready list, and re-init the original one to the - * empty list. Also, set ep->ovflist to NULL so that events - * happening while looping w/out locks, are not lost. We cannot - * have the poll callback to queue directly on ep->rdllist, - * because we are doing it in the loop below, in a lockless way. - */ - spin_lock_irqsave(&ep->lock, flags); - list_splice(&ep->rdllist, &txlist); - INIT_LIST_HEAD(&ep->rdllist); - ep->ovflist = NULL; - spin_unlock_irqrestore(&ep->lock, flags); - - /* - * We can loop without lock because this is a task private list. - * We just splice'd out the ep->rdllist in ep_collect_ready_items(). - * Items cannot vanish during the loop because we are holding "mtx". - */ - for (eventcnt = 0; !list_empty(&txlist) && eventcnt < maxevents;) { - epi = list_first_entry(&txlist, struct epitem, rdllink); - - list_del_init(&epi->rdllink); - - /* - * Get the ready file event set. We can safely use the file - * because we are holding the "mtx" and this will guarantee - * that both the file and the item will not vanish. - */ - revents = epi->ffd.file->f_op->poll(epi->ffd.file, NULL); - revents &= epi->event.events; - - /* - * Is the event mask intersect the caller-requested one, - * deliver the event to userspace. Again, we are holding - * "mtx", so no operations coming from userspace can change - * the item. - */ - if (revents) { - if (__put_user(revents, - &events[eventcnt].events) || - __put_user(epi->event.data, - &events[eventcnt].data)) - goto errxit; - if (epi->event.events & EPOLLONESHOT) - epi->event.events &= EP_PRIVATE_BITS; - eventcnt++; - } - /* - * At this point, noone can insert into ep->rdllist besides - * us. The epoll_ctl() callers are locked out by us holding - * "mtx" and the poll callback will queue them in ep->ovflist. - */ - if (!(epi->event.events & EPOLLET) && - (revents & epi->event.events)) - list_add_tail(&epi->rdllink, &ep->rdllist); - } - error = 0; - -errxit: - - spin_lock_irqsave(&ep->lock, flags); - /* - * During the time we spent in the loop above, some other events - * might have been queued by the poll callback. We re-insert them - * inside the main ready-list here. - */ - for (nepi = ep->ovflist; (epi = nepi) != NULL; - nepi = epi->next, epi->next = EP_UNACTIVE_PTR) { - /* - * If the above loop quit with errors, the epoll item might still - * be linked to "txlist", and the list_splice() done below will - * take care of those cases. - */ - if (!ep_is_linked(&epi->rdllink)) - list_add_tail(&epi->rdllink, &ep->rdllist); - } - /* - * We need to set back ep->ovflist to EP_UNACTIVE_PTR, so that after - * releasing the lock, events will be queued in the normal way inside - * ep->rdllist. - */ - ep->ovflist = EP_UNACTIVE_PTR; - - /* - * In case of error in the event-send loop, or in case the number of - * ready events exceeds the userspace limit, we need to splice the - * "txlist" back inside ep->rdllist. - */ - list_splice(&txlist, &ep->rdllist); - - if (!list_empty(&ep->rdllist)) { - /* - * Wake up (if active) both the eventpoll wait list and the ->poll() - * wait list (delayed after we release the lock). - */ - if (waitqueue_active(&ep->wq)) - wake_up_locked(&ep->wq); - if (waitqueue_active(&ep->poll_wait)) - pwake++; - } - spin_unlock_irqrestore(&ep->lock, flags); - - mutex_unlock(&ep->mtx); - - /* We have to call this outside the lock */ - if (pwake) - ep_poll_safewake(&psw, &ep->poll_wait); - - return eventcnt == 0 ? error: eventcnt; + return ep_scan_ready_list(ep, ep_send_events_proc, &esed); } static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, @@ -1033,7 +1129,7 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, wait_queue_t wait; /* - * Calculate the timeout by checking for the "infinite" value ( -1 ) + * Calculate the timeout by checking for the "infinite" value (-1) * and the overflow condition. The passed timeout is in milliseconds, * that why (t * HZ) / 1000. */ @@ -1076,9 +1172,8 @@ retry: set_current_state(TASK_RUNNING); } - /* Is it worth to try to dig for events ? */ - eavail = !list_empty(&ep->rdllist); + eavail = !list_empty(&ep->rdllist) || ep->ovflist != EP_UNACTIVE_PTR; spin_unlock_irqrestore(&ep->lock, flags); @@ -1099,41 +1194,40 @@ retry: */ SYSCALL_DEFINE1(epoll_create1, int, flags) { - int error, fd = -1; - struct eventpoll *ep; + int error; + struct eventpoll *ep = NULL; /* Check the EPOLL_* constant for consistency. */ BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC); - if (flags & ~EPOLL_CLOEXEC) - return -EINVAL; - DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d)\n", current, flags)); + error = -EINVAL; + if (flags & ~EPOLL_CLOEXEC) + goto error_return; + /* - * Create the internal data structure ( "struct eventpoll" ). + * Create the internal data structure ("struct eventpoll"). */ error = ep_alloc(&ep); - if (error < 0) { - fd = error; + if (error < 0) goto error_return; - } /* * Creates all the items needed to setup an eventpoll file. That is, * a file structure and a free file descriptor. */ - fd = anon_inode_getfd("[eventpoll]", &eventpoll_fops, ep, - flags & O_CLOEXEC); - if (fd < 0) + error = anon_inode_getfd("[eventpoll]", &eventpoll_fops, ep, + flags & O_CLOEXEC); + if (error < 0) ep_free(ep); error_return: DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d) = %d\n", - current, flags, fd)); + current, flags, error)); - return fd; + return error; } SYSCALL_DEFINE1(epoll_create, int, size) @@ -1359,7 +1453,10 @@ static int __init eventpoll_init(void) EP_ITEM_COST; /* Initialize the structure used to perform safe poll wait head wake ups */ - ep_poll_safewake_init(&psw); + ep_nested_calls_init(&poll_safewake_ncalls); + + /* Initialize the structure used to perform file's f_op->poll() calls */ + ep_nested_calls_init(&poll_readywalk_ncalls); /* Allocates slab cache used to allocate "struct epitem" items */ epi_cache = kmem_cache_create("eventpoll_epi", sizeof(struct epitem), From 296e236e96dddef351a1809c0d414bcddfcf3800 Mon Sep 17 00:00:00 2001 From: Davide Libenzi Date: Tue, 31 Mar 2009 15:24:11 -0700 Subject: [PATCH 364/478] epoll: fix epoll's own poll (update) Signed-off-by: Davide Libenzi Cc: Pavel Pisa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/eventpoll.c | 110 +++++++++++++++++++++++++------------------------ 1 file changed, 57 insertions(+), 53 deletions(-) diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 8a23a91e1377..e24d137c93b6 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -454,9 +454,7 @@ static int ep_scan_ready_list(struct eventpoll *ep, int error, pwake = 0; unsigned long flags; struct epitem *epi, *nepi; - struct list_head txlist; - - INIT_LIST_HEAD(&txlist); + LIST_HEAD(txlist); /* * We need to lock this because we could be hit by @@ -473,8 +471,7 @@ static int ep_scan_ready_list(struct eventpoll *ep, * in a lockless way. */ spin_lock_irqsave(&ep->lock, flags); - list_splice(&ep->rdllist, &txlist); - INIT_LIST_HEAD(&ep->rdllist); + list_splice_init(&ep->rdllist, &txlist); ep->ovflist = NULL; spin_unlock_irqrestore(&ep->lock, flags); @@ -514,8 +511,8 @@ static int ep_scan_ready_list(struct eventpoll *ep, if (!list_empty(&ep->rdllist)) { /* - * Wake up (if active) both the eventpoll wait list and the ->poll() - * wait list (delayed after we release the lock). + * Wake up (if active) both the eventpoll wait list and + * the ->poll() wait list (delayed after we release the lock). */ if (waitqueue_active(&ep->wq)) wake_up_locked(&ep->wq); @@ -632,7 +629,8 @@ static int ep_eventpoll_release(struct inode *inode, struct file *file) return 0; } -static int ep_read_events_proc(struct eventpoll *ep, struct list_head *head, void *priv) +static int ep_read_events_proc(struct eventpoll *ep, struct list_head *head, + void *priv) { struct epitem *epi, *tmp; @@ -640,13 +638,14 @@ static int ep_read_events_proc(struct eventpoll *ep, struct list_head *head, voi if (epi->ffd.file->f_op->poll(epi->ffd.file, NULL) & epi->event.events) return POLLIN | POLLRDNORM; - else + else { /* * Item has been dropped into the ready list by the poll * callback, but it's not actually ready, as far as * caller requested events goes. We can remove it here. */ list_del_init(&epi->rdllink); + } } return 0; @@ -674,7 +673,7 @@ static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait) pollflags = ep_call_nested(&poll_readywalk_ncalls, EP_MAX_NESTS, ep_poll_readyevents_proc, ep, ep); - return pollflags != -1 ? pollflags: 0; + return pollflags != -1 ? pollflags : 0; } /* File callbacks that implement the eventpoll file behaviour */ @@ -872,9 +871,10 @@ static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead, add_wait_queue(whead, &pwq->wait); list_add_tail(&pwq->llink, &epi->pwqlist); epi->nwait++; - } else + } else { /* We have to signal that an error occurred */ epi->nwait = -1; + } } static void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi) @@ -1055,62 +1055,65 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even return 0; } -static int ep_send_events_proc(struct eventpoll *ep, struct list_head *head, void *priv) +static int ep_send_events_proc(struct eventpoll *ep, struct list_head *head, + void *priv) { struct ep_send_events_data *esed = priv; int eventcnt; - unsigned int revents; + unsigned int revents; struct epitem *epi; struct epoll_event __user *uevent; - /* + /* * We can loop without lock because we are passed a task private list. * Items cannot vanish during the loop because ep_scan_ready_list() is * holding "mtx" during this call. - */ + */ for (eventcnt = 0, uevent = esed->events; !list_empty(head) && eventcnt < esed->maxevents;) { epi = list_first_entry(head, struct epitem, rdllink); list_del_init(&epi->rdllink); - revents = epi->ffd.file->f_op->poll(epi->ffd.file, NULL) & - epi->event.events; + revents = epi->ffd.file->f_op->poll(epi->ffd.file, NULL) & + epi->event.events; - /* + /* * If the event mask intersect the caller-requested one, * deliver the event to userspace. Again, ep_scan_ready_list() * is holding "mtx", so no operations coming from userspace * can change the item. - */ - if (revents) { + */ + if (revents) { if (__put_user(revents, &uevent->events) || __put_user(epi->event.data, &uevent->data)) - return eventcnt ? eventcnt: -EFAULT; - eventcnt++; + return eventcnt ? eventcnt : -EFAULT; + eventcnt++; uevent++; - if (epi->event.events & EPOLLONESHOT) - epi->event.events &= EP_PRIVATE_BITS; - else if (!(epi->event.events & EPOLLET)) - /* - * If this file has been added with Level Trigger - * mode, we need to insert back inside the ready - * list, so that the next call to epoll_wait() - * will check again the events availability. - * At this point, noone can insert into ep->rdllist - * besides us. The epoll_ctl() callers are locked - * out by ep_scan_ready_list() holding "mtx" and - * the poll callback will queue them in ep->ovflist. - */ - list_add_tail(&epi->rdllink, &ep->rdllist); - } - } + if (epi->event.events & EPOLLONESHOT) + epi->event.events &= EP_PRIVATE_BITS; + else if (!(epi->event.events & EPOLLET)) { + /* + * If this file has been added with Level + * Trigger mode, we need to insert back inside + * the ready list, so that the next call to + * epoll_wait() will check again the events + * availability. At this point, noone can insert + * into ep->rdllist besides us. The epoll_ctl() + * callers are locked out by + * ep_scan_ready_list() holding "mtx" and the + * poll callback will queue them in ep->ovflist. + */ + list_add_tail(&epi->rdllink, &ep->rdllist); + } + } + } return eventcnt; } -static int ep_send_events(struct eventpoll *ep, struct epoll_event __user *events, - int maxevents) +static int ep_send_events(struct eventpoll *ep, + struct epoll_event __user *events, int maxevents) { struct ep_send_events_data esed; @@ -1194,40 +1197,41 @@ retry: */ SYSCALL_DEFINE1(epoll_create1, int, flags) { - int error; - struct eventpoll *ep = NULL; + int error, fd = -1; + struct eventpoll *ep; /* Check the EPOLL_* constant for consistency. */ BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC); + if (flags & ~EPOLL_CLOEXEC) + return -EINVAL; + DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d)\n", current, flags)); - error = -EINVAL; - if (flags & ~EPOLL_CLOEXEC) - goto error_return; - /* - * Create the internal data structure ("struct eventpoll"). + * Create the internal data structure ( "struct eventpoll" ). */ error = ep_alloc(&ep); - if (error < 0) + if (error < 0) { + fd = error; goto error_return; + } /* * Creates all the items needed to setup an eventpoll file. That is, * a file structure and a free file descriptor. */ - error = anon_inode_getfd("[eventpoll]", &eventpoll_fops, ep, - flags & O_CLOEXEC); - if (error < 0) + fd = anon_inode_getfd("[eventpoll]", &eventpoll_fops, ep, + flags & O_CLOEXEC); + if (fd < 0) ep_free(ep); error_return: DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d) = %d\n", - current, flags, error)); + current, flags, fd)); - return error; + return fd; } SYSCALL_DEFINE1(epoll_create, int, size) From bb57c3edcd2fc51d95914c39448f36e43af9d6af Mon Sep 17 00:00:00 2001 From: Davide Libenzi Date: Tue, 31 Mar 2009 15:24:12 -0700 Subject: [PATCH 365/478] epoll: remove debugging code Remove debugging code from epoll. There's no need for it to be included into mainline code. Signed-off-by: Davide Libenzi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/eventpoll.c | 78 +++++++------------------------------------------- 1 file changed, 11 insertions(+), 67 deletions(-) diff --git a/fs/eventpoll.c b/fs/eventpoll.c index e24d137c93b6..205a1e1c77c7 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -71,24 +71,6 @@ * a better scalability. */ -#define DEBUG_EPOLL 0 - -#if DEBUG_EPOLL > 0 -#define DPRINTK(x) printk x -#define DNPRINTK(n, x) do { if ((n) <= DEBUG_EPOLL) printk x; } while (0) -#else /* #if DEBUG_EPOLL > 0 */ -#define DPRINTK(x) (void) 0 -#define DNPRINTK(n, x) (void) 0 -#endif /* #if DEBUG_EPOLL > 0 */ - -#define DEBUG_EPI 0 - -#if DEBUG_EPI != 0 -#define EPI_SLAB_DEBUG (SLAB_DEBUG_FREE | SLAB_RED_ZONE /* | SLAB_POISON */) -#else /* #if DEBUG_EPI != 0 */ -#define EPI_SLAB_DEBUG 0 -#endif /* #if DEBUG_EPI != 0 */ - /* Epoll private bits inside the event mask */ #define EP_PRIVATE_BITS (EPOLLONESHOT | EPOLLET) @@ -567,9 +549,6 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi) atomic_dec(&ep->user->epoll_watches); - DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_remove(%p, %p)\n", - current, ep, file)); - return 0; } @@ -625,7 +604,6 @@ static int ep_eventpoll_release(struct inode *inode, struct file *file) if (ep) ep_free(ep); - DNPRINTK(3, (KERN_INFO "[%p] eventpoll: close() ep=%p\n", current, ep)); return 0; } @@ -750,8 +728,6 @@ static int ep_alloc(struct eventpoll **pep) *pep = ep; - DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_alloc() ep=%p\n", - current, ep)); return 0; free_uid: @@ -785,9 +761,6 @@ static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd) } } - DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_find(%p) -> %p\n", - current, file, epir)); - return epir; } @@ -803,9 +776,6 @@ static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *k struct epitem *epi = ep_item_from_wait(wait); struct eventpoll *ep = epi->ep; - DNPRINTK(3, (KERN_INFO "[%p] eventpoll: poll_callback(%p) epi=%p ep=%p\n", - current, epi->ffd.file, epi, ep)); - spin_lock_irqsave(&ep->lock, flags); /* @@ -978,9 +948,6 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event, if (pwake) ep_poll_safewake(&ep->poll_wait); - DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_insert(%p, %p, %d)\n", - current, ep, tfile, fd)); - return 0; error_unregister: @@ -1197,41 +1164,30 @@ retry: */ SYSCALL_DEFINE1(epoll_create1, int, flags) { - int error, fd = -1; - struct eventpoll *ep; + int error; + struct eventpoll *ep = NULL; /* Check the EPOLL_* constant for consistency. */ BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC); if (flags & ~EPOLL_CLOEXEC) return -EINVAL; - - DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d)\n", - current, flags)); - /* - * Create the internal data structure ( "struct eventpoll" ). + * Create the internal data structure ("struct eventpoll"). */ error = ep_alloc(&ep); - if (error < 0) { - fd = error; - goto error_return; - } - + if (error < 0) + return error; /* * Creates all the items needed to setup an eventpoll file. That is, * a file structure and a free file descriptor. */ - fd = anon_inode_getfd("[eventpoll]", &eventpoll_fops, ep, - flags & O_CLOEXEC); - if (fd < 0) + error = anon_inode_getfd("[eventpoll]", &eventpoll_fops, ep, + flags & O_CLOEXEC); + if (error < 0) ep_free(ep); -error_return: - DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d) = %d\n", - current, flags, fd)); - - return fd; + return error; } SYSCALL_DEFINE1(epoll_create, int, size) @@ -1256,9 +1212,6 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, struct epitem *epi; struct epoll_event epds; - DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_ctl(%d, %d, %d, %p)\n", - current, epfd, op, fd, event)); - error = -EFAULT; if (ep_op_has_event(op) && copy_from_user(&epds, event, sizeof(struct epoll_event))) @@ -1335,8 +1288,6 @@ error_tgt_fput: error_fput: fput(file); error_return: - DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_ctl(%d, %d, %d, %p) = %d\n", - current, epfd, op, fd, event, error)); return error; } @@ -1352,9 +1303,6 @@ SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events, struct file *file; struct eventpoll *ep; - DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_wait(%d, %p, %d, %d)\n", - current, epfd, events, maxevents, timeout)); - /* The maximum number of event must be greater than zero */ if (maxevents <= 0 || maxevents > EP_MAX_EVENTS) return -EINVAL; @@ -1391,8 +1339,6 @@ SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events, error_fput: fput(file); error_return: - DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_wait(%d, %p, %d, %d) = %d\n", - current, epfd, events, maxevents, timeout, error)); return error; } @@ -1464,13 +1410,11 @@ static int __init eventpoll_init(void) /* Allocates slab cache used to allocate "struct epitem" items */ epi_cache = kmem_cache_create("eventpoll_epi", sizeof(struct epitem), - 0, SLAB_HWCACHE_ALIGN|EPI_SLAB_DEBUG|SLAB_PANIC, - NULL); + 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); /* Allocates slab cache used to allocate "struct eppoll_entry" */ pwq_cache = kmem_cache_create("eventpoll_pwq", - sizeof(struct eppoll_entry), 0, - EPI_SLAB_DEBUG|SLAB_PANIC, NULL); + sizeof(struct eppoll_entry), 0, SLAB_PANIC, NULL); return 0; } From abff55cee1039b5a3b96f7a5eb6e65b9f247a274 Mon Sep 17 00:00:00 2001 From: Tony Battersby Date: Tue, 31 Mar 2009 15:24:13 -0700 Subject: [PATCH 366/478] epoll: don't use current in irq context ep_call_nested() (formerly ep_poll_safewake()) uses "current" (without dereferencing it) to detect callback recursion, but it may be called from irq context where the use of current is generally discouraged. It would be better to use get_cpu() and put_cpu() to detect the callback recursion. Signed-off-by: Tony Battersby Acked-by: Davide Libenzi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/eventpoll.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 205a1e1c77c7..db4365f8a75c 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -97,8 +97,8 @@ struct epoll_filefd { */ struct nested_call_node { struct list_head llink; - struct task_struct *task; void *cookie; + int cpu; }; /* @@ -327,7 +327,7 @@ static int ep_call_nested(struct nested_calls *ncalls, int max_nests, { int error, call_nests = 0; unsigned long flags; - struct task_struct *this_task = current; + int this_cpu = get_cpu(); struct list_head *lsthead = &ncalls->tasks_call_list; struct nested_call_node *tncur; struct nested_call_node tnode; @@ -340,20 +340,19 @@ static int ep_call_nested(struct nested_calls *ncalls, int max_nests, * very much limited. */ list_for_each_entry(tncur, lsthead, llink) { - if (tncur->task == this_task && + if (tncur->cpu == this_cpu && (tncur->cookie == cookie || ++call_nests > max_nests)) { /* * Ops ... loop detected or maximum nest level reached. * We abort this wake by breaking the cycle itself. */ - spin_unlock_irqrestore(&ncalls->lock, flags); - - return -1; + error = -1; + goto out_unlock; } } /* Add the current task and cookie to the list */ - tnode.task = this_task; + tnode.cpu = this_cpu; tnode.cookie = cookie; list_add(&tnode.llink, lsthead); @@ -365,8 +364,10 @@ static int ep_call_nested(struct nested_calls *ncalls, int max_nests, /* Remove the current task from the list */ spin_lock_irqsave(&ncalls->lock, flags); list_del(&tnode.llink); + out_unlock: spin_unlock_irqrestore(&ncalls->lock, flags); + put_cpu(); return error; } From d0305882825784e74f68a56eee6c3a812a99f235 Mon Sep 17 00:00:00 2001 From: Tony Battersby Date: Tue, 31 Mar 2009 15:24:14 -0700 Subject: [PATCH 367/478] epoll: remember the event if epoll_wait returns -EFAULT If epoll_wait returns -EFAULT, the event that was being returned when the fault was encountered will be forgotten. This is not a big deal since EFAULT will happen only if a buggy userspace program passes in a bad address, in which case what happens later usually doesn't matter. However, it is easy to remember the event for later, and this patch makes a simple change to do that. Signed-off-by: Tony Battersby Acked-by: Davide Libenzi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/eventpoll.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/eventpoll.c b/fs/eventpoll.c index db4365f8a75c..c806a0c4383c 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -1054,8 +1054,10 @@ static int ep_send_events_proc(struct eventpoll *ep, struct list_head *head, */ if (revents) { if (__put_user(revents, &uevent->events) || - __put_user(epi->event.data, &uevent->data)) + __put_user(epi->event.data, &uevent->data)) { + list_add(&epi->rdllink, head); return eventcnt ? eventcnt : -EFAULT; + } eventcnt++; uevent++; if (epi->event.events & EPOLLONESHOT) From d1bc90dd5d037079f96b3327f943eb6ae8ef7491 Mon Sep 17 00:00:00 2001 From: Tony Battersby Date: Tue, 31 Mar 2009 15:24:15 -0700 Subject: [PATCH 368/478] epoll: remove unnecessary xchg xchg in ep_unregister_pollwait() is unnecessary because it is protected by either epmutex or ep->mtx (the same protection as ep_remove()). If xchg was necessary, it would be insufficient to protect against problems: if multiple concurrent calls to ep_unregister_pollwait() were possible then a second caller that returns without doing anything because nwait == 0 could return before the waitqueues are removed by the first caller, which looks like it could lead to problematic races with ep_poll_callback(). So remove xchg and add comments about the locking. Signed-off-by: Tony Battersby Acked-by: Davide Libenzi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/eventpoll.c | 22 ++++++++-------------- 1 file changed, 8 insertions(+), 14 deletions(-) diff --git a/fs/eventpoll.c b/fs/eventpoll.c index c806a0c4383c..64c55037c13b 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -394,27 +394,21 @@ static void ep_poll_safewake(wait_queue_head_t *wq) } /* - * This function unregister poll callbacks from the associated file descriptor. - * Since this must be called without holding "ep->lock" the atomic exchange trick - * will protect us from multiple unregister. + * This function unregisters poll callbacks from the associated file + * descriptor. Must be called with "mtx" held (or "epmutex" if called from + * ep_free). */ static void ep_unregister_pollwait(struct eventpoll *ep, struct epitem *epi) { - int nwait; struct list_head *lsthead = &epi->pwqlist; struct eppoll_entry *pwq; - /* This is called without locks, so we need the atomic exchange */ - nwait = xchg(&epi->nwait, 0); + while (!list_empty(lsthead)) { + pwq = list_first_entry(lsthead, struct eppoll_entry, llink); - if (nwait) { - while (!list_empty(lsthead)) { - pwq = list_first_entry(lsthead, struct eppoll_entry, llink); - - list_del_init(&pwq->llink); - remove_wait_queue(pwq->whead, &pwq->wait); - kmem_cache_free(pwq_cache, pwq); - } + list_del(&pwq->llink); + remove_wait_queue(pwq->whead, &pwq->wait); + kmem_cache_free(pwq_cache, pwq); } } From e057e15ff66a620eda4c407486cbb8f8fbb7d878 Mon Sep 17 00:00:00 2001 From: Tony Battersby Date: Tue, 31 Mar 2009 15:24:15 -0700 Subject: [PATCH 369/478] epoll: clean up ep_modify ep_modify() doesn't need to set event.data from within the ep->lock spinlock as the comment suggests. The only place event.data is used is ep_send_events_proc(), and this is protected by ep->mtx instead of ep->lock. Also update the comment for mutex_lock() at the top of ep_scan_ready_list(), which mentions epoll_ctl(EPOLL_CTL_DEL) but not epoll_ctl(EPOLL_CTL_MOD). ep_modify() can also use spin_lock_irq() instead of spin_lock_irqsave(). Signed-off-by: Tony Battersby Acked-by: Davide Libenzi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/eventpoll.c | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 64c55037c13b..744377c01869 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -435,7 +435,7 @@ static int ep_scan_ready_list(struct eventpoll *ep, /* * We need to lock this because we could be hit by - * eventpoll_release_file() and epoll_ctl(EPOLL_CTL_DEL). + * eventpoll_release_file() and epoll_ctl(). */ mutex_lock(&ep->mtx); @@ -972,15 +972,14 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even { int pwake = 0; unsigned int revents; - unsigned long flags; /* - * Set the new event interest mask before calling f_op->poll(), otherwise - * a potential race might occur. In fact if we do this operation inside - * the lock, an event might happen between the f_op->poll() call and the - * new event set registering. + * Set the new event interest mask before calling f_op->poll(); + * otherwise we might miss an event that happens between the + * f_op->poll() call and the new event set registering. */ epi->event.events = event->events; + epi->event.data = event->data; /* protected by mtx */ /* * Get current event bits. We can safely use the file* here because @@ -988,16 +987,12 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even */ revents = epi->ffd.file->f_op->poll(epi->ffd.file, NULL); - spin_lock_irqsave(&ep->lock, flags); - - /* Copy the data member from inside the lock */ - epi->event.data = event->data; - /* * If the item is "hot" and it is not registered inside the ready * list, push it inside. */ if (revents & event->events) { + spin_lock_irq(&ep->lock); if (!ep_is_linked(&epi->rdllink)) { list_add_tail(&epi->rdllink, &ep->rdllist); @@ -1007,8 +1002,8 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even if (waitqueue_active(&ep->poll_wait)) pwake++; } + spin_unlock_irq(&ep->lock); } - spin_unlock_irqrestore(&ep->lock, flags); /* We have to call this outside the lock */ if (pwake) From 4f0989dbfa8d18dd17c32120aac1eb3e906a62a2 Mon Sep 17 00:00:00 2001 From: Tony Battersby Date: Tue, 31 Mar 2009 15:24:16 -0700 Subject: [PATCH 370/478] epoll: use real type instead of void * eventpoll.c uses void * in one place for no obvious reason; change it to use the real type instead. Signed-off-by: Tony Battersby Acked-by: Davide Libenzi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/eventpoll.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 744377c01869..336fdb8ed47b 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -192,7 +192,7 @@ struct eppoll_entry { struct list_head llink; /* The "base" pointer is set to the container "struct epitem" */ - void *base; + struct epitem *base; /* * Wait queue item that will be linked to the target file wait From bcd0b235bf3808dec5115c381cd55568f63b85f0 Mon Sep 17 00:00:00 2001 From: Davide Libenzi Date: Tue, 31 Mar 2009 15:24:18 -0700 Subject: [PATCH 371/478] eventfd: improve support for semaphore-like behavior People started using eventfd in a semaphore-like way where before they were using pipes. That is, counter-based resource access. Where a "wait()" returns immediately by decrementing the counter by one, if counter is greater than zero. Otherwise will wait. And where a "post(count)" will add count to the counter releasing the appropriate amount of waiters. If eventfd the "post" (write) part is fine, while the "wait" (read) does not dequeue 1, but the whole counter value. The problem with eventfd is that a read() on the fd returns and wipes the whole counter, making the use of it as semaphore a little bit more cumbersome. You can do a read() followed by a write() of COUNTER-1, but IMO it's pretty easy and cheap to make this work w/out extra steps. This patch introduces a new eventfd flag that tells eventfd to only dequeue 1 from the counter, allowing simple read/write to make it behave like a semaphore. Simple test here: http://www.xmailserver.org/eventfd-sem.c To be back-compatible with earlier kernels, userspace applications should probe for the availability of this feature via #ifdef EFD_SEMAPHORE fd = eventfd2 (CNT, EFD_SEMAPHORE); if (fd == -1 && errno == EINVAL) #else #endif Signed-off-by: Davide Libenzi Cc: Tested-by: Michael Kerrisk Cc: Ulrich Drepper Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/eventfd.c | 20 +++++++++++--------- include/linux/eventfd.h | 12 +++++++++++- 2 files changed, 22 insertions(+), 10 deletions(-) diff --git a/fs/eventfd.c b/fs/eventfd.c index 5de2c2db3aa2..91c0829a7035 100644 --- a/fs/eventfd.c +++ b/fs/eventfd.c @@ -28,6 +28,7 @@ struct eventfd_ctx { * issue a wakeup. */ __u64 count; + unsigned int flags; }; /* @@ -87,22 +88,20 @@ static ssize_t eventfd_read(struct file *file, char __user *buf, size_t count, { struct eventfd_ctx *ctx = file->private_data; ssize_t res; - __u64 ucnt; + __u64 ucnt = 0; DECLARE_WAITQUEUE(wait, current); if (count < sizeof(ucnt)) return -EINVAL; spin_lock_irq(&ctx->wqh.lock); res = -EAGAIN; - ucnt = ctx->count; - if (ucnt > 0) + if (ctx->count > 0) res = sizeof(ucnt); else if (!(file->f_flags & O_NONBLOCK)) { __add_wait_queue(&ctx->wqh, &wait); for (res = 0;;) { set_current_state(TASK_INTERRUPTIBLE); if (ctx->count > 0) { - ucnt = ctx->count; res = sizeof(ucnt); break; } @@ -117,8 +116,9 @@ static ssize_t eventfd_read(struct file *file, char __user *buf, size_t count, __remove_wait_queue(&ctx->wqh, &wait); __set_current_state(TASK_RUNNING); } - if (res > 0) { - ctx->count = 0; + if (likely(res > 0)) { + ucnt = (ctx->flags & EFD_SEMAPHORE) ? 1 : ctx->count; + ctx->count -= ucnt; if (waitqueue_active(&ctx->wqh)) wake_up_locked(&ctx->wqh); } @@ -166,7 +166,7 @@ static ssize_t eventfd_write(struct file *file, const char __user *buf, size_t c __remove_wait_queue(&ctx->wqh, &wait); __set_current_state(TASK_RUNNING); } - if (res > 0) { + if (likely(res > 0)) { ctx->count += ucnt; if (waitqueue_active(&ctx->wqh)) wake_up_locked(&ctx->wqh); @@ -207,7 +207,7 @@ SYSCALL_DEFINE2(eventfd2, unsigned int, count, int, flags) BUILD_BUG_ON(EFD_CLOEXEC != O_CLOEXEC); BUILD_BUG_ON(EFD_NONBLOCK != O_NONBLOCK); - if (flags & ~(EFD_CLOEXEC | EFD_NONBLOCK)) + if (flags & ~EFD_FLAGS_SET) return -EINVAL; ctx = kmalloc(sizeof(*ctx), GFP_KERNEL); @@ -216,13 +216,14 @@ SYSCALL_DEFINE2(eventfd2, unsigned int, count, int, flags) init_waitqueue_head(&ctx->wqh); ctx->count = count; + ctx->flags = flags; /* * When we call this, the initialization must be complete, since * anon_inode_getfd() will install the fd. */ fd = anon_inode_getfd("[eventfd]", &eventfd_fops, ctx, - flags & (O_CLOEXEC | O_NONBLOCK)); + flags & EFD_SHARED_FCNTL_FLAGS); if (fd < 0) kfree(ctx); return fd; @@ -232,3 +233,4 @@ SYSCALL_DEFINE1(eventfd, unsigned int, count) { return sys_eventfd2(count, 0); } + diff --git a/include/linux/eventfd.h b/include/linux/eventfd.h index a667637b54e3..f45a8ae5f828 100644 --- a/include/linux/eventfd.h +++ b/include/linux/eventfd.h @@ -13,10 +13,20 @@ /* For O_CLOEXEC and O_NONBLOCK */ #include -/* Flags for eventfd2. */ +/* + * CAREFUL: Check include/asm-generic/fcntl.h when defining + * new flags, since they might collide with O_* ones. We want + * to re-use O_* flags that couldn't possibly have a meaning + * from eventfd, in order to leave a free define-space for + * shared O_* flags. + */ +#define EFD_SEMAPHORE (1 << 0) #define EFD_CLOEXEC O_CLOEXEC #define EFD_NONBLOCK O_NONBLOCK +#define EFD_SHARED_FCNTL_FLAGS (O_CLOEXEC | O_NONBLOCK) +#define EFD_FLAGS_SET (EFD_SHARED_FCNTL_FLAGS | EFD_SEMAPHORE) + struct file *eventfd_fget(int fd); int eventfd_signal(struct file *file, int n); From 4ede816ac36e027db5fe0051ad9c73f76db63772 Mon Sep 17 00:00:00 2001 From: Davide Libenzi Date: Tue, 31 Mar 2009 15:24:20 -0700 Subject: [PATCH 372/478] epoll keyed wakeups: add __wake_up_locked_key() and __wake_up_sync_key() This patchset introduces wakeup hints for some of the most popular (from epoll POV) devices, so that epoll code can avoid spurious wakeups on its waiters. The problem with epoll is that the callback-based wakeups do not, ATM, carry any information about the events the wakeup is related to. So the only choice epoll has (not being able to call f_op->poll() from inside the callback), is to add the file* to a ready-list and resolve the real events later on, at epoll_wait() (or its own f_op->poll()) time. This can cause spurious wakeups, since the wake_up() itself might be for an event the caller is not interested into. The rate of these spurious wakeup can be pretty high in case of many network sockets being monitored. By allowing devices to report the events the wakeups refer to (at least the two major classes - POLLIN/POLLOUT), we are able to spare useless wakeups by proper handling inside the epoll's poll callback. Epoll will have in any case to call f_op->poll() on the file* later on, since the change to be done in order to have the full event set sent via wakeup, is too invasive for the way our f_op->poll() system works (the full event set is calculated inside the poll function - there are too many of them to even start thinking the change - also poll/select would need change too). Epoll is changed in a way that both devices which send event hints, and the ones that don't, are correctly handled. The former will gain some efficiency though. As a general rule for devices, would be to add an event mask by using key-aware wakeup macros, when making up poll wait queues. I tested it (together with the epoll's poll fix patch Andrew has in -mm) and wakeups for the supported devices are correctly filtered. Test program available here: http://www.xmailserver.org/epoll_test.c This patch: Nothing revolutionary here. Just using the available "key" that our wakeup core already support. The __wake_up_locked_key() was no brainer, since both __wake_up_locked() and __wake_up_locked_key() are thin wrappers around __wake_up_common(). The __wake_up_sync() function had a body, so the choice was between borrowing the body for __wake_up_sync_key() and calling it from __wake_up_sync(), or make an inline and calling it from both. I chose the former since in most archs it all resolves to "mov $0, REG; jmp ADDR". Signed-off-by: Davide Libenzi Cc: Alan Cox Cc: Ingo Molnar Cc: David Miller Cc: William Lee Irwin III Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/wait.h | 7 +++++-- kernel/sched.c | 23 +++++++++++++++++++---- 2 files changed, 24 insertions(+), 6 deletions(-) diff --git a/include/linux/wait.h b/include/linux/wait.h index a210ede73b56..0d2eeb03a718 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -135,8 +135,11 @@ static inline void __remove_wait_queue(wait_queue_head_t *head, void __wake_up_common(wait_queue_head_t *q, unsigned int mode, int nr_exclusive, int sync, void *key); void __wake_up(wait_queue_head_t *q, unsigned int mode, int nr, void *key); -extern void __wake_up_locked(wait_queue_head_t *q, unsigned int mode); -extern void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr); +void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key); +void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, int nr, + void *key); +void __wake_up_locked(wait_queue_head_t *q, unsigned int mode); +void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr); void __wake_up_bit(wait_queue_head_t *, void *, int); int __wait_on_bit(wait_queue_head_t *, struct wait_bit_queue *, int (*)(void *), unsigned); int __wait_on_bit_lock(wait_queue_head_t *, struct wait_bit_queue *, int (*)(void *), unsigned); diff --git a/kernel/sched.c b/kernel/sched.c index 196d48babbef..73513f4e19df 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5196,11 +5196,17 @@ void __wake_up_locked(wait_queue_head_t *q, unsigned int mode) __wake_up_common(q, mode, 1, 0, NULL); } +void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key) +{ + __wake_up_common(q, mode, 1, 0, key); +} + /** - * __wake_up_sync - wake up threads blocked on a waitqueue. + * __wake_up_sync_key - wake up threads blocked on a waitqueue. * @q: the waitqueue * @mode: which threads * @nr_exclusive: how many wake-one or wake-many threads to wake up + * @key: opaque value to be passed to wakeup targets * * The sync wakeup differs that the waker knows that it will schedule * away soon, so while the target thread will be woken up, it will not @@ -5209,8 +5215,8 @@ void __wake_up_locked(wait_queue_head_t *q, unsigned int mode) * * On UP it can prevent extra preemption. */ -void -__wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) +void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, + int nr_exclusive, void *key) { unsigned long flags; int sync = 1; @@ -5222,9 +5228,18 @@ __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) sync = 0; spin_lock_irqsave(&q->lock, flags); - __wake_up_common(q, mode, nr_exclusive, sync, NULL); + __wake_up_common(q, mode, nr_exclusive, sync, key); spin_unlock_irqrestore(&q->lock, flags); } +EXPORT_SYMBOL_GPL(__wake_up_sync_key); + +/* + * __wake_up_sync - see __wake_up_sync_key() + */ +void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) +{ + __wake_up_sync_key(q, mode, nr_exclusive, NULL); +} EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ /** From c0da37753695e010776ccf2200a5731e0f88a9f3 Mon Sep 17 00:00:00 2001 From: Davide Libenzi Date: Tue, 31 Mar 2009 15:24:20 -0700 Subject: [PATCH 373/478] epoll keyed wakeups: introduce new *_poll() wakeup macros Introduce new wakeup macros that allow passing an event mask to the wakeup targets. They exactly mimic their non-_poll() counterpart, with the added event mask passing capability. I did add only the ones currently requested, avoiding the _nr() and _all() for the moment. Signed-off-by: Davide Libenzi Cc: Alan Cox Cc: Ingo Molnar Cc: David Miller Cc: William Lee Irwin III Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/wait.h | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/include/linux/wait.h b/include/linux/wait.h index 0d2eeb03a718..5d631c17eaee 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -158,21 +158,17 @@ wait_queue_head_t *bit_waitqueue(void *, int); #define wake_up_interruptible_all(x) __wake_up(x, TASK_INTERRUPTIBLE, 0, NULL) #define wake_up_interruptible_sync(x) __wake_up_sync((x), TASK_INTERRUPTIBLE, 1) -#ifdef CONFIG_DEBUG_LOCK_ALLOC /* - * macro to avoid include hell + * Wakeup macros to be used to report events to the targets. */ -#define wake_up_nested(x, s) \ -do { \ - unsigned long flags; \ - \ - spin_lock_irqsave_nested(&(x)->lock, flags, (s)); \ - wake_up_locked(x); \ - spin_unlock_irqrestore(&(x)->lock, flags); \ -} while (0) -#else -#define wake_up_nested(x, s) wake_up(x) -#endif +#define wake_up_poll(x, m) \ + __wake_up(x, TASK_NORMAL, 1, (void *) (m)) +#define wake_up_locked_poll(x, m) \ + __wake_up_locked_key((x), TASK_NORMAL, (void *) (m)) +#define wake_up_interruptible_poll(x, m) \ + __wake_up(x, TASK_INTERRUPTIBLE, 1, (void *) (m)) +#define wake_up_interruptible_sync_poll(x, m) \ + __wake_up_sync_key((x), TASK_INTERRUPTIBLE, 1, (void *) (m)) #define __wait_event(wq, condition) \ do { \ From 37e5540b3c9d838eb20f2ca8ea2eb8072271e403 Mon Sep 17 00:00:00 2001 From: Davide Libenzi Date: Tue, 31 Mar 2009 15:24:21 -0700 Subject: [PATCH 374/478] epoll keyed wakeups: make sockets use keyed wakeups Add support for event-aware wakeups to the sockets code. Events are delivered to the wakeup target, so that epoll can avoid spurious wakeups for non-interesting events. Signed-off-by: Davide Libenzi Acked-by: Alan Cox Cc: Ingo Molnar Cc: David Miller Cc: William Lee Irwin III Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- net/core/sock.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/net/core/sock.c b/net/core/sock.c index 0620046e4eba..7dbf3ffb35cc 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1677,7 +1677,7 @@ static void sock_def_error_report(struct sock *sk) { read_lock(&sk->sk_callback_lock); if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) - wake_up_interruptible(sk->sk_sleep); + wake_up_interruptible_poll(sk->sk_sleep, POLLERR); sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR); read_unlock(&sk->sk_callback_lock); } @@ -1686,7 +1686,8 @@ static void sock_def_readable(struct sock *sk, int len) { read_lock(&sk->sk_callback_lock); if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) - wake_up_interruptible_sync(sk->sk_sleep); + wake_up_interruptible_sync_poll(sk->sk_sleep, POLLIN | + POLLRDNORM | POLLRDBAND); sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); read_unlock(&sk->sk_callback_lock); } @@ -1700,7 +1701,8 @@ static void sock_def_write_space(struct sock *sk) */ if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) { if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) - wake_up_interruptible_sync(sk->sk_sleep); + wake_up_interruptible_sync_poll(sk->sk_sleep, POLLOUT | + POLLWRNORM | POLLWRBAND); /* Should agree with poll, otherwise some programs break */ if (sock_writeable(sk)) From 2dfa4eeab0fc7e8633974f2770945311b31eedf6 Mon Sep 17 00:00:00 2001 From: Davide Libenzi Date: Tue, 31 Mar 2009 15:24:22 -0700 Subject: [PATCH 375/478] epoll keyed wakeups: teach epoll about hints coming with the wakeup key Use the events hint now sent by some devices, to avoid unnecessary wakeups for events that are of no interest for the caller. This code handles both devices that are sending keyed events, and the ones that are not (and event the ones that sometimes send events, and sometimes don't). [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Davide Libenzi Cc: Alan Cox Cc: Ingo Molnar Cc: David Miller Cc: William Lee Irwin III Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/eventpoll.c | 31 +++++++++++++++++++++++++++++-- 1 file changed, 29 insertions(+), 2 deletions(-) diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 336fdb8ed47b..a89f370fadb5 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -371,9 +371,28 @@ static int ep_call_nested(struct nested_calls *ncalls, int max_nests, return error; } +#ifdef CONFIG_DEBUG_LOCK_ALLOC +static inline void ep_wake_up_nested(wait_queue_head_t *wqueue, + unsigned long events, int subclass) +{ + unsigned long flags; + + spin_lock_irqsave_nested(&wqueue->lock, flags, subclass); + wake_up_locked_poll(wqueue, events); + spin_unlock_irqrestore(&wqueue->lock, flags); +} +#else +static inline void ep_wake_up_nested(wait_queue_head_t *wqueue, + unsigned long events, int subclass) +{ + wake_up_poll(wqueue, events); +} +#endif + static int ep_poll_wakeup_proc(void *priv, void *cookie, int call_nests) { - wake_up_nested((wait_queue_head_t *) cookie, 1 + call_nests); + ep_wake_up_nested((wait_queue_head_t *) cookie, POLLIN, + 1 + call_nests); return 0; } @@ -782,6 +801,15 @@ static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *k if (!(epi->event.events & ~EP_PRIVATE_BITS)) goto out_unlock; + /* + * Check the events coming with the callback. At this stage, not + * every device reports the events in the "key" parameter of the + * callback. We need to be able to handle both cases here, hence the + * test for "key" != NULL before the event match test. + */ + if (key && !((unsigned long) key & epi->event.events)) + goto out_unlock; + /* * If we are trasfering events to userspace, we can hold no locks * (because we're accessing user memory, and because of linux f_op->poll() @@ -1254,7 +1282,6 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, case EPOLL_CTL_ADD: if (!epi) { epds.events |= POLLERR | POLLHUP; - error = ep_insert(ep, &epds, tfile, fd); } else error = -EEXIST; From 395108880efff4a4ffa1ffa554477f7f5ba6a031 Mon Sep 17 00:00:00 2001 From: Davide Libenzi Date: Tue, 31 Mar 2009 15:24:23 -0700 Subject: [PATCH 376/478] epoll keyed wakeups: make eventfd use keyed wakeups Introduce keyed event wakeups inside the eventfd code. Signed-off-by: Davide Libenzi Cc: Alan Cox Cc: Ingo Molnar Cc: David Miller Cc: William Lee Irwin III Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/eventfd.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/eventfd.c b/fs/eventfd.c index 91c0829a7035..2a701d593d35 100644 --- a/fs/eventfd.c +++ b/fs/eventfd.c @@ -51,7 +51,7 @@ int eventfd_signal(struct file *file, int n) n = (int) (ULLONG_MAX - ctx->count); ctx->count += n; if (waitqueue_active(&ctx->wqh)) - wake_up_locked(&ctx->wqh); + wake_up_locked_poll(&ctx->wqh, POLLIN); spin_unlock_irqrestore(&ctx->wqh.lock, flags); return n; @@ -120,7 +120,7 @@ static ssize_t eventfd_read(struct file *file, char __user *buf, size_t count, ucnt = (ctx->flags & EFD_SEMAPHORE) ? 1 : ctx->count; ctx->count -= ucnt; if (waitqueue_active(&ctx->wqh)) - wake_up_locked(&ctx->wqh); + wake_up_locked_poll(&ctx->wqh, POLLOUT); } spin_unlock_irq(&ctx->wqh.lock); if (res > 0 && put_user(ucnt, (__u64 __user *) buf)) @@ -169,7 +169,7 @@ static ssize_t eventfd_write(struct file *file, const char __user *buf, size_t c if (likely(res > 0)) { ctx->count += ucnt; if (waitqueue_active(&ctx->wqh)) - wake_up_locked(&ctx->wqh); + wake_up_locked_poll(&ctx->wqh, POLLIN); } spin_unlock_irq(&ctx->wqh.lock); From 4b19449db074eec86ae31a96d3cdca4aa7f138ab Mon Sep 17 00:00:00 2001 From: Davide Libenzi Date: Tue, 31 Mar 2009 15:24:24 -0700 Subject: [PATCH 377/478] epoll keyed wakeups: make tty use keyed wakeups Introduce keyed event wakeups inside the TTY code. Signed-off-by: Davide Libenzi Cc: Alan Cox Cc: Ingo Molnar Cc: David Miller Cc: William Lee Irwin III Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/char/tty_io.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/drivers/char/tty_io.c b/drivers/char/tty_io.c index 224f271d8cbe..33dac94922a7 100644 --- a/drivers/char/tty_io.c +++ b/drivers/char/tty_io.c @@ -464,7 +464,7 @@ void tty_wakeup(struct tty_struct *tty) tty_ldisc_deref(ld); } } - wake_up_interruptible(&tty->write_wait); + wake_up_interruptible_poll(&tty->write_wait, POLLOUT); } EXPORT_SYMBOL_GPL(tty_wakeup); @@ -587,8 +587,8 @@ static void do_tty_hangup(struct work_struct *work) * FIXME: Once we trust the LDISC code better we can wait here for * ldisc completion and fix the driver call race */ - wake_up_interruptible(&tty->write_wait); - wake_up_interruptible(&tty->read_wait); + wake_up_interruptible_poll(&tty->write_wait, POLLOUT); + wake_up_interruptible_poll(&tty->read_wait, POLLIN); /* * Shutdown the current line discipline, and reset it to * N_TTY. @@ -879,7 +879,7 @@ void stop_tty(struct tty_struct *tty) if (tty->link && tty->link->packet) { tty->ctrl_status &= ~TIOCPKT_START; tty->ctrl_status |= TIOCPKT_STOP; - wake_up_interruptible(&tty->link->read_wait); + wake_up_interruptible_poll(&tty->link->read_wait, POLLIN); } spin_unlock_irqrestore(&tty->ctrl_lock, flags); if (tty->ops->stop) @@ -913,7 +913,7 @@ void start_tty(struct tty_struct *tty) if (tty->link && tty->link->packet) { tty->ctrl_status &= ~TIOCPKT_STOP; tty->ctrl_status |= TIOCPKT_START; - wake_up_interruptible(&tty->link->read_wait); + wake_up_interruptible_poll(&tty->link->read_wait, POLLIN); } spin_unlock_irqrestore(&tty->ctrl_lock, flags); if (tty->ops->start) @@ -970,7 +970,7 @@ static ssize_t tty_read(struct file *file, char __user *buf, size_t count, void tty_write_unlock(struct tty_struct *tty) { mutex_unlock(&tty->atomic_write_lock); - wake_up_interruptible(&tty->write_wait); + wake_up_interruptible_poll(&tty->write_wait, POLLOUT); } int tty_write_lock(struct tty_struct *tty, int ndelay) @@ -1623,21 +1623,21 @@ void tty_release_dev(struct file *filp) if (tty_closing) { if (waitqueue_active(&tty->read_wait)) { - wake_up(&tty->read_wait); + wake_up_poll(&tty->read_wait, POLLIN); do_sleep++; } if (waitqueue_active(&tty->write_wait)) { - wake_up(&tty->write_wait); + wake_up_poll(&tty->write_wait, POLLOUT); do_sleep++; } } if (o_tty_closing) { if (waitqueue_active(&o_tty->read_wait)) { - wake_up(&o_tty->read_wait); + wake_up_poll(&o_tty->read_wait, POLLIN); do_sleep++; } if (waitqueue_active(&o_tty->write_wait)) { - wake_up(&o_tty->write_wait); + wake_up_poll(&o_tty->write_wait, POLLOUT); do_sleep++; } } From 2b872903c5d66bccdf4306a35e3e94d4da8555d3 Mon Sep 17 00:00:00 2001 From: Pavel Machek Date: Tue, 31 Mar 2009 15:24:25 -0700 Subject: [PATCH 378/478] hp_accel: small documentation updates Fix english in Documentation, add "how to test" description. Signed-off-by: Pavel Machek Cc: Eric Piel Cc: Vladimir Botka Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/hwmon/lis3lv02d | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/Documentation/hwmon/lis3lv02d b/Documentation/hwmon/lis3lv02d index 287f8c902656..effe949a7282 100644 --- a/Documentation/hwmon/lis3lv02d +++ b/Documentation/hwmon/lis3lv02d @@ -1,11 +1,11 @@ Kernel driver lis3lv02d -================== +======================= Supported chips: * STMicroelectronics LIS3LV02DL and LIS3LV02DQ -Author: +Authors: Yan Burman Eric Piel @@ -15,7 +15,7 @@ Description This driver provides support for the accelerometer found in various HP laptops sporting the feature officially called "HP Mobile Data -Protection System 3D" or "HP 3D DriveGuard". It detect automatically +Protection System 3D" or "HP 3D DriveGuard". It detects automatically laptops with this sensor. Known models (for now the HP 2133, nc6420, nc2510, nc8510, nc84x0, nw9440 and nx9420) will have their axis automatically oriented on standard way (eg: you can directly play @@ -27,7 +27,7 @@ position - 3D position that the accelerometer reports. Format: "(x,y,z)" calibrate - read: values (x, y, z) that are used as the base for input class device operation. write: forces the base to be recalibrated with the current - position. + position. rate - reports the sampling rate of the accelerometer device in HZ This driver also provides an absolute input class device, allowing @@ -48,7 +48,7 @@ For better compatibility between the various laptops. The values reported by the accelerometer are converted into a "standard" organisation of the axes (aka "can play neverball out of the box"): * When the laptop is horizontal the position reported is about 0 for X and Y -and a positive value for Z + and a positive value for Z * If the left side is elevated, X increases (becomes positive) * If the front side (where the touchpad is) is elevated, Y decreases (becomes negative) @@ -59,3 +59,13 @@ email to the authors to add it to the database. When reporting a new laptop, please include the output of "dmidecode" plus the value of /sys/devices/platform/lis3lv02d/position in these four cases. +Q&A +--- + +Q: How do I safely simulate freefall? I have an HP "portable +workstation" which has about 3.5kg and a plastic case, so letting it +fall to the ground is out of question... + +A: The sensor is pretty sensitive, so your hands can do it. Lift it +into free space, follow the fall with your hands for like 10 +centimeters. That should be enough to trigger the detection. From be84cfc588b19f14764d78556dc7b630ee8c914c Mon Sep 17 00:00:00 2001 From: Pavel Machek Date: Tue, 31 Mar 2009 15:24:26 -0700 Subject: [PATCH 379/478] hp_accel: adev is poor name of exported symbol As Andrew noted, adev is pretty poor name for symbol being exported. Rename it to lis3. Signed-off-by: Pavel Machek Cc: Eric Piel Cc: Vladimir Botka Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/hwmon/hp_accel.c | 48 +++++------ drivers/hwmon/lis3lv02d.c | 172 ++++++++++++++++++++------------------ drivers/hwmon/lis3lv02d.h | 2 +- 3 files changed, 116 insertions(+), 106 deletions(-) diff --git a/drivers/hwmon/hp_accel.c b/drivers/hwmon/hp_accel.c index 29c83b5b9697..ee1ed4270df7 100644 --- a/drivers/hwmon/hp_accel.c +++ b/drivers/hwmon/hp_accel.c @@ -140,7 +140,7 @@ acpi_status lis3lv02d_acpi_write(acpi_handle handle, int reg, u8 val) static int lis3lv02d_dmi_matched(const struct dmi_system_id *dmi) { - adev.ac = *((struct axis_conversion *)dmi->driver_data); + lis3_dev.ac = *((struct axis_conversion *)dmi->driver_data); printk(KERN_INFO DRIVER_NAME ": hardware type %s found.\n", dmi->ident); return 1; @@ -214,7 +214,7 @@ static struct dmi_system_id lis3lv02d_dmi_ids[] = { static void hpled_set(struct delayed_led_classdev *led_cdev, enum led_brightness value) { - acpi_handle handle = adev.device->handle; + acpi_handle handle = lis3_dev.device->handle; unsigned long long ret; /* Not used when writing */ union acpi_object in_obj[1]; struct acpi_object_list args = { 1, in_obj }; @@ -254,7 +254,7 @@ static void lis3lv02d_enum_resources(struct acpi_device *device) acpi_status status; status = acpi_walk_resources(device->handle, METHOD_NAME__CRS, - lis3lv02d_get_resource, &adev.irq); + lis3lv02d_get_resource, &lis3_dev.irq); if (ACPI_FAILURE(status)) printk(KERN_DEBUG DRIVER_NAME ": Error getting resources\n"); } @@ -263,8 +263,8 @@ static s16 lis3lv02d_read_16(acpi_handle handle, int reg) { u8 lo, hi; - adev.read(handle, reg - 1, &lo); - adev.read(handle, reg, &hi); + lis3_dev.read(handle, reg - 1, &lo); + lis3_dev.read(handle, reg, &hi); /* In "12 bit right justified" mode, bit 6, bit 7, bit 8 = bit 5 */ return (s16)((hi << 8) | lo); } @@ -272,7 +272,7 @@ static s16 lis3lv02d_read_16(acpi_handle handle, int reg) static s16 lis3lv02d_read_8(acpi_handle handle, int reg) { s8 lo; - adev.read(handle, reg, &lo); + lis3_dev.read(handle, reg, &lo); return lo; } @@ -283,29 +283,29 @@ static int lis3lv02d_add(struct acpi_device *device) if (!device) return -EINVAL; - adev.device = device; - adev.init = lis3lv02d_acpi_init; - adev.read = lis3lv02d_acpi_read; - adev.write = lis3lv02d_acpi_write; + lis3_dev.device = device; + lis3_dev.init = lis3lv02d_acpi_init; + lis3_dev.read = lis3lv02d_acpi_read; + lis3_dev.write = lis3lv02d_acpi_write; strcpy(acpi_device_name(device), DRIVER_NAME); strcpy(acpi_device_class(device), ACPI_MDPS_CLASS); - device->driver_data = &adev; + device->driver_data = &lis3_dev; - lis3lv02d_acpi_read(device->handle, WHO_AM_I, &adev.whoami); - switch (adev.whoami) { + lis3lv02d_acpi_read(device->handle, WHO_AM_I, &lis3_dev.whoami); + switch (lis3_dev.whoami) { case LIS_DOUBLE_ID: printk(KERN_INFO DRIVER_NAME ": 2-byte sensor found\n"); - adev.read_data = lis3lv02d_read_16; - adev.mdps_max_val = 2048; + lis3_dev.read_data = lis3lv02d_read_16; + lis3_dev.mdps_max_val = 2048; break; case LIS_SINGLE_ID: printk(KERN_INFO DRIVER_NAME ": 1-byte sensor found\n"); - adev.read_data = lis3lv02d_read_8; - adev.mdps_max_val = 128; + lis3_dev.read_data = lis3lv02d_read_8; + lis3_dev.mdps_max_val = 128; break; default: printk(KERN_ERR DRIVER_NAME - ": unknown sensor type 0x%X\n", adev.whoami); + ": unknown sensor type 0x%X\n", lis3_dev.whoami); return -EINVAL; } @@ -313,7 +313,7 @@ static int lis3lv02d_add(struct acpi_device *device) if (dmi_check_system(lis3lv02d_dmi_ids) == 0) { printk(KERN_INFO DRIVER_NAME ": laptop model unknown, " "using default axes configuration\n"); - adev.ac = lis3lv02d_axis_normal; + lis3_dev.ac = lis3lv02d_axis_normal; } INIT_WORK(&hpled_led.work, delayed_set_status_worker); @@ -322,9 +322,9 @@ static int lis3lv02d_add(struct acpi_device *device) return ret; /* obtain IRQ number of our device from ACPI */ - lis3lv02d_enum_resources(adev.device); + lis3lv02d_enum_resources(lis3_dev.device); - ret = lis3lv02d_init_device(&adev); + ret = lis3lv02d_init_device(&lis3_dev); if (ret) { flush_work(&hpled_led.work); led_classdev_unregister(&hpled_led.led_classdev); @@ -360,12 +360,12 @@ static int lis3lv02d_suspend(struct acpi_device *device, pm_message_t state) static int lis3lv02d_resume(struct acpi_device *device) { /* put back the device in the right state (ACPI might turn it on) */ - mutex_lock(&adev.lock); - if (adev.usage > 0) + mutex_lock(&lis3_dev.lock); + if (lis3_dev.usage > 0) lis3lv02d_poweron(device->handle); else lis3lv02d_poweroff(device->handle); - mutex_unlock(&adev.lock); + mutex_unlock(&lis3_dev.lock); return 0; } #else diff --git a/drivers/hwmon/lis3lv02d.c b/drivers/hwmon/lis3lv02d.c index 8bb2158f0453..4888ac5ba8cc 100644 --- a/drivers/hwmon/lis3lv02d.c +++ b/drivers/hwmon/lis3lv02d.c @@ -53,14 +53,24 @@ * joystick. */ -struct acpi_lis3lv02d adev = { - .misc_wait = __WAIT_QUEUE_HEAD_INITIALIZER(adev.misc_wait), +struct acpi_lis3lv02d lis3_dev = { + .misc_wait = __WAIT_QUEUE_HEAD_INITIALIZER(lis3_dev.misc_wait), }; -EXPORT_SYMBOL_GPL(adev); +EXPORT_SYMBOL_GPL(lis3_dev); static int lis3lv02d_add_fs(struct acpi_device *device); +static s16 lis3lv02d_read_16(acpi_handle handle, int reg) +{ + u8 lo, hi; + + lis3_dev.read(handle, reg, &lo); + lis3_dev.read(handle, reg + 1, &hi); + /* In "12 bit right justified" mode, bit 6, bit 7, bit 8 = bit 5 */ + return (s16)((hi << 8) | lo); +} + /** * lis3lv02d_get_axis - For the given axis, give the value converted * @axis: 1,2,3 - can also be negative @@ -89,25 +99,25 @@ static void lis3lv02d_get_xyz(acpi_handle handle, int *x, int *y, int *z) { int position[3]; - position[0] = adev.read_data(handle, OUTX); - position[1] = adev.read_data(handle, OUTY); - position[2] = adev.read_data(handle, OUTZ); + position[0] = lis3_dev.read_data(handle, OUTX); + position[1] = lis3_dev.read_data(handle, OUTY); + position[2] = lis3_dev.read_data(handle, OUTZ); - *x = lis3lv02d_get_axis(adev.ac.x, position); - *y = lis3lv02d_get_axis(adev.ac.y, position); - *z = lis3lv02d_get_axis(adev.ac.z, position); + *x = lis3lv02d_get_axis(lis3_dev.ac.x, position); + *y = lis3lv02d_get_axis(lis3_dev.ac.y, position); + *z = lis3lv02d_get_axis(lis3_dev.ac.z, position); } void lis3lv02d_poweroff(acpi_handle handle) { - adev.is_on = 0; + lis3_dev.is_on = 0; } EXPORT_SYMBOL_GPL(lis3lv02d_poweroff); void lis3lv02d_poweron(acpi_handle handle) { - adev.is_on = 1; - adev.init(handle); + lis3_dev.is_on = 1; + lis3_dev.init(handle); } EXPORT_SYMBOL_GPL(lis3lv02d_poweron); @@ -147,10 +157,10 @@ static irqreturn_t lis302dl_interrupt(int irq, void *dummy) * the lid is closed. This leads to interrupts as soon as a little move * is done. */ - atomic_inc(&adev.count); + atomic_inc(&lis3_dev.count); - wake_up_interruptible(&adev.misc_wait); - kill_fasync(&adev.async_queue, SIGIO, POLL_IN); + wake_up_interruptible(&lis3_dev.misc_wait); + kill_fasync(&lis3_dev.async_queue, SIGIO, POLL_IN); return IRQ_HANDLED; } @@ -158,10 +168,10 @@ static int lis3lv02d_misc_open(struct inode *inode, struct file *file) { int ret; - if (test_and_set_bit(0, &adev.misc_opened)) + if (test_and_set_bit(0, &lis3_dev.misc_opened)) return -EBUSY; /* already open */ - atomic_set(&adev.count, 0); + atomic_set(&lis3_dev.count, 0); /* * The sensor can generate interrupts for free-fall and direction @@ -174,25 +184,25 @@ static int lis3lv02d_misc_open(struct inode *inode, struct file *file) * io-apic is not configurable (and generates a warning) but I keep it * in case of support for other hardware. */ - ret = request_irq(adev.irq, lis302dl_interrupt, IRQF_TRIGGER_RISING, - DRIVER_NAME, &adev); + ret = request_irq(lis3_dev.irq, lis302dl_interrupt, IRQF_TRIGGER_RISING, + DRIVER_NAME, &lis3_dev); if (ret) { - clear_bit(0, &adev.misc_opened); - printk(KERN_ERR DRIVER_NAME ": IRQ%d allocation failed\n", adev.irq); + clear_bit(0, &lis3_dev.misc_opened); + printk(KERN_ERR DRIVER_NAME ": IRQ%d allocation failed\n", lis3_dev.irq); return -EBUSY; } - lis3lv02d_increase_use(&adev); - printk("lis3: registered interrupt %d\n", adev.irq); + lis3lv02d_increase_use(&lis3_dev); + printk("lis3: registered interrupt %d\n", lis3_dev.irq); return 0; } static int lis3lv02d_misc_release(struct inode *inode, struct file *file) { - fasync_helper(-1, file, 0, &adev.async_queue); - lis3lv02d_decrease_use(&adev); - free_irq(adev.irq, &adev); - clear_bit(0, &adev.misc_opened); /* release the device */ + fasync_helper(-1, file, 0, &lis3_dev.async_queue); + lis3lv02d_decrease_use(&lis3_dev); + free_irq(lis3_dev.irq, &lis3_dev); + clear_bit(0, &lis3_dev.misc_opened); /* release the device */ return 0; } @@ -207,10 +217,10 @@ static ssize_t lis3lv02d_misc_read(struct file *file, char __user *buf, if (count < 1) return -EINVAL; - add_wait_queue(&adev.misc_wait, &wait); + add_wait_queue(&lis3_dev.misc_wait, &wait); while (true) { set_current_state(TASK_INTERRUPTIBLE); - data = atomic_xchg(&adev.count, 0); + data = atomic_xchg(&lis3_dev.count, 0); if (data) break; @@ -240,22 +250,22 @@ static ssize_t lis3lv02d_misc_read(struct file *file, char __user *buf, out: __set_current_state(TASK_RUNNING); - remove_wait_queue(&adev.misc_wait, &wait); + remove_wait_queue(&lis3_dev.misc_wait, &wait); return retval; } static unsigned int lis3lv02d_misc_poll(struct file *file, poll_table *wait) { - poll_wait(file, &adev.misc_wait, wait); - if (atomic_read(&adev.count)) + poll_wait(file, &lis3_dev.misc_wait, wait); + if (atomic_read(&lis3_dev.count)) return POLLIN | POLLRDNORM; return 0; } static int lis3lv02d_misc_fasync(int fd, struct file *file, int on) { - return fasync_helper(fd, file, on, &adev.async_queue); + return fasync_helper(fd, file, on, &lis3_dev.async_queue); } static const struct file_operations lis3lv02d_misc_fops = { @@ -283,12 +293,12 @@ static int lis3lv02d_joystick_kthread(void *data) int x, y, z; while (!kthread_should_stop()) { - lis3lv02d_get_xyz(adev.device->handle, &x, &y, &z); - input_report_abs(adev.idev, ABS_X, x - adev.xcalib); - input_report_abs(adev.idev, ABS_Y, y - adev.ycalib); - input_report_abs(adev.idev, ABS_Z, z - adev.zcalib); + lis3lv02d_get_xyz(lis3_dev.device->handle, &x, &y, &z); + input_report_abs(lis3_dev.idev, ABS_X, x - lis3_dev.xcalib); + input_report_abs(lis3_dev.idev, ABS_Y, y - lis3_dev.ycalib); + input_report_abs(lis3_dev.idev, ABS_Z, z - lis3_dev.zcalib); - input_sync(adev.idev); + input_sync(lis3_dev.idev); try_to_freeze(); msleep_interruptible(MDPS_POLL_INTERVAL); @@ -299,11 +309,11 @@ static int lis3lv02d_joystick_kthread(void *data) static int lis3lv02d_joystick_open(struct input_dev *input) { - lis3lv02d_increase_use(&adev); - adev.kthread = kthread_run(lis3lv02d_joystick_kthread, NULL, "klis3lv02d"); - if (IS_ERR(adev.kthread)) { - lis3lv02d_decrease_use(&adev); - return PTR_ERR(adev.kthread); + lis3lv02d_increase_use(&lis3_dev); + lis3_dev.kthread = kthread_run(lis3lv02d_joystick_kthread, NULL, "klis3lv02d"); + if (IS_ERR(lis3_dev.kthread)) { + lis3lv02d_decrease_use(&lis3_dev); + return PTR_ERR(lis3_dev.kthread); } return 0; @@ -311,45 +321,45 @@ static int lis3lv02d_joystick_open(struct input_dev *input) static void lis3lv02d_joystick_close(struct input_dev *input) { - kthread_stop(adev.kthread); - lis3lv02d_decrease_use(&adev); + kthread_stop(lis3_dev.kthread); + lis3lv02d_decrease_use(&lis3_dev); } static inline void lis3lv02d_calibrate_joystick(void) { - lis3lv02d_get_xyz(adev.device->handle, &adev.xcalib, &adev.ycalib, &adev.zcalib); + lis3lv02d_get_xyz(lis3_dev.device->handle, &lis3_dev.xcalib, &lis3_dev.ycalib, &lis3_dev.zcalib); } int lis3lv02d_joystick_enable(void) { int err; - if (adev.idev) + if (lis3_dev.idev) return -EINVAL; - adev.idev = input_allocate_device(); - if (!adev.idev) + lis3_dev.idev = input_allocate_device(); + if (!lis3_dev.idev) return -ENOMEM; lis3lv02d_calibrate_joystick(); - adev.idev->name = "ST LIS3LV02DL Accelerometer"; - adev.idev->phys = DRIVER_NAME "/input0"; - adev.idev->id.bustype = BUS_HOST; - adev.idev->id.vendor = 0; - adev.idev->dev.parent = &adev.pdev->dev; - adev.idev->open = lis3lv02d_joystick_open; - adev.idev->close = lis3lv02d_joystick_close; + lis3_dev.idev->name = "ST LIS3LV02DL Accelerometer"; + lis3_dev.idev->phys = DRIVER_NAME "/input0"; + lis3_dev.idev->id.bustype = BUS_HOST; + lis3_dev.idev->id.vendor = 0; + lis3_dev.idev->dev.parent = &lis3_dev.pdev->dev; + lis3_dev.idev->open = lis3lv02d_joystick_open; + lis3_dev.idev->close = lis3lv02d_joystick_close; - set_bit(EV_ABS, adev.idev->evbit); - input_set_abs_params(adev.idev, ABS_X, -adev.mdps_max_val, adev.mdps_max_val, 3, 3); - input_set_abs_params(adev.idev, ABS_Y, -adev.mdps_max_val, adev.mdps_max_val, 3, 3); - input_set_abs_params(adev.idev, ABS_Z, -adev.mdps_max_val, adev.mdps_max_val, 3, 3); + set_bit(EV_ABS, lis3_dev.idev->evbit); + input_set_abs_params(lis3_dev.idev, ABS_X, -lis3_dev.mdps_max_val, lis3_dev.mdps_max_val, 3, 3); + input_set_abs_params(lis3_dev.idev, ABS_Y, -lis3_dev.mdps_max_val, lis3_dev.mdps_max_val, 3, 3); + input_set_abs_params(lis3_dev.idev, ABS_Z, -lis3_dev.mdps_max_val, lis3_dev.mdps_max_val, 3, 3); - err = input_register_device(adev.idev); + err = input_register_device(lis3_dev.idev); if (err) { - input_free_device(adev.idev); - adev.idev = NULL; + input_free_device(lis3_dev.idev); + lis3_dev.idev = NULL; } return err; @@ -358,12 +368,12 @@ EXPORT_SYMBOL_GPL(lis3lv02d_joystick_enable); void lis3lv02d_joystick_disable(void) { - if (!adev.idev) + if (!lis3_dev.idev) return; misc_deregister(&lis3lv02d_misc_device); - input_unregister_device(adev.idev); - adev.idev = NULL; + input_unregister_device(lis3_dev.idev); + lis3_dev.idev = NULL; } EXPORT_SYMBOL_GPL(lis3lv02d_joystick_disable); @@ -404,25 +414,25 @@ static ssize_t lis3lv02d_position_show(struct device *dev, { int x, y, z; - lis3lv02d_increase_use(&adev); - lis3lv02d_get_xyz(adev.device->handle, &x, &y, &z); - lis3lv02d_decrease_use(&adev); + lis3lv02d_increase_use(&lis3_dev); + lis3lv02d_get_xyz(lis3_dev.device->handle, &x, &y, &z); + lis3lv02d_decrease_use(&lis3_dev); return sprintf(buf, "(%d,%d,%d)\n", x, y, z); } static ssize_t lis3lv02d_calibrate_show(struct device *dev, struct device_attribute *attr, char *buf) { - return sprintf(buf, "(%d,%d,%d)\n", adev.xcalib, adev.ycalib, adev.zcalib); + return sprintf(buf, "(%d,%d,%d)\n", lis3_dev.xcalib, lis3_dev.ycalib, lis3_dev.zcalib); } static ssize_t lis3lv02d_calibrate_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { - lis3lv02d_increase_use(&adev); + lis3lv02d_increase_use(&lis3_dev); lis3lv02d_calibrate_joystick(); - lis3lv02d_decrease_use(&adev); + lis3lv02d_decrease_use(&lis3_dev); return count; } @@ -434,9 +444,9 @@ static ssize_t lis3lv02d_rate_show(struct device *dev, u8 ctrl; int val; - lis3lv02d_increase_use(&adev); - adev.read(adev.device->handle, CTRL_REG1, &ctrl); - lis3lv02d_decrease_use(&adev); + lis3lv02d_increase_use(&lis3_dev); + lis3_dev.read(lis3_dev.device->handle, CTRL_REG1, &ctrl); + lis3lv02d_decrease_use(&lis3_dev); val = (ctrl & (CTRL1_DF0 | CTRL1_DF1)) >> 4; return sprintf(buf, "%d\n", lis3lv02dl_df_val[val]); } @@ -460,17 +470,17 @@ static struct attribute_group lis3lv02d_attribute_group = { static int lis3lv02d_add_fs(struct acpi_device *device) { - adev.pdev = platform_device_register_simple(DRIVER_NAME, -1, NULL, 0); - if (IS_ERR(adev.pdev)) - return PTR_ERR(adev.pdev); + lis3_dev.pdev = platform_device_register_simple(DRIVER_NAME, -1, NULL, 0); + if (IS_ERR(lis3_dev.pdev)) + return PTR_ERR(lis3_dev.pdev); - return sysfs_create_group(&adev.pdev->dev.kobj, &lis3lv02d_attribute_group); + return sysfs_create_group(&lis3_dev.pdev->dev.kobj, &lis3lv02d_attribute_group); } int lis3lv02d_remove_fs(void) { - sysfs_remove_group(&adev.pdev->dev.kobj, &lis3lv02d_attribute_group); - platform_device_unregister(adev.pdev); + sysfs_remove_group(&lis3_dev.pdev->dev.kobj, &lis3lv02d_attribute_group); + platform_device_unregister(lis3_dev.pdev); return 0; } EXPORT_SYMBOL_GPL(lis3lv02d_remove_fs); diff --git a/drivers/hwmon/lis3lv02d.h b/drivers/hwmon/lis3lv02d.h index 75972bf372ff..017fb2b754d5 100644 --- a/drivers/hwmon/lis3lv02d.h +++ b/drivers/hwmon/lis3lv02d.h @@ -194,4 +194,4 @@ void lis3lv02d_poweroff(acpi_handle handle); void lis3lv02d_poweron(acpi_handle handle); int lis3lv02d_remove_fs(void); -extern struct acpi_lis3lv02d adev; +extern struct acpi_lis3lv02d lis3_dev; From 061603275814544842e7df77d1157eff18565997 Mon Sep 17 00:00:00 2001 From: Davide Rizzo Date: Tue, 31 Mar 2009 15:24:27 -0700 Subject: [PATCH 380/478] hwmon: LM95241 driver An hwmon driver for the National Semiconductor LM95241 triple temperature sensors chip Signed-off-by: Davide Rizzo Cc: Jean Delvare Cc: "Mark M. Hoffman" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/hwmon/Kconfig | 9 + drivers/hwmon/Makefile | 1 + drivers/hwmon/lm95241.c | 527 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 537 insertions(+) create mode 100644 drivers/hwmon/lm95241.c diff --git a/drivers/hwmon/Kconfig b/drivers/hwmon/Kconfig index 51ff9b3d7ea2..8a91c9971558 100644 --- a/drivers/hwmon/Kconfig +++ b/drivers/hwmon/Kconfig @@ -582,6 +582,15 @@ config SENSORS_LTC4245 This driver can also be built as a module. If so, the module will be called ltc4245. +config SENSORS_LM95241 + tristate "National Semiconductor LM95241 sensor chip" + depends on I2C + help + If you say yes here you get support for LM95241 sensor chip. + + This driver can also be built as a module. If so, the module + will be called lm95241. + config SENSORS_MAX1111 tristate "Maxim MAX1111 Multichannel, Serial 8-bit ADC chip" depends on SPI_MASTER diff --git a/drivers/hwmon/Makefile b/drivers/hwmon/Makefile index e332d6267920..81c88822a3eb 100644 --- a/drivers/hwmon/Makefile +++ b/drivers/hwmon/Makefile @@ -64,6 +64,7 @@ obj-$(CONFIG_SENSORS_LM87) += lm87.o obj-$(CONFIG_SENSORS_LM90) += lm90.o obj-$(CONFIG_SENSORS_LM92) += lm92.o obj-$(CONFIG_SENSORS_LM93) += lm93.o +obj-$(CONFIG_SENSORS_LM95241) += lm95241.o obj-$(CONFIG_SENSORS_LTC4245) += ltc4245.o obj-$(CONFIG_SENSORS_MAX1111) += max1111.o obj-$(CONFIG_SENSORS_MAX1619) += max1619.o diff --git a/drivers/hwmon/lm95241.c b/drivers/hwmon/lm95241.c new file mode 100644 index 000000000000..091d95f38aaa --- /dev/null +++ b/drivers/hwmon/lm95241.c @@ -0,0 +1,527 @@ +/* + * lm95241.c - Part of lm_sensors, Linux kernel modules for hardware + * monitoring + * Copyright (C) 2008 Davide Rizzo + * + * Based on the max1619 driver. The LM95241 is a sensor chip made by National + * Semiconductors. + * It reports up to three temperatures (its own plus up to + * two external ones). Complete datasheet can be + * obtained from National's website at: + * http://www.national.com/ds.cgi/LM/LM95241.pdf + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static const unsigned short normal_i2c[] = { + 0x19, 0x2a, 0x2b, I2C_CLIENT_END}; + +/* Insmod parameters */ +I2C_CLIENT_INSMOD_1(lm95241); + +/* LM95241 registers */ +#define LM95241_REG_R_MAN_ID 0xFE +#define LM95241_REG_R_CHIP_ID 0xFF +#define LM95241_REG_R_STATUS 0x02 +#define LM95241_REG_RW_CONFIG 0x03 +#define LM95241_REG_RW_REM_FILTER 0x06 +#define LM95241_REG_RW_TRUTHERM 0x07 +#define LM95241_REG_W_ONE_SHOT 0x0F +#define LM95241_REG_R_LOCAL_TEMPH 0x10 +#define LM95241_REG_R_REMOTE1_TEMPH 0x11 +#define LM95241_REG_R_REMOTE2_TEMPH 0x12 +#define LM95241_REG_R_LOCAL_TEMPL 0x20 +#define LM95241_REG_R_REMOTE1_TEMPL 0x21 +#define LM95241_REG_R_REMOTE2_TEMPL 0x22 +#define LM95241_REG_RW_REMOTE_MODEL 0x30 + +/* LM95241 specific bitfields */ +#define CFG_STOP 0x40 +#define CFG_CR0076 0x00 +#define CFG_CR0182 0x10 +#define CFG_CR1000 0x20 +#define CFG_CR2700 0x30 +#define R1MS_SHIFT 0 +#define R2MS_SHIFT 2 +#define R1MS_MASK (0x01 << (R1MS_SHIFT)) +#define R2MS_MASK (0x01 << (R2MS_SHIFT)) +#define R1DF_SHIFT 1 +#define R2DF_SHIFT 2 +#define R1DF_MASK (0x01 << (R1DF_SHIFT)) +#define R2DF_MASK (0x01 << (R2DF_SHIFT)) +#define R1FE_MASK 0x01 +#define R2FE_MASK 0x05 +#define TT1_SHIFT 0 +#define TT2_SHIFT 4 +#define TT_OFF 0 +#define TT_ON 1 +#define TT_MASK 7 +#define MANUFACTURER_ID 0x01 +#define DEFAULT_REVISION 0xA4 + +/* Conversions and various macros */ +#define TEMP_FROM_REG(val_h, val_l) (((val_h) & 0x80 ? (val_h) - 0x100 : \ + (val_h)) * 1000 + (val_l) * 1000 / 256) + +/* Functions declaration */ +static int lm95241_attach_adapter(struct i2c_adapter *adapter); +static int lm95241_detect(struct i2c_adapter *adapter, int address, + int kind); +static void lm95241_init_client(struct i2c_client *client); +static int lm95241_detach_client(struct i2c_client *client); +static struct lm95241_data *lm95241_update_device(struct device *dev); + +/* Driver data (common to all clients) */ +static struct i2c_driver lm95241_driver = { + .driver = { + .name = "lm95241", + }, + .attach_adapter = lm95241_attach_adapter, + .detach_client = lm95241_detach_client, +}; + +/* Client data (each client gets its own) */ +struct lm95241_data { + struct i2c_client client; + struct device *hwmon_dev; + struct mutex update_lock; + unsigned long last_updated, rate; /* in jiffies */ + char valid; /* zero until following fields are valid */ + /* registers values */ + u8 local_h, local_l; /* local */ + u8 remote1_h, remote1_l; /* remote1 */ + u8 remote2_h, remote2_l; /* remote2 */ + u8 config, model, trutherm; +}; + +/* Sysfs stuff */ +#define show_temp(value) \ +static ssize_t show_##value(struct device *dev, \ + struct device_attribute *attr, char *buf) \ +{ \ + struct lm95241_data *data = lm95241_update_device(dev); \ + snprintf(buf, PAGE_SIZE - 1, "%d\n", \ + TEMP_FROM_REG(data->value##_h, data->value##_l)); \ + return strlen(buf); \ +} +show_temp(local); +show_temp(remote1); +show_temp(remote2); + +static ssize_t show_rate(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct lm95241_data *data = lm95241_update_device(dev); + + snprintf(buf, PAGE_SIZE - 1, "%lu\n", 1000 * data->rate / HZ); + return strlen(buf); +} + +static ssize_t set_rate(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + struct i2c_client *client = to_i2c_client(dev); + struct lm95241_data *data = i2c_get_clientdata(client); + + strict_strtol(buf, 10, &data->rate); + data->rate = data->rate * HZ / 1000; + + return count; +} + +#define show_type(flag) \ +static ssize_t show_type##flag(struct device *dev, \ + struct device_attribute *attr, char *buf) \ +{ \ + struct i2c_client *client = to_i2c_client(dev); \ + struct lm95241_data *data = i2c_get_clientdata(client); \ +\ + snprintf(buf, PAGE_SIZE - 1, \ + data->model & R##flag##MS_MASK ? "1\n" : "2\n"); \ + return strlen(buf); \ +} +show_type(1); +show_type(2); + +#define show_min(flag) \ +static ssize_t show_min##flag(struct device *dev, \ + struct device_attribute *attr, char *buf) \ +{ \ + struct i2c_client *client = to_i2c_client(dev); \ + struct lm95241_data *data = i2c_get_clientdata(client); \ +\ + snprintf(buf, PAGE_SIZE - 1, \ + data->config & R##flag##DF_MASK ? \ + "-127000\n" : "0\n"); \ + return strlen(buf); \ +} +show_min(1); +show_min(2); + +#define show_max(flag) \ +static ssize_t show_max##flag(struct device *dev, \ + struct device_attribute *attr, char *buf) \ +{ \ + struct i2c_client *client = to_i2c_client(dev); \ + struct lm95241_data *data = i2c_get_clientdata(client); \ +\ + snprintf(buf, PAGE_SIZE - 1, \ + data->config & R##flag##DF_MASK ? \ + "127000\n" : "255000\n"); \ + return strlen(buf); \ +} +show_max(1); +show_max(2); + +#define set_type(flag) \ +static ssize_t set_type##flag(struct device *dev, \ + struct device_attribute *attr, \ + const char *buf, size_t count) \ +{ \ + struct i2c_client *client = to_i2c_client(dev); \ + struct lm95241_data *data = i2c_get_clientdata(client); \ +\ + long val; \ + strict_strtol(buf, 10, &val); \ +\ + if ((val == 1) || (val == 2)) { \ +\ + mutex_lock(&data->update_lock); \ +\ + data->trutherm &= ~(TT_MASK << TT##flag##_SHIFT); \ + if (val == 1) { \ + data->model |= R##flag##MS_MASK; \ + data->trutherm |= (TT_ON << TT##flag##_SHIFT); \ + } \ + else { \ + data->model &= ~R##flag##MS_MASK; \ + data->trutherm |= (TT_OFF << TT##flag##_SHIFT); \ + } \ +\ + data->valid = 0; \ +\ + i2c_smbus_write_byte_data(client, LM95241_REG_RW_REMOTE_MODEL, \ + data->model); \ + i2c_smbus_write_byte_data(client, LM95241_REG_RW_TRUTHERM, \ + data->trutherm); \ +\ + mutex_unlock(&data->update_lock); \ +\ + } \ + return count; \ +} +set_type(1); +set_type(2); + +#define set_min(flag) \ +static ssize_t set_min##flag(struct device *dev, \ + struct device_attribute *devattr, const char *buf, size_t count) \ +{ \ + struct i2c_client *client = to_i2c_client(dev); \ + struct lm95241_data *data = i2c_get_clientdata(client); \ +\ + long val; \ + strict_strtol(buf, 10, &val); \ +\ + mutex_lock(&data->update_lock); \ +\ + if (val < 0) \ + data->config |= R##flag##DF_MASK; \ + else \ + data->config &= ~R##flag##DF_MASK; \ +\ + data->valid = 0; \ +\ + i2c_smbus_write_byte_data(client, LM95241_REG_RW_CONFIG, \ + data->config); \ +\ + mutex_unlock(&data->update_lock); \ +\ + return count; \ +} +set_min(1); +set_min(2); + +#define set_max(flag) \ +static ssize_t set_max##flag(struct device *dev, \ + struct device_attribute *devattr, const char *buf, size_t count) \ +{ \ + struct i2c_client *client = to_i2c_client(dev); \ + struct lm95241_data *data = i2c_get_clientdata(client); \ +\ + long val; \ + strict_strtol(buf, 10, &val); \ +\ + mutex_lock(&data->update_lock); \ +\ + if (val <= 127000) \ + data->config |= R##flag##DF_MASK; \ + else \ + data->config &= ~R##flag##DF_MASK; \ +\ + data->valid = 0; \ +\ + i2c_smbus_write_byte_data(client, LM95241_REG_RW_CONFIG, \ + data->config); \ +\ + mutex_unlock(&data->update_lock); \ +\ + return count; \ +} +set_max(1); +set_max(2); + +static DEVICE_ATTR(temp1_input, S_IRUGO, show_local, NULL); +static DEVICE_ATTR(temp2_input, S_IRUGO, show_remote1, NULL); +static DEVICE_ATTR(temp3_input, S_IRUGO, show_remote2, NULL); +static DEVICE_ATTR(temp2_type, S_IWUSR | S_IRUGO, show_type1, set_type1); +static DEVICE_ATTR(temp3_type, S_IWUSR | S_IRUGO, show_type2, set_type2); +static DEVICE_ATTR(temp2_min, S_IWUSR | S_IRUGO, show_min1, set_min1); +static DEVICE_ATTR(temp3_min, S_IWUSR | S_IRUGO, show_min2, set_min2); +static DEVICE_ATTR(temp2_max, S_IWUSR | S_IRUGO, show_max1, set_max1); +static DEVICE_ATTR(temp3_max, S_IWUSR | S_IRUGO, show_max2, set_max2); +static DEVICE_ATTR(rate, S_IWUSR | S_IRUGO, show_rate, set_rate); + +static struct attribute *lm95241_attributes[] = { + &dev_attr_temp1_input.attr, + &dev_attr_temp2_input.attr, + &dev_attr_temp3_input.attr, + &dev_attr_temp2_type.attr, + &dev_attr_temp3_type.attr, + &dev_attr_temp2_min.attr, + &dev_attr_temp3_min.attr, + &dev_attr_temp2_max.attr, + &dev_attr_temp3_max.attr, + &dev_attr_rate.attr, + NULL +}; + +static const struct attribute_group lm95241_group = { + .attrs = lm95241_attributes, +}; + +/* Init/exit code */ +static int lm95241_attach_adapter(struct i2c_adapter *adapter) +{ + if (!(adapter->class & I2C_CLASS_HWMON)) + return 0; + return i2c_probe(adapter, &addr_data, lm95241_detect); +} + +/* + * The following function does more than just detection. If detection + * succeeds, it also registers the new chip. + */ +static int lm95241_detect(struct i2c_adapter *adapter, int address, int kind) +{ + struct i2c_client *new_client; + struct lm95241_data *data; + int err = 0; + const char *name = ""; + + if (!i2c_check_functionality(adapter, I2C_FUNC_SMBUS_BYTE_DATA)) + goto exit; + + data = kzalloc(sizeof(struct lm95241_data), GFP_KERNEL); + if (!data) { + err = -ENOMEM; + goto exit; + } + + /* The common I2C client data is placed right before the + LM95241-specific data. */ + new_client = &data->client; + i2c_set_clientdata(new_client, data); + new_client->addr = address; + new_client->adapter = adapter; + new_client->driver = &lm95241_driver; + new_client->flags = 0; + + /* + * Now we do the remaining detection. A negative kind means that + * the driver was loaded with no force parameter (default), so we + * must both detect and identify the chip. A zero kind means that + * the driver was loaded with the force parameter, the detection + * step shall be skipped. A positive kind means that the driver + * was loaded with the force parameter and a given kind of chip is + * requested, so both the detection and the identification steps + * are skipped. + */ + if (kind < 0) { /* detection */ + if ((i2c_smbus_read_byte_data(new_client, LM95241_REG_R_MAN_ID) + != MANUFACTURER_ID) + || (i2c_smbus_read_byte_data(new_client, LM95241_REG_R_CHIP_ID) + < DEFAULT_REVISION)) { + dev_dbg(&adapter->dev, + "LM95241 detection failed at 0x%02x.\n", + address); + goto exit_free; + } + } + + if (kind <= 0) { /* identification */ + if ((i2c_smbus_read_byte_data(new_client, LM95241_REG_R_MAN_ID) + == MANUFACTURER_ID) + && (i2c_smbus_read_byte_data(new_client, LM95241_REG_R_CHIP_ID) + >= DEFAULT_REVISION)) { + + kind = lm95241; + + if (kind <= 0) { /* identification failed */ + dev_info(&adapter->dev, "Unsupported chip\n"); + goto exit_free; + } + } + } + + if (kind == lm95241) + name = "lm95241"; + + /* We can fill in the remaining client fields */ + strlcpy(new_client->name, name, I2C_NAME_SIZE); + data->valid = 0; + mutex_init(&data->update_lock); + + /* Tell the I2C layer a new client has arrived */ + err = i2c_attach_client(new_client); + if (err) + goto exit_free; + + /* Initialize the LM95241 chip */ + lm95241_init_client(new_client); + + /* Register sysfs hooks */ + err = sysfs_create_group(&new_client->dev.kobj, &lm95241_group); + if (err) + goto exit_detach; + + data->hwmon_dev = hwmon_device_register(&new_client->dev); + if (IS_ERR(data->hwmon_dev)) { + err = PTR_ERR(data->hwmon_dev); + goto exit_remove_files; + } + + return 0; + +exit_remove_files: + sysfs_remove_group(&new_client->dev.kobj, &lm95241_group); +exit_detach: + i2c_detach_client(new_client); +exit_free: + kfree(data); +exit: + return err; +} + +static void lm95241_init_client(struct i2c_client *client) +{ + struct lm95241_data *data = i2c_get_clientdata(client); + + data->rate = HZ; /* 1 sec default */ + data->valid = 0; + data->config = CFG_CR0076; + data->model = 0; + data->trutherm = (TT_OFF << TT1_SHIFT) | (TT_OFF << TT2_SHIFT); + + i2c_smbus_write_byte_data(client, LM95241_REG_RW_CONFIG, + data->config); + i2c_smbus_write_byte_data(client, LM95241_REG_RW_REM_FILTER, + R1FE_MASK | R2FE_MASK); + i2c_smbus_write_byte_data(client, LM95241_REG_RW_TRUTHERM, + data->trutherm); + i2c_smbus_write_byte_data(client, LM95241_REG_RW_REMOTE_MODEL, + data->model); +} + +static int lm95241_detach_client(struct i2c_client *client) +{ + struct lm95241_data *data = i2c_get_clientdata(client); + int err; + + hwmon_device_unregister(data->hwmon_dev); + sysfs_remove_group(&client->dev.kobj, &lm95241_group); + + err = i2c_detach_client(client); + if (err) + return err; + + kfree(data); + return 0; +} + +static struct lm95241_data *lm95241_update_device(struct device *dev) +{ + struct i2c_client *client = to_i2c_client(dev); + struct lm95241_data *data = i2c_get_clientdata(client); + + mutex_lock(&data->update_lock); + + if (time_after(jiffies, data->last_updated + data->rate) || + !data->valid) { + dev_dbg(&client->dev, "Updating lm95241 data.\n"); + data->local_h = + i2c_smbus_read_byte_data(client, + LM95241_REG_R_LOCAL_TEMPH); + data->local_l = + i2c_smbus_read_byte_data(client, + LM95241_REG_R_LOCAL_TEMPL); + data->remote1_h = + i2c_smbus_read_byte_data(client, + LM95241_REG_R_REMOTE1_TEMPH); + data->remote1_l = + i2c_smbus_read_byte_data(client, + LM95241_REG_R_REMOTE1_TEMPL); + data->remote2_h = + i2c_smbus_read_byte_data(client, + LM95241_REG_R_REMOTE2_TEMPH); + data->remote2_l = + i2c_smbus_read_byte_data(client, + LM95241_REG_R_REMOTE2_TEMPL); + data->last_updated = jiffies; + data->valid = 1; + } + + mutex_unlock(&data->update_lock); + + return data; +} + +static int __init sensors_lm95241_init(void) +{ + return i2c_add_driver(&lm95241_driver); +} + +static void __exit sensors_lm95241_exit(void) +{ + i2c_del_driver(&lm95241_driver); +} + +MODULE_AUTHOR("Davide Rizzo "); +MODULE_DESCRIPTION("LM95241 sensor driver"); +MODULE_LICENSE("GPL"); + +module_init(sensors_lm95241_init); +module_exit(sensors_lm95241_exit); From 72f5de92e199f96cfcea125aefc76c138d8c553c Mon Sep 17 00:00:00 2001 From: Ira Snyder Date: Tue, 31 Mar 2009 15:24:29 -0700 Subject: [PATCH 381/478] hwmon: Add LTC4215 driver Add Linux support for the Linear Technology LTC4215 Hot Swap controller I2C monitoring interface. I have tested the driver with my board, and it appears to work fine. With the power supplies disabled, it reads 11.93V input, 1.93V output, no current and no power. With the supplies enabled, it reads 11.93V input, 11.98V output, no current, no power. I'm not drawing any current at the moment, so this is reasonable. The value in the sense register never reads anything except 0, so I expect to get zero from the current and power calculations. I didn't attempt to support changing any of the chip's settings or enabling the FET. I'm not sure even how to do that and still fit within the hwmon framework. :) Signed-off-by: Ira W. Snyder Cc: Jean Delvare Cc: "Mark M. Hoffman" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/hwmon/ltc4215 | 50 +++++ drivers/hwmon/Kconfig | 11 ++ drivers/hwmon/Makefile | 1 + drivers/hwmon/ltc4215.c | 364 ++++++++++++++++++++++++++++++++++++ 4 files changed, 426 insertions(+) create mode 100644 Documentation/hwmon/ltc4215 create mode 100644 drivers/hwmon/ltc4215.c diff --git a/Documentation/hwmon/ltc4215 b/Documentation/hwmon/ltc4215 new file mode 100644 index 000000000000..2e6a21eb656c --- /dev/null +++ b/Documentation/hwmon/ltc4215 @@ -0,0 +1,50 @@ +Kernel driver ltc4215 +===================== + +Supported chips: + * Linear Technology LTC4215 + Prefix: 'ltc4215' + Addresses scanned: 0x44 + Datasheet: + http://www.linear.com/pc/downloadDocument.do?navId=H0,C1,C1003,C1006,C1163,P17572,D12697 + +Author: Ira W. Snyder + + +Description +----------- + +The LTC4215 controller allows a board to be safely inserted and removed +from a live backplane. + + +Usage Notes +----------- + +This driver does not probe for LTC4215 devices, due to the fact that some +of the possible addresses are unfriendly to probing. You will need to use +the "force" parameter to tell the driver where to find the device. + +Example: the following will load the driver for an LTC4215 at address 0x44 +on I2C bus #0: +$ modprobe ltc4215 force=0,0x44 + + +Sysfs entries +------------- + +The LTC4215 has built-in limits for overvoltage, undervoltage, and +undercurrent warnings. This makes it very likely that the reference +circuit will be used. + +in1_input input voltage +in2_input output voltage + +in1_min_alarm input undervoltage alarm +in1_max_alarm input overvoltage alarm + +curr1_input current +curr1_max_alarm overcurrent alarm + +power1_input power usage +power1_alarm power bad alarm diff --git a/drivers/hwmon/Kconfig b/drivers/hwmon/Kconfig index 8a91c9971558..bc6810c22dd7 100644 --- a/drivers/hwmon/Kconfig +++ b/drivers/hwmon/Kconfig @@ -571,6 +571,17 @@ config SENSORS_LM93 This driver can also be built as a module. If so, the module will be called lm93. +config SENSORS_LTC4215 + tristate "Linear Technology LTC4215" + depends on I2C && EXPERIMENTAL + default n + help + If you say yes here you get support for Linear Technology LTC4215 + Hot Swap Controller I2C interface. + + This driver can also be built as a module. If so, the module will + be called ltc4215. + config SENSORS_LTC4245 tristate "Linear Technology LTC4245" depends on I2C && EXPERIMENTAL diff --git a/drivers/hwmon/Makefile b/drivers/hwmon/Makefile index 81c88822a3eb..4da261d16abf 100644 --- a/drivers/hwmon/Makefile +++ b/drivers/hwmon/Makefile @@ -65,6 +65,7 @@ obj-$(CONFIG_SENSORS_LM90) += lm90.o obj-$(CONFIG_SENSORS_LM92) += lm92.o obj-$(CONFIG_SENSORS_LM93) += lm93.o obj-$(CONFIG_SENSORS_LM95241) += lm95241.o +obj-$(CONFIG_SENSORS_LTC4215) += ltc4215.o obj-$(CONFIG_SENSORS_LTC4245) += ltc4245.o obj-$(CONFIG_SENSORS_MAX1111) += max1111.o obj-$(CONFIG_SENSORS_MAX1619) += max1619.o diff --git a/drivers/hwmon/ltc4215.c b/drivers/hwmon/ltc4215.c new file mode 100644 index 000000000000..9386e2a39211 --- /dev/null +++ b/drivers/hwmon/ltc4215.c @@ -0,0 +1,364 @@ +/* + * Driver for Linear Technology LTC4215 I2C Hot Swap Controller + * + * Copyright (C) 2009 Ira W. Snyder + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * Datasheet: + * http://www.linear.com/pc/downloadDocument.do?navId=H0,C1,C1003,C1006,C1163,P17572,D12697 + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +static const unsigned short normal_i2c[] = { I2C_CLIENT_END }; + +/* Insmod parameters */ +I2C_CLIENT_INSMOD_1(ltc4215); + +/* Here are names of the chip's registers (a.k.a. commands) */ +enum ltc4215_cmd { + LTC4215_CONTROL = 0x00, /* rw */ + LTC4215_ALERT = 0x01, /* rw */ + LTC4215_STATUS = 0x02, /* ro */ + LTC4215_FAULT = 0x03, /* rw */ + LTC4215_SENSE = 0x04, /* rw */ + LTC4215_SOURCE = 0x05, /* rw */ + LTC4215_ADIN = 0x06, /* rw */ +}; + +struct ltc4215_data { + struct device *hwmon_dev; + + struct mutex update_lock; + bool valid; + unsigned long last_updated; /* in jiffies */ + + /* Registers */ + u8 regs[7]; +}; + +static struct ltc4215_data *ltc4215_update_device(struct device *dev) +{ + struct i2c_client *client = to_i2c_client(dev); + struct ltc4215_data *data = i2c_get_clientdata(client); + s32 val; + int i; + + mutex_lock(&data->update_lock); + + /* The chip's A/D updates 10 times per second */ + if (time_after(jiffies, data->last_updated + HZ / 10) || !data->valid) { + + dev_dbg(&client->dev, "Starting ltc4215 update\n"); + + /* Read all registers */ + for (i = 0; i < ARRAY_SIZE(data->regs); i++) { + val = i2c_smbus_read_byte_data(client, i); + if (unlikely(val < 0)) + data->regs[i] = 0; + else + data->regs[i] = val; + } + + data->last_updated = jiffies; + data->valid = 1; + } + + mutex_unlock(&data->update_lock); + + return data; +} + +/* Return the voltage from the given register in millivolts */ +static int ltc4215_get_voltage(struct device *dev, u8 reg) +{ + struct ltc4215_data *data = ltc4215_update_device(dev); + const u8 regval = data->regs[reg]; + u32 voltage = 0; + + switch (reg) { + case LTC4215_SENSE: + /* 151 uV per increment */ + voltage = regval * 151 / 1000; + break; + case LTC4215_SOURCE: + /* 60.5 mV per increment */ + voltage = regval * 605 / 10; + break; + case LTC4215_ADIN: + /* The ADIN input is divided by 12.5, and has 4.82 mV + * per increment, so we have the additional multiply */ + voltage = regval * 482 * 125 / 1000; + break; + default: + /* If we get here, the developer messed up */ + WARN_ON_ONCE(1); + break; + } + + return voltage; +} + +/* Return the current from the sense resistor in mA */ +static unsigned int ltc4215_get_current(struct device *dev) +{ + struct ltc4215_data *data = ltc4215_update_device(dev); + + /* The strange looking conversions that follow are fixed-point + * math, since we cannot do floating point in the kernel. + * + * Step 1: convert sense register to microVolts + * Step 2: convert voltage to milliAmperes + * + * If you play around with the V=IR equation, you come up with + * the following: X uV / Y mOhm == Z mA + * + * With the resistors that are fractions of a milliOhm, we multiply + * the voltage and resistance by 10, to shift the decimal point. + * Now we can use the normal division operator again. + */ + + /* Calculate voltage in microVolts (151 uV per increment) */ + const unsigned int voltage = data->regs[LTC4215_SENSE] * 151; + + /* Calculate current in milliAmperes (4 milliOhm sense resistor) */ + const unsigned int curr = voltage / 4; + + return curr; +} + +static ssize_t ltc4215_show_voltage(struct device *dev, + struct device_attribute *da, + char *buf) +{ + struct sensor_device_attribute *attr = to_sensor_dev_attr(da); + const int voltage = ltc4215_get_voltage(dev, attr->index); + + return snprintf(buf, PAGE_SIZE, "%d\n", voltage); +} + +static ssize_t ltc4215_show_current(struct device *dev, + struct device_attribute *da, + char *buf) +{ + const unsigned int curr = ltc4215_get_current(dev); + + return snprintf(buf, PAGE_SIZE, "%u\n", curr); +} + +static ssize_t ltc4215_show_power(struct device *dev, + struct device_attribute *da, + char *buf) +{ + const unsigned int curr = ltc4215_get_current(dev); + const int output_voltage = ltc4215_get_voltage(dev, LTC4215_ADIN); + + /* current in mA * voltage in mV == power in uW */ + const unsigned int power = abs(output_voltage * curr); + + return snprintf(buf, PAGE_SIZE, "%u\n", power); +} + +static ssize_t ltc4215_show_alarm(struct device *dev, + struct device_attribute *da, + char *buf) +{ + struct sensor_device_attribute_2 *attr = to_sensor_dev_attr_2(da); + struct ltc4215_data *data = ltc4215_update_device(dev); + const u8 reg = data->regs[attr->index]; + const u32 mask = attr->nr; + + return snprintf(buf, PAGE_SIZE, "%u\n", (reg & mask) ? 1 : 0); +} + +/* These macros are used below in constructing device attribute objects + * for use with sysfs_create_group() to make a sysfs device file + * for each register. + */ + +#define LTC4215_VOLTAGE(name, ltc4215_cmd_idx) \ + static SENSOR_DEVICE_ATTR(name, S_IRUGO, \ + ltc4215_show_voltage, NULL, ltc4215_cmd_idx) + +#define LTC4215_CURRENT(name) \ + static SENSOR_DEVICE_ATTR(name, S_IRUGO, \ + ltc4215_show_current, NULL, 0); + +#define LTC4215_POWER(name) \ + static SENSOR_DEVICE_ATTR(name, S_IRUGO, \ + ltc4215_show_power, NULL, 0); + +#define LTC4215_ALARM(name, mask, reg) \ + static SENSOR_DEVICE_ATTR_2(name, S_IRUGO, \ + ltc4215_show_alarm, NULL, (mask), reg) + +/* Construct a sensor_device_attribute structure for each register */ + +/* Current */ +LTC4215_CURRENT(curr1_input); +LTC4215_ALARM(curr1_max_alarm, (1 << 2), LTC4215_STATUS); + +/* Power (virtual) */ +LTC4215_POWER(power1_input); +LTC4215_ALARM(power1_alarm, (1 << 3), LTC4215_STATUS); + +/* Input Voltage */ +LTC4215_VOLTAGE(in1_input, LTC4215_ADIN); +LTC4215_ALARM(in1_max_alarm, (1 << 0), LTC4215_STATUS); +LTC4215_ALARM(in1_min_alarm, (1 << 1), LTC4215_STATUS); + +/* Output Voltage */ +LTC4215_VOLTAGE(in2_input, LTC4215_SOURCE); + +/* Finally, construct an array of pointers to members of the above objects, + * as required for sysfs_create_group() + */ +static struct attribute *ltc4215_attributes[] = { + &sensor_dev_attr_curr1_input.dev_attr.attr, + &sensor_dev_attr_curr1_max_alarm.dev_attr.attr, + + &sensor_dev_attr_power1_input.dev_attr.attr, + &sensor_dev_attr_power1_alarm.dev_attr.attr, + + &sensor_dev_attr_in1_input.dev_attr.attr, + &sensor_dev_attr_in1_max_alarm.dev_attr.attr, + &sensor_dev_attr_in1_min_alarm.dev_attr.attr, + + &sensor_dev_attr_in2_input.dev_attr.attr, + + NULL, +}; + +static const struct attribute_group ltc4215_group = { + .attrs = ltc4215_attributes, +}; + +static int ltc4215_probe(struct i2c_client *client, + const struct i2c_device_id *id) +{ + struct ltc4215_data *data; + int ret; + + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (!data) { + ret = -ENOMEM; + goto out_kzalloc; + } + + i2c_set_clientdata(client, data); + mutex_init(&data->update_lock); + + /* Initialize the LTC4215 chip */ + /* TODO */ + + /* Register sysfs hooks */ + ret = sysfs_create_group(&client->dev.kobj, <c4215_group); + if (ret) + goto out_sysfs_create_group; + + data->hwmon_dev = hwmon_device_register(&client->dev); + if (IS_ERR(data->hwmon_dev)) { + ret = PTR_ERR(data->hwmon_dev); + goto out_hwmon_device_register; + } + + return 0; + +out_hwmon_device_register: + sysfs_remove_group(&client->dev.kobj, <c4215_group); +out_sysfs_create_group: + kfree(data); +out_kzalloc: + return ret; +} + +static int ltc4215_remove(struct i2c_client *client) +{ + struct ltc4215_data *data = i2c_get_clientdata(client); + + hwmon_device_unregister(data->hwmon_dev); + sysfs_remove_group(&client->dev.kobj, <c4215_group); + + kfree(data); + + return 0; +} + +static int ltc4215_detect(struct i2c_client *client, + int kind, + struct i2c_board_info *info) +{ + struct i2c_adapter *adapter = client->adapter; + + if (!i2c_check_functionality(adapter, I2C_FUNC_SMBUS_BYTE_DATA)) + return -ENODEV; + + if (kind < 0) { /* probed detection - check the chip type */ + s32 v; /* 8 bits from the chip, or -ERRNO */ + + /* + * Register 0x01 bit b7 is reserved, expect 0 + * Register 0x03 bit b6 and b7 are reserved, expect 0 + */ + v = i2c_smbus_read_byte_data(client, LTC4215_ALERT); + if (v < 0 || (v & (1 << 7)) != 0) + return -ENODEV; + + v = i2c_smbus_read_byte_data(client, LTC4215_FAULT); + if (v < 0 || (v & ((1 << 6) | (1 << 7))) != 0) + return -ENODEV; + } + + strlcpy(info->type, "ltc4215", I2C_NAME_SIZE); + dev_info(&adapter->dev, "ltc4215 %s at address 0x%02x\n", + kind < 0 ? "probed" : "forced", + client->addr); + + return 0; +} + +static const struct i2c_device_id ltc4215_id[] = { + { "ltc4215", ltc4215 }, + { } +}; +MODULE_DEVICE_TABLE(i2c, ltc4215_id); + +/* This is the driver that will be inserted */ +static struct i2c_driver ltc4215_driver = { + .class = I2C_CLASS_HWMON, + .driver = { + .name = "ltc4215", + }, + .probe = ltc4215_probe, + .remove = ltc4215_remove, + .id_table = ltc4215_id, + .detect = ltc4215_detect, + .address_data = &addr_data, +}; + +static int __init ltc4215_init(void) +{ + return i2c_add_driver(<c4215_driver); +} + +static void __exit ltc4215_exit(void) +{ + i2c_del_driver(<c4215_driver); +} + +MODULE_AUTHOR("Ira W. Snyder "); +MODULE_DESCRIPTION("LTC4215 driver"); +MODULE_LICENSE("GPL"); + +module_init(ltc4215_init); +module_exit(ltc4215_exit); From 9d7639d33a348d5637daa50bba27497e9f5dac64 Mon Sep 17 00:00:00 2001 From: Pavel Machek Date: Tue, 31 Mar 2009 15:24:29 -0700 Subject: [PATCH 382/478] hp_accel: add two more axis information Add two more laptops to whitelist. Signed-off-by: Michal Marek Signed-off-by: Pavel Machek Cc: Daniel Mack Cc: Eric Piel Cc: Vladimir Botka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/hwmon/hp_accel.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/hwmon/hp_accel.c b/drivers/hwmon/hp_accel.c index ee1ed4270df7..448a462ad789 100644 --- a/drivers/hwmon/hp_accel.c +++ b/drivers/hwmon/hp_accel.c @@ -187,6 +187,7 @@ static struct dmi_system_id lis3lv02d_dmi_ids[] = { AXIS_DMI_MATCH("NC2510", "HP Compaq 2510", y_inverted), AXIS_DMI_MATCH("NC8510", "HP Compaq 8510", xy_swap_inverted), AXIS_DMI_MATCH("HP2133", "HP 2133", xy_rotated_left), + AXIS_DMI_MATCH("HP2140", "HP 2140", xy_swap_inverted), AXIS_DMI_MATCH("NC653x", "HP Compaq 653", xy_rotated_left_usd), AXIS_DMI_MATCH("NC673x", "HP Compaq 673", xy_rotated_left_usd), AXIS_DMI_MATCH("NC651xx", "HP Compaq 651", xy_rotated_right), @@ -201,6 +202,7 @@ static struct dmi_system_id lis3lv02d_dmi_ids[] = { PRODUCT_NAME, "HP Pavilion dv5", BOARD_NAME, "3600", y_inverted), + AXIS_DMI_MATCH("DV7", "HP Pavilion dv7", x_inverted), { NULL, } /* Laptop models without axis info (yet): * "NC6910" "HP Compaq 6910" From 12a324b6a3758f04a952b99d75a625732d767585 Mon Sep 17 00:00:00 2001 From: Luca Cappa Date: Tue, 31 Mar 2009 15:24:31 -0700 Subject: [PATCH 383/478] hp_accel: axis conversion for hp compaq 8710w I have a laptop HP Compaq 8710W, I compiled into my kernel the LIS3LV02DL and HP_ACCEL module drivers. While loading it cannot recognize the laptop model, so i am sending the necessary information to update the database of axis orientations. >When the laptop is horizontal the position reported is about 0 for X and Y >and a positive value for Z Yes, it is about 0,0,1000, the actual reading says: (-17,-26,1018); > If the left side is elevated, X increases (becomes positive) Yes, X goes toward to positive 1000. >If the front side (where the touchpad is) is elevated, Y decreases (becomes negative) No, Y goes toward to positive 1000. >If the laptop is put upside-down, Z becomes negative Yes, the laptop on a table Z gives 1000, and if upsidedown the Z reads -1000. So in few words the Y axis is inverted. Cc: Eric Piel Signed-off-by: Pavel Machek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/hwmon/hp_accel.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/hwmon/hp_accel.c b/drivers/hwmon/hp_accel.c index 448a462ad789..de26b1c3a96c 100644 --- a/drivers/hwmon/hp_accel.c +++ b/drivers/hwmon/hp_accel.c @@ -203,6 +203,7 @@ static struct dmi_system_id lis3lv02d_dmi_ids[] = { BOARD_NAME, "3600", y_inverted), AXIS_DMI_MATCH("DV7", "HP Pavilion dv7", x_inverted), + AXIS_DMI_MATCH("HP8710", "HP Compaq 8710", y_inverted), { NULL, } /* Laptop models without axis info (yet): * "NC6910" "HP Compaq 6910" From ab337a632783c251a3c3852aec0ead8a0281cbdd Mon Sep 17 00:00:00 2001 From: Daniel Mack Date: Tue, 31 Mar 2009 15:24:31 -0700 Subject: [PATCH 384/478] lis3: reorder functions to make forward decl obsolete Move lis3lv02d_init_device() down so that the forward declaration of lis3lv02d_add_fs() becomes unnecessary. Signed-off-by: Daniel Mack Acked-by: Pavel Machek Acked-by: Eric Piel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/hwmon/lis3lv02d.c | 64 +++++++++++++++++++-------------------- 1 file changed, 31 insertions(+), 33 deletions(-) diff --git a/drivers/hwmon/lis3lv02d.c b/drivers/hwmon/lis3lv02d.c index 4888ac5ba8cc..eeae7c9600e4 100644 --- a/drivers/hwmon/lis3lv02d.c +++ b/drivers/hwmon/lis3lv02d.c @@ -59,8 +59,6 @@ struct acpi_lis3lv02d lis3_dev = { EXPORT_SYMBOL_GPL(lis3_dev); -static int lis3lv02d_add_fs(struct acpi_device *device); - static s16 lis3lv02d_read_16(acpi_handle handle, int reg) { u8 lo, hi; @@ -377,37 +375,6 @@ void lis3lv02d_joystick_disable(void) } EXPORT_SYMBOL_GPL(lis3lv02d_joystick_disable); -/* - * Initialise the accelerometer and the various subsystems. - * Should be rather independant of the bus system. - */ -int lis3lv02d_init_device(struct acpi_lis3lv02d *dev) -{ - mutex_init(&dev->lock); - lis3lv02d_add_fs(dev->device); - lis3lv02d_increase_use(dev); - - if (lis3lv02d_joystick_enable()) - printk(KERN_ERR DRIVER_NAME ": joystick initialization failed\n"); - - printk("lis3_init_device: irq %d\n", dev->irq); - - /* if we did not get an IRQ from ACPI - we have nothing more to do */ - if (!dev->irq) { - printk(KERN_ERR DRIVER_NAME - ": No IRQ in ACPI. Disabling /dev/freefall\n"); - goto out; - } - - printk("lis3: registering device\n"); - if (misc_register(&lis3lv02d_misc_device)) - printk(KERN_ERR DRIVER_NAME ": misc_register failed\n"); -out: - lis3lv02d_decrease_use(dev); - return 0; -} -EXPORT_SYMBOL_GPL(lis3lv02d_init_device); - /* Sysfs stuff */ static ssize_t lis3lv02d_position_show(struct device *dev, struct device_attribute *attr, char *buf) @@ -485,6 +452,37 @@ int lis3lv02d_remove_fs(void) } EXPORT_SYMBOL_GPL(lis3lv02d_remove_fs); +/* + * Initialise the accelerometer and the various subsystems. + * Should be rather independant of the bus system. + */ +int lis3lv02d_init_device(struct acpi_lis3lv02d *dev) +{ + mutex_init(&dev->lock); + lis3lv02d_add_fs(dev->device); + lis3lv02d_increase_use(dev); + + if (lis3lv02d_joystick_enable()) + printk(KERN_ERR DRIVER_NAME ": joystick initialization failed\n"); + + printk("lis3_init_device: irq %d\n", dev->irq); + + /* if we did not get an IRQ from ACPI - we have nothing more to do */ + if (!dev->irq) { + printk(KERN_ERR DRIVER_NAME + ": No IRQ in ACPI. Disabling /dev/freefall\n"); + goto out; + } + + printk("lis3: registering device\n"); + if (misc_register(&lis3lv02d_misc_device)) + printk(KERN_ERR DRIVER_NAME ": misc_register failed\n"); +out: + lis3lv02d_decrease_use(dev); + return 0; +} +EXPORT_SYMBOL_GPL(lis3lv02d_init_device); + MODULE_DESCRIPTION("ST LIS3LV02Dx three-axis digital accelerometer driver"); MODULE_AUTHOR("Yan Burman, Eric Piel, Pavel Machek"); MODULE_LICENSE("GPL"); From a38da2ed74f628c1f3e907c772be21a66eccab9c Mon Sep 17 00:00:00 2001 From: Daniel Mack Date: Tue, 31 Mar 2009 15:24:32 -0700 Subject: [PATCH 385/478] lis3: solve dependency between core and ACPI This solves the dependency between lis3lv02d.[ch] and ACPI specific methods. It introduces a ->bus_priv pointer to the device struct which is casted to 'struct acpi_device' in the ACIP layer. Changed hp_accel.c accordingly. Signed-off-by: Daniel Mack Acked-by: Pavel Machek Acked-by: Eric Piel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/hwmon/hp_accel.c | 99 +++++++++++++++------------------------ drivers/hwmon/lis3lv02d.c | 86 ++++++++++++++++++++++------------ drivers/hwmon/lis3lv02d.h | 20 ++++---- 3 files changed, 105 insertions(+), 100 deletions(-) diff --git a/drivers/hwmon/hp_accel.c b/drivers/hwmon/hp_accel.c index de26b1c3a96c..55d3dc565be6 100644 --- a/drivers/hwmon/hp_accel.c +++ b/drivers/hwmon/hp_accel.c @@ -85,25 +85,31 @@ MODULE_DEVICE_TABLE(acpi, lis3lv02d_device_ids); /** * lis3lv02d_acpi_init - ACPI _INI method: initialize the device. - * @handle: the handle of the device + * @lis3: pointer to the device struct * - * Returns AE_OK on success. + * Returns 0 on success. */ -acpi_status lis3lv02d_acpi_init(acpi_handle handle) +int lis3lv02d_acpi_init(struct lis3lv02d *lis3) { - return acpi_evaluate_object(handle, METHOD_NAME__INI, NULL, NULL); + struct acpi_device *dev = lis3->bus_priv; + if (acpi_evaluate_object(dev->handle, METHOD_NAME__INI, + NULL, NULL) != AE_OK) + return -EINVAL; + + return 0; } /** * lis3lv02d_acpi_read - ACPI ALRD method: read a register - * @handle: the handle of the device + * @lis3: pointer to the device struct * @reg: the register to read * @ret: result of the operation * - * Returns AE_OK on success. + * Returns 0 on success. */ -acpi_status lis3lv02d_acpi_read(acpi_handle handle, int reg, u8 *ret) +int lis3lv02d_acpi_read(struct lis3lv02d *lis3, int reg, u8 *ret) { + struct acpi_device *dev = lis3->bus_priv; union acpi_object arg0 = { ACPI_TYPE_INTEGER }; struct acpi_object_list args = { 1, &arg0 }; unsigned long long lret; @@ -111,21 +117,22 @@ acpi_status lis3lv02d_acpi_read(acpi_handle handle, int reg, u8 *ret) arg0.integer.value = reg; - status = acpi_evaluate_integer(handle, "ALRD", &args, &lret); + status = acpi_evaluate_integer(dev->handle, "ALRD", &args, &lret); *ret = lret; - return status; + return (status != AE_OK) ? -EINVAL : 0; } /** * lis3lv02d_acpi_write - ACPI ALWR method: write to a register - * @handle: the handle of the device + * @lis3: pointer to the device struct * @reg: the register to write to * @val: the value to write * - * Returns AE_OK on success. + * Returns 0 on success. */ -acpi_status lis3lv02d_acpi_write(acpi_handle handle, int reg, u8 val) +int lis3lv02d_acpi_write(struct lis3lv02d *lis3, int reg, u8 val) { + struct acpi_device *dev = lis3->bus_priv; unsigned long long ret; /* Not used when writting */ union acpi_object in_obj[2]; struct acpi_object_list args = { 2, in_obj }; @@ -135,7 +142,10 @@ acpi_status lis3lv02d_acpi_write(acpi_handle handle, int reg, u8 val) in_obj[1].type = ACPI_TYPE_INTEGER; in_obj[1].integer.value = val; - return acpi_evaluate_integer(handle, "ALWR", &args, &ret); + if (acpi_evaluate_integer(dev->handle, "ALWR", &args, &ret) != AE_OK) + return -EINVAL; + + return 0; } static int lis3lv02d_dmi_matched(const struct dmi_system_id *dmi) @@ -217,7 +227,7 @@ static struct dmi_system_id lis3lv02d_dmi_ids[] = { static void hpled_set(struct delayed_led_classdev *led_cdev, enum led_brightness value) { - acpi_handle handle = lis3_dev.device->handle; + struct acpi_device *dev = lis3_dev.bus_priv; unsigned long long ret; /* Not used when writing */ union acpi_object in_obj[1]; struct acpi_object_list args = { 1, in_obj }; @@ -225,7 +235,7 @@ static void hpled_set(struct delayed_led_classdev *led_cdev, enum led_brightness in_obj[0].type = ACPI_TYPE_INTEGER; in_obj[0].integer.value = !!value; - acpi_evaluate_integer(handle, "ALED", &args, &ret); + acpi_evaluate_integer(dev->handle, "ALED", &args, &ret); } static struct delayed_led_classdev hpled_led = { @@ -262,23 +272,6 @@ static void lis3lv02d_enum_resources(struct acpi_device *device) printk(KERN_DEBUG DRIVER_NAME ": Error getting resources\n"); } -static s16 lis3lv02d_read_16(acpi_handle handle, int reg) -{ - u8 lo, hi; - - lis3_dev.read(handle, reg - 1, &lo); - lis3_dev.read(handle, reg, &hi); - /* In "12 bit right justified" mode, bit 6, bit 7, bit 8 = bit 5 */ - return (s16)((hi << 8) | lo); -} - -static s16 lis3lv02d_read_8(acpi_handle handle, int reg) -{ - s8 lo; - lis3_dev.read(handle, reg, &lo); - return lo; -} - static int lis3lv02d_add(struct acpi_device *device) { int ret; @@ -286,7 +279,7 @@ static int lis3lv02d_add(struct acpi_device *device) if (!device) return -EINVAL; - lis3_dev.device = device; + lis3_dev.bus_priv = device; lis3_dev.init = lis3lv02d_acpi_init; lis3_dev.read = lis3lv02d_acpi_read; lis3_dev.write = lis3lv02d_acpi_write; @@ -294,23 +287,8 @@ static int lis3lv02d_add(struct acpi_device *device) strcpy(acpi_device_class(device), ACPI_MDPS_CLASS); device->driver_data = &lis3_dev; - lis3lv02d_acpi_read(device->handle, WHO_AM_I, &lis3_dev.whoami); - switch (lis3_dev.whoami) { - case LIS_DOUBLE_ID: - printk(KERN_INFO DRIVER_NAME ": 2-byte sensor found\n"); - lis3_dev.read_data = lis3lv02d_read_16; - lis3_dev.mdps_max_val = 2048; - break; - case LIS_SINGLE_ID: - printk(KERN_INFO DRIVER_NAME ": 1-byte sensor found\n"); - lis3_dev.read_data = lis3lv02d_read_8; - lis3_dev.mdps_max_val = 128; - break; - default: - printk(KERN_ERR DRIVER_NAME - ": unknown sensor type 0x%X\n", lis3_dev.whoami); - return -EINVAL; - } + /* obtain IRQ number of our device from ACPI */ + lis3lv02d_enum_resources(device); /* If possible use a "standard" axes order */ if (dmi_check_system(lis3lv02d_dmi_ids) == 0) { @@ -319,18 +297,17 @@ static int lis3lv02d_add(struct acpi_device *device) lis3_dev.ac = lis3lv02d_axis_normal; } - INIT_WORK(&hpled_led.work, delayed_set_status_worker); - ret = led_classdev_register(NULL, &hpled_led.led_classdev); + /* call the core layer do its init */ + ret = lis3lv02d_init_device(&lis3_dev); if (ret) return ret; - /* obtain IRQ number of our device from ACPI */ - lis3lv02d_enum_resources(lis3_dev.device); - - ret = lis3lv02d_init_device(&lis3_dev); + INIT_WORK(&hpled_led.work, delayed_set_status_worker); + ret = led_classdev_register(NULL, &hpled_led.led_classdev); if (ret) { + lis3lv02d_joystick_disable(); + lis3lv02d_poweroff(&lis3_dev); flush_work(&hpled_led.work); - led_classdev_unregister(&hpled_led.led_classdev); return ret; } @@ -343,7 +320,7 @@ static int lis3lv02d_remove(struct acpi_device *device, int type) return -EINVAL; lis3lv02d_joystick_disable(); - lis3lv02d_poweroff(device->handle); + lis3lv02d_poweroff(&lis3_dev); flush_work(&hpled_led.work); led_classdev_unregister(&hpled_led.led_classdev); @@ -356,7 +333,7 @@ static int lis3lv02d_remove(struct acpi_device *device, int type) static int lis3lv02d_suspend(struct acpi_device *device, pm_message_t state) { /* make sure the device is off when we suspend */ - lis3lv02d_poweroff(device->handle); + lis3lv02d_poweroff(&lis3_dev); return 0; } @@ -365,9 +342,9 @@ static int lis3lv02d_resume(struct acpi_device *device) /* put back the device in the right state (ACPI might turn it on) */ mutex_lock(&lis3_dev.lock); if (lis3_dev.usage > 0) - lis3lv02d_poweron(device->handle); + lis3lv02d_poweron(&lis3_dev); else - lis3lv02d_poweroff(device->handle); + lis3lv02d_poweroff(&lis3_dev); mutex_unlock(&lis3_dev.lock); return 0; } diff --git a/drivers/hwmon/lis3lv02d.c b/drivers/hwmon/lis3lv02d.c index eeae7c9600e4..778eb7795983 100644 --- a/drivers/hwmon/lis3lv02d.c +++ b/drivers/hwmon/lis3lv02d.c @@ -36,7 +36,6 @@ #include #include #include -#include #include #include "lis3lv02d.h" @@ -53,18 +52,27 @@ * joystick. */ -struct acpi_lis3lv02d lis3_dev = { +struct lis3lv02d lis3_dev = { .misc_wait = __WAIT_QUEUE_HEAD_INITIALIZER(lis3_dev.misc_wait), }; EXPORT_SYMBOL_GPL(lis3_dev); -static s16 lis3lv02d_read_16(acpi_handle handle, int reg) +static s16 lis3lv02d_read_8(struct lis3lv02d *lis3, int reg) +{ + s8 lo; + if (lis3->read(lis3, reg, &lo) < 0) + return 0; + + return lo; +} + +static s16 lis3lv02d_read_16(struct lis3lv02d *lis3, int reg) { u8 lo, hi; - lis3_dev.read(handle, reg, &lo); - lis3_dev.read(handle, reg + 1, &hi); + lis3->read(lis3, reg - 1, &lo); + lis3->read(lis3, reg, &hi); /* In "12 bit right justified" mode, bit 6, bit 7, bit 8 = bit 5 */ return (s16)((hi << 8) | lo); } @@ -86,36 +94,36 @@ static inline int lis3lv02d_get_axis(s8 axis, int hw_values[3]) /** * lis3lv02d_get_xyz - Get X, Y and Z axis values from the accelerometer - * @handle: the handle to the device - * @x: where to store the X axis value - * @y: where to store the Y axis value - * @z: where to store the Z axis value + * @lis3: pointer to the device struct + * @x: where to store the X axis value + * @y: where to store the Y axis value + * @z: where to store the Z axis value * * Note that 40Hz input device can eat up about 10% CPU at 800MHZ */ -static void lis3lv02d_get_xyz(acpi_handle handle, int *x, int *y, int *z) +static void lis3lv02d_get_xyz(struct lis3lv02d *lis3, int *x, int *y, int *z) { int position[3]; - position[0] = lis3_dev.read_data(handle, OUTX); - position[1] = lis3_dev.read_data(handle, OUTY); - position[2] = lis3_dev.read_data(handle, OUTZ); + position[0] = lis3_dev.read_data(lis3, OUTX); + position[1] = lis3_dev.read_data(lis3, OUTY); + position[2] = lis3_dev.read_data(lis3, OUTZ); *x = lis3lv02d_get_axis(lis3_dev.ac.x, position); *y = lis3lv02d_get_axis(lis3_dev.ac.y, position); *z = lis3lv02d_get_axis(lis3_dev.ac.z, position); } -void lis3lv02d_poweroff(acpi_handle handle) +void lis3lv02d_poweroff(struct lis3lv02d *lis3) { lis3_dev.is_on = 0; } EXPORT_SYMBOL_GPL(lis3lv02d_poweroff); -void lis3lv02d_poweron(acpi_handle handle) +void lis3lv02d_poweron(struct lis3lv02d *lis3) { lis3_dev.is_on = 1; - lis3_dev.init(handle); + lis3_dev.init(lis3); } EXPORT_SYMBOL_GPL(lis3lv02d_poweron); @@ -124,13 +132,13 @@ EXPORT_SYMBOL_GPL(lis3lv02d_poweron); * device will always be on until a call to lis3lv02d_decrease_use(). Not to be * used from interrupt context. */ -static void lis3lv02d_increase_use(struct acpi_lis3lv02d *dev) +static void lis3lv02d_increase_use(struct lis3lv02d *dev) { mutex_lock(&dev->lock); dev->usage++; if (dev->usage == 1) { if (!dev->is_on) - lis3lv02d_poweron(dev->device->handle); + lis3lv02d_poweron(dev); } mutex_unlock(&dev->lock); } @@ -139,12 +147,12 @@ static void lis3lv02d_increase_use(struct acpi_lis3lv02d *dev) * To be called whenever a usage of the device is stopped. * It will make sure to turn off the device when there is not usage. */ -static void lis3lv02d_decrease_use(struct acpi_lis3lv02d *dev) +static void lis3lv02d_decrease_use(struct lis3lv02d *dev) { mutex_lock(&dev->lock); dev->usage--; if (dev->usage == 0) - lis3lv02d_poweroff(dev->device->handle); + lis3lv02d_poweroff(dev); mutex_unlock(&dev->lock); } @@ -291,7 +299,7 @@ static int lis3lv02d_joystick_kthread(void *data) int x, y, z; while (!kthread_should_stop()) { - lis3lv02d_get_xyz(lis3_dev.device->handle, &x, &y, &z); + lis3lv02d_get_xyz(&lis3_dev, &x, &y, &z); input_report_abs(lis3_dev.idev, ABS_X, x - lis3_dev.xcalib); input_report_abs(lis3_dev.idev, ABS_Y, y - lis3_dev.ycalib); input_report_abs(lis3_dev.idev, ABS_Z, z - lis3_dev.zcalib); @@ -325,7 +333,8 @@ static void lis3lv02d_joystick_close(struct input_dev *input) static inline void lis3lv02d_calibrate_joystick(void) { - lis3lv02d_get_xyz(lis3_dev.device->handle, &lis3_dev.xcalib, &lis3_dev.ycalib, &lis3_dev.zcalib); + lis3lv02d_get_xyz(&lis3_dev, + &lis3_dev.xcalib, &lis3_dev.ycalib, &lis3_dev.zcalib); } int lis3lv02d_joystick_enable(void) @@ -382,7 +391,7 @@ static ssize_t lis3lv02d_position_show(struct device *dev, int x, y, z; lis3lv02d_increase_use(&lis3_dev); - lis3lv02d_get_xyz(lis3_dev.device->handle, &x, &y, &z); + lis3lv02d_get_xyz(&lis3_dev, &x, &y, &z); lis3lv02d_decrease_use(&lis3_dev); return sprintf(buf, "(%d,%d,%d)\n", x, y, z); } @@ -412,7 +421,7 @@ static ssize_t lis3lv02d_rate_show(struct device *dev, int val; lis3lv02d_increase_use(&lis3_dev); - lis3_dev.read(lis3_dev.device->handle, CTRL_REG1, &ctrl); + lis3_dev.read(&lis3_dev, CTRL_REG1, &ctrl); lis3lv02d_decrease_use(&lis3_dev); val = (ctrl & (CTRL1_DF0 | CTRL1_DF1)) >> 4; return sprintf(buf, "%d\n", lis3lv02dl_df_val[val]); @@ -435,7 +444,7 @@ static struct attribute_group lis3lv02d_attribute_group = { }; -static int lis3lv02d_add_fs(struct acpi_device *device) +static int lis3lv02d_add_fs(struct lis3lv02d *lis3) { lis3_dev.pdev = platform_device_register_simple(DRIVER_NAME, -1, NULL, 0); if (IS_ERR(lis3_dev.pdev)) @@ -456,10 +465,29 @@ EXPORT_SYMBOL_GPL(lis3lv02d_remove_fs); * Initialise the accelerometer and the various subsystems. * Should be rather independant of the bus system. */ -int lis3lv02d_init_device(struct acpi_lis3lv02d *dev) +int lis3lv02d_init_device(struct lis3lv02d *dev) { + dev->whoami = lis3lv02d_read_8(dev, WHO_AM_I); + + switch (dev->whoami) { + case LIS_DOUBLE_ID: + printk(KERN_INFO DRIVER_NAME ": 2-byte sensor found\n"); + dev->read_data = lis3lv02d_read_16; + dev->mdps_max_val = 2048; + break; + case LIS_SINGLE_ID: + printk(KERN_INFO DRIVER_NAME ": 1-byte sensor found\n"); + dev->read_data = lis3lv02d_read_8; + dev->mdps_max_val = 128; + break; + default: + printk(KERN_ERR DRIVER_NAME + ": unknown sensor type 0x%X\n", lis3_dev.whoami); + return -EINVAL; + } + mutex_init(&dev->lock); - lis3lv02d_add_fs(dev->device); + lis3lv02d_add_fs(dev); lis3lv02d_increase_use(dev); if (lis3lv02d_joystick_enable()) @@ -467,10 +495,10 @@ int lis3lv02d_init_device(struct acpi_lis3lv02d *dev) printk("lis3_init_device: irq %d\n", dev->irq); - /* if we did not get an IRQ from ACPI - we have nothing more to do */ + /* bail if we did not get an IRQ from the bus layer */ if (!dev->irq) { printk(KERN_ERR DRIVER_NAME - ": No IRQ in ACPI. Disabling /dev/freefall\n"); + ": No IRQ. Disabling /dev/freefall\n"); goto out; } diff --git a/drivers/hwmon/lis3lv02d.h b/drivers/hwmon/lis3lv02d.h index 017fb2b754d5..745ec96806d4 100644 --- a/drivers/hwmon/lis3lv02d.h +++ b/drivers/hwmon/lis3lv02d.h @@ -159,14 +159,14 @@ struct axis_conversion { s8 z; }; -struct acpi_lis3lv02d { - struct acpi_device *device; /* The ACPI device */ - acpi_status (*init) (acpi_handle handle); - acpi_status (*write) (acpi_handle handle, int reg, u8 val); - acpi_status (*read) (acpi_handle handle, int reg, u8 *ret); +struct lis3lv02d { + void *bus_priv; /* used by the bus layer only */ + int (*init) (struct lis3lv02d *lis3); + int (*write) (struct lis3lv02d *lis3, int reg, u8 val); + int (*read) (struct lis3lv02d *lis3, int reg, u8 *ret); u8 whoami; /* 3Ah: 2-byte registries, 3Bh: 1-byte registries */ - s16 (*read_data) (acpi_handle handle, int reg); + s16 (*read_data) (struct lis3lv02d *lis3, int reg); int mdps_max_val; struct input_dev *idev; /* input device */ @@ -187,11 +187,11 @@ struct acpi_lis3lv02d { unsigned long misc_opened; /* bit0: whether the device is open */ }; -int lis3lv02d_init_device(struct acpi_lis3lv02d *dev); +int lis3lv02d_init_device(struct lis3lv02d *lis3); int lis3lv02d_joystick_enable(void); void lis3lv02d_joystick_disable(void); -void lis3lv02d_poweroff(acpi_handle handle); -void lis3lv02d_poweron(acpi_handle handle); +void lis3lv02d_poweroff(struct lis3lv02d *lis3); +void lis3lv02d_poweron(struct lis3lv02d *lis3); int lis3lv02d_remove_fs(void); -extern struct acpi_lis3lv02d lis3_dev; +extern struct lis3lv02d lis3_dev; From bb233fdfc7b7cefe45bfa2e8d1b24e79c60a48e5 Mon Sep 17 00:00:00 2001 From: Daniel Mack Date: Tue, 31 Mar 2009 15:24:33 -0700 Subject: [PATCH 386/478] lis3: SPI transport layer Make use of the new abstraction layer and add a new transport layer for spi. Works fine on a PXA based board. Signed-off-by: Daniel Mack Acked-by: Pavel Machek Acked-by: Eric Piel Cc: David Brownell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/hwmon/Kconfig | 16 +++++ drivers/hwmon/Makefile | 1 + drivers/hwmon/lis3lv02d_spi.c | 114 ++++++++++++++++++++++++++++++++++ 3 files changed, 131 insertions(+) create mode 100644 drivers/hwmon/lis3lv02d_spi.c diff --git a/drivers/hwmon/Kconfig b/drivers/hwmon/Kconfig index bc6810c22dd7..ce52bf2f235e 100644 --- a/drivers/hwmon/Kconfig +++ b/drivers/hwmon/Kconfig @@ -932,6 +932,22 @@ config SENSORS_LIS3LV02D Say Y here if you have an applicable laptop and want to experience the awesome power of lis3lv02d. +config SENSORS_LIS3_SPI + tristate "STMicroeletronics LIS3LV02Dx three-axis digital accelerometer (SPI)" + depends on !ACPI && SPI_MASTER && INPUT + default n + help + This driver provides support for the LIS3LV02Dx accelerometer connected + via SPI. The accelerometer data is readable via + /sys/devices/platform/lis3lv02d. + + This driver also provides an absolute input class device, allowing + the laptop to act as a pinball machine-esque joystick. + + This driver can also be built as modules. If so, the core module + will be called lis3lv02d and a specific module for the SPI transport + is called lis3lv02d_spi. + config SENSORS_APPLESMC tristate "Apple SMC (Motion sensor, light sensor, keyboard backlight)" depends on INPUT && X86 diff --git a/drivers/hwmon/Makefile b/drivers/hwmon/Makefile index 4da261d16abf..3a6b1f06f8f4 100644 --- a/drivers/hwmon/Makefile +++ b/drivers/hwmon/Makefile @@ -52,6 +52,7 @@ obj-$(CONFIG_SENSORS_IBMPEX) += ibmpex.o obj-$(CONFIG_SENSORS_IT87) += it87.o obj-$(CONFIG_SENSORS_K8TEMP) += k8temp.o obj-$(CONFIG_SENSORS_LIS3LV02D) += lis3lv02d.o hp_accel.o +obj-$(CONFIG_SENSORS_LIS3_SPI) += lis3lv02d.o lis3lv02d_spi.o obj-$(CONFIG_SENSORS_LM63) += lm63.o obj-$(CONFIG_SENSORS_LM70) += lm70.o obj-$(CONFIG_SENSORS_LM75) += lm75.o diff --git a/drivers/hwmon/lis3lv02d_spi.c b/drivers/hwmon/lis3lv02d_spi.c new file mode 100644 index 000000000000..07ae74b0e191 --- /dev/null +++ b/drivers/hwmon/lis3lv02d_spi.c @@ -0,0 +1,114 @@ +/* + * lis3lv02d_spi - SPI glue layer for lis3lv02d + * + * Copyright (c) 2009 Daniel Mack + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * publishhed by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "lis3lv02d.h" + +#define DRV_NAME "lis3lv02d_spi" +#define LIS3_SPI_READ 0x80 + +static int lis3_spi_read(struct lis3lv02d *lis3, int reg, u8 *v) +{ + struct spi_device *spi = lis3->bus_priv; + int ret = spi_w8r8(spi, reg | LIS3_SPI_READ); + if (ret < 0) + return -EINVAL; + + *v = (u8) ret; + return 0; +} + +static int lis3_spi_write(struct lis3lv02d *lis3, int reg, u8 val) +{ + u8 tmp[2] = { reg, val }; + struct spi_device *spi = lis3->bus_priv; + return spi_write(spi, tmp, sizeof(tmp)); +} + +static int lis3_spi_init(struct lis3lv02d *lis3) +{ + u8 reg; + int ret; + + /* power up the device */ + ret = lis3->read(lis3, CTRL_REG1, ®); + if (ret < 0) + return ret; + + reg |= CTRL1_PD0; + return lis3->write(lis3, CTRL_REG1, reg); +} + +static struct axis_conversion lis3lv02d_axis_normal = { 1, 2, 3 }; + +static int __devinit lis302dl_spi_probe(struct spi_device *spi) +{ + int ret; + + spi->bits_per_word = 8; + spi->mode = SPI_MODE_0; + ret = spi_setup(spi); + if (ret < 0) + return ret; + + lis3_dev.bus_priv = spi; + lis3_dev.init = lis3_spi_init; + lis3_dev.read = lis3_spi_read; + lis3_dev.write = lis3_spi_write; + lis3_dev.irq = spi->irq; + lis3_dev.ac = lis3lv02d_axis_normal; + spi_set_drvdata(spi, &lis3_dev); + + ret = lis3lv02d_init_device(&lis3_dev); + return ret; +} + +static int __devexit lis302dl_spi_remove(struct spi_device *spi) +{ + struct lis3lv02d *lis3 = spi_get_drvdata(spi); + lis3lv02d_joystick_disable(); + lis3lv02d_poweroff(lis3); + return 0; +} + +static struct spi_driver lis302dl_spi_driver = { + .driver = { + .name = DRV_NAME, + .owner = THIS_MODULE, + }, + .probe = lis302dl_spi_probe, + .remove = __devexit_p(lis302dl_spi_remove), +}; + +static int __init lis302dl_init(void) +{ + return spi_register_driver(&lis302dl_spi_driver); +} + +static void __exit lis302dl_exit(void) +{ + spi_unregister_driver(&lis302dl_spi_driver); +} + +module_init(lis302dl_init); +module_exit(lis302dl_exit); + +MODULE_AUTHOR("Daniel Mack "); +MODULE_DESCRIPTION("lis3lv02d SPI glue layer"); +MODULE_LICENSE("GPL"); + From c3b1b1cbf002e65a3cabd479e68b5f35886a26db Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Tue, 31 Mar 2009 15:24:34 -0700 Subject: [PATCH 387/478] ramfs: add support for "mode=" mount option Addresses http://bugzilla.kernel.org/show_bug.cgi?id=12843 "I use ramfs instead of tmpfs for /tmp because I don't use swap on my laptop. Some apps need 1777 mode for /tmp directory, but ramfs does not support 'mode=' mount option." Reported-by: Avan Anishchuk Signed-off-by: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ramfs/inode.c | 94 +++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 86 insertions(+), 8 deletions(-) diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c index b7e6ac706b87..a404fb88e456 100644 --- a/fs/ramfs/inode.c +++ b/fs/ramfs/inode.c @@ -33,12 +33,15 @@ #include #include #include +#include #include #include "internal.h" /* some random number */ #define RAMFS_MAGIC 0x858458f6 +#define RAMFS_DEFAULT_MODE 0755 + static const struct super_operations ramfs_ops; static const struct inode_operations ramfs_dir_inode_operations; @@ -158,12 +161,75 @@ static const struct inode_operations ramfs_dir_inode_operations = { static const struct super_operations ramfs_ops = { .statfs = simple_statfs, .drop_inode = generic_delete_inode, + .show_options = generic_show_options, }; +struct ramfs_mount_opts { + umode_t mode; +}; + +enum { + Opt_mode, + Opt_err +}; + +static const match_table_t tokens = { + {Opt_mode, "mode=%o"}, + {Opt_err, NULL} +}; + +struct ramfs_fs_info { + struct ramfs_mount_opts mount_opts; +}; + +static int ramfs_parse_options(char *data, struct ramfs_mount_opts *opts) +{ + substring_t args[MAX_OPT_ARGS]; + int option; + int token; + char *p; + + opts->mode = RAMFS_DEFAULT_MODE; + + while ((p = strsep(&data, ",")) != NULL) { + if (!*p) + continue; + + token = match_token(p, tokens, args); + switch (token) { + case Opt_mode: + if (match_octal(&args[0], &option)) + return -EINVAL; + opts->mode = option & S_IALLUGO; + break; + default: + printk(KERN_ERR "ramfs: bad mount option: %s\n", p); + return -EINVAL; + } + } + + return 0; +} + static int ramfs_fill_super(struct super_block * sb, void * data, int silent) { - struct inode * inode; - struct dentry * root; + struct ramfs_fs_info *fsi; + struct inode *inode = NULL; + struct dentry *root; + int err; + + save_mount_options(sb, data); + + fsi = kzalloc(sizeof(struct ramfs_fs_info), GFP_KERNEL); + if (!fsi) { + err = -ENOMEM; + goto fail; + } + sb->s_fs_info = fsi; + + err = ramfs_parse_options(data, &fsi->mount_opts); + if (err) + goto fail; sb->s_maxbytes = MAX_LFS_FILESIZE; sb->s_blocksize = PAGE_CACHE_SIZE; @@ -171,17 +237,23 @@ static int ramfs_fill_super(struct super_block * sb, void * data, int silent) sb->s_magic = RAMFS_MAGIC; sb->s_op = &ramfs_ops; sb->s_time_gran = 1; - inode = ramfs_get_inode(sb, S_IFDIR | 0755, 0); - if (!inode) - return -ENOMEM; + inode = ramfs_get_inode(sb, S_IFDIR | fsi->mount_opts.mode, 0); + if (!inode) { + err = -ENOMEM; + goto fail; + } root = d_alloc_root(inode); if (!root) { - iput(inode); - return -ENOMEM; + err = -ENOMEM; + goto fail; } sb->s_root = root; return 0; +fail: + kfree(fsi); + iput(inode); + return err; } int ramfs_get_sb(struct file_system_type *fs_type, @@ -197,10 +269,16 @@ static int rootfs_get_sb(struct file_system_type *fs_type, mnt); } +static void ramfs_kill_sb(struct super_block *sb) +{ + kfree(sb->s_fs_info); + kill_litter_super(sb); +} + static struct file_system_type ramfs_fs_type = { .name = "ramfs", .get_sb = ramfs_get_sb, - .kill_sb = kill_litter_super, + .kill_sb = ramfs_kill_sb, }; static struct file_system_type rootfs_fs_type = { .name = "rootfs", From 34c8a20c6ee6af25ee35da9ca15ba81faacfc73d Mon Sep 17 00:00:00 2001 From: Anton Vorontsov Date: Tue, 31 Mar 2009 15:24:35 -0700 Subject: [PATCH 388/478] spi_mpc83xx: fix sparse warnings The patch fixes following sparse warnings: CHECK spi_mpc83xx.c spi_mpc83xx.c:145:1: warning: symbol 'mpc83xx_spi_rx_buf_u8' was not declared. Should it be static? spi_mpc83xx.c:146:1: warning: symbol 'mpc83xx_spi_rx_buf_u16' was not declared. Should it be static? spi_mpc83xx.c:147:1: warning: symbol 'mpc83xx_spi_rx_buf_u32' was not declared. Should it be static? spi_mpc83xx.c:148:1: warning: symbol 'mpc83xx_spi_tx_buf_u8' was not declared. Should it be static? spi_mpc83xx.c:149:1: warning: symbol 'mpc83xx_spi_tx_buf_u16' was not declared. Should it be static? spi_mpc83xx.c:150:1: warning: symbol 'mpc83xx_spi_tx_buf_u32' was not declared. Should it be static? spi_mpc83xx.c:175:32: warning: incorrect type in initializer (different address spaces) spi_mpc83xx.c:175:32: expected void *tmp_ptr spi_mpc83xx.c:175:32: got unsigned int [noderef] * spi_mpc83xx.c:183:26: warning: incorrect type in argument 1 (different address spaces) spi_mpc83xx.c:183:26: expected unsigned int [noderef] [usertype] *reg spi_mpc83xx.c:183:26: got void *tmp_ptr spi_mpc83xx.c:184:26: warning: incorrect type in argument 1 (different address spaces) spi_mpc83xx.c:184:26: expected unsigned int [noderef] [usertype] *reg spi_mpc83xx.c:184:26: got void *tmp_ptr spi_mpc83xx.c:287:31: warning: incorrect type in initializer (different address spaces) spi_mpc83xx.c:287:31: expected void *tmp_ptr spi_mpc83xx.c:287:31: got unsigned int [noderef] * spi_mpc83xx.c:295:25: warning: incorrect type in argument 1 (different address spaces) spi_mpc83xx.c:295:25: expected unsigned int [noderef] [usertype] *reg spi_mpc83xx.c:295:25: got void *tmp_ptr spi_mpc83xx.c:296:25: warning: incorrect type in argument 1 (different address spaces) spi_mpc83xx.c:296:25: expected unsigned int [noderef] [usertype] *reg spi_mpc83xx.c:296:25: got void *tmp_ptr spi_mpc83xx.c:486:13: warning: symbol 'mpc83xx_spi_irq' was not declared. Should it be static? Signed-off-by: Anton Vorontsov Cc: David Brownell Cc: Benjamin Herrenschmidt Cc: Kumar Gala Cc: Grant Likely Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/spi/spi_mpc83xx.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/drivers/spi/spi_mpc83xx.c b/drivers/spi/spi_mpc83xx.c index 44a2b46ccb79..df6420029004 100644 --- a/drivers/spi/spi_mpc83xx.c +++ b/drivers/spi/spi_mpc83xx.c @@ -123,6 +123,7 @@ static inline u32 mpc83xx_spi_read_reg(__be32 __iomem * reg) } #define MPC83XX_SPI_RX_BUF(type) \ +static \ void mpc83xx_spi_rx_buf_##type(u32 data, struct mpc83xx_spi *mpc83xx_spi) \ { \ type * rx = mpc83xx_spi->rx; \ @@ -131,6 +132,7 @@ void mpc83xx_spi_rx_buf_##type(u32 data, struct mpc83xx_spi *mpc83xx_spi) \ } #define MPC83XX_SPI_TX_BUF(type) \ +static \ u32 mpc83xx_spi_tx_buf_##type(struct mpc83xx_spi *mpc83xx_spi) \ { \ u32 data; \ @@ -172,7 +174,7 @@ static void mpc83xx_spi_chipselect(struct spi_device *spi, int value) if (cs->hw_mode != regval) { unsigned long flags; - void *tmp_ptr = &mpc83xx_spi->base->mode; + __be32 __iomem *mode = &mpc83xx_spi->base->mode; regval = cs->hw_mode; /* Turn off IRQs locally to minimize time that @@ -180,8 +182,8 @@ static void mpc83xx_spi_chipselect(struct spi_device *spi, int value) */ local_irq_save(flags); /* Turn off SPI unit prior changing mode */ - mpc83xx_spi_write_reg(tmp_ptr, regval & ~SPMODE_ENABLE); - mpc83xx_spi_write_reg(tmp_ptr, regval); + mpc83xx_spi_write_reg(mode, regval & ~SPMODE_ENABLE); + mpc83xx_spi_write_reg(mode, regval); local_irq_restore(flags); } if (mpc83xx_spi->activate_cs) @@ -284,7 +286,7 @@ int mpc83xx_spi_setup_transfer(struct spi_device *spi, struct spi_transfer *t) regval = mpc83xx_spi_read_reg(&mpc83xx_spi->base->mode); if (cs->hw_mode != regval) { unsigned long flags; - void *tmp_ptr = &mpc83xx_spi->base->mode; + __be32 __iomem *mode = &mpc83xx_spi->base->mode; regval = cs->hw_mode; /* Turn off IRQs locally to minimize time @@ -292,8 +294,8 @@ int mpc83xx_spi_setup_transfer(struct spi_device *spi, struct spi_transfer *t) */ local_irq_save(flags); /* Turn off SPI unit prior changing mode */ - mpc83xx_spi_write_reg(tmp_ptr, regval & ~SPMODE_ENABLE); - mpc83xx_spi_write_reg(tmp_ptr, regval); + mpc83xx_spi_write_reg(mode, regval & ~SPMODE_ENABLE); + mpc83xx_spi_write_reg(mode, regval); local_irq_restore(flags); } return 0; @@ -483,7 +485,7 @@ static int mpc83xx_spi_setup(struct spi_device *spi) return 0; } -irqreturn_t mpc83xx_spi_irq(s32 irq, void *context_data) +static irqreturn_t mpc83xx_spi_irq(s32 irq, void *context_data) { struct mpc83xx_spi *mpc83xx_spi = context_data; u32 event; From 364fdbc00fbdd409ade63500710123fe323aa164 Mon Sep 17 00:00:00 2001 From: Anton Vorontsov Date: Tue, 31 Mar 2009 15:24:36 -0700 Subject: [PATCH 389/478] spi_mpc83xx: rework chip selects handling The main purpose of this patch is to pass 'struct spi_device' to the chip select handling routines. This is needed so that we could implement full-fledged OpenFirmware support for this driver. While at it, also: - Replace two {de,activate}_cs routines by single cs_contol(). - Don't duplicate platform data callbacks in mpc83xx_spi struct. Signed-off-by: Anton Vorontsov Cc: David Brownell Cc: Benjamin Herrenschmidt Cc: Kumar Gala Cc: Grant Likely Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/platforms/83xx/mpc832x_rdb.c | 16 ++++------------ arch/powerpc/sysdev/fsl_soc.c | 14 ++++++-------- arch/powerpc/sysdev/fsl_soc.h | 5 +++-- drivers/spi/spi_mpc83xx.c | 20 +++++++------------- include/linux/fsl_devices.h | 5 +++-- 5 files changed, 23 insertions(+), 37 deletions(-) diff --git a/arch/powerpc/platforms/83xx/mpc832x_rdb.c b/arch/powerpc/platforms/83xx/mpc832x_rdb.c index 2a1295f19832..28e23cde64a1 100644 --- a/arch/powerpc/platforms/83xx/mpc832x_rdb.c +++ b/arch/powerpc/platforms/83xx/mpc832x_rdb.c @@ -39,16 +39,10 @@ #endif #ifdef CONFIG_QUICC_ENGINE -static void mpc83xx_spi_activate_cs(u8 cs, u8 polarity) +static void mpc83xx_spi_cs_control(struct spi_device *spi, bool on) { - pr_debug("%s %d %d\n", __func__, cs, polarity); - par_io_data_set(3, 13, polarity); -} - -static void mpc83xx_spi_deactivate_cs(u8 cs, u8 polarity) -{ - pr_debug("%s %d %d\n", __func__, cs, polarity); - par_io_data_set(3, 13, !polarity); + pr_debug("%s %d %d\n", __func__, spi->chip_select, on); + par_io_data_set(3, 13, on); } static struct mmc_spi_platform_data mpc832x_mmc_pdata = { @@ -74,9 +68,7 @@ static int __init mpc832x_spi_init(void) par_io_config_pin(3, 14, 2, 0, 0, 0); /* SD_INSERT, I */ par_io_config_pin(3, 15, 2, 0, 0, 0); /* SD_PROTECT,I */ - return fsl_spi_init(&mpc832x_spi_boardinfo, 1, - mpc83xx_spi_activate_cs, - mpc83xx_spi_deactivate_cs); + return fsl_spi_init(&mpc832x_spi_boardinfo, 1, mpc83xx_spi_cs_control); } machine_device_initcall(mpc832x_rdb, mpc832x_spi_init); #endif /* CONFIG_QUICC_ENGINE */ diff --git a/arch/powerpc/sysdev/fsl_soc.c b/arch/powerpc/sysdev/fsl_soc.c index a01c89d3f9bd..a46c1c867930 100644 --- a/arch/powerpc/sysdev/fsl_soc.c +++ b/arch/powerpc/sysdev/fsl_soc.c @@ -420,8 +420,8 @@ arch_initcall(fsl_usb_of_init); static int __init of_fsl_spi_probe(char *type, char *compatible, u32 sysclk, struct spi_board_info *board_infos, unsigned int num_board_infos, - void (*activate_cs)(u8 cs, u8 polarity), - void (*deactivate_cs)(u8 cs, u8 polarity)) + void (*cs_control)(struct spi_device *dev, + bool on)) { struct device_node *np; unsigned int i = 0; @@ -433,8 +433,7 @@ static int __init of_fsl_spi_probe(char *type, char *compatible, u32 sysclk, struct resource res[2]; struct platform_device *pdev; struct fsl_spi_platform_data pdata = { - .activate_cs = activate_cs, - .deactivate_cs = deactivate_cs, + .cs_control = cs_control, }; memset(res, 0, sizeof(res)); @@ -501,8 +500,7 @@ next: int __init fsl_spi_init(struct spi_board_info *board_infos, unsigned int num_board_infos, - void (*activate_cs)(u8 cs, u8 polarity), - void (*deactivate_cs)(u8 cs, u8 polarity)) + void (*cs_control)(struct spi_device *spi, bool on)) { u32 sysclk = -1; int ret; @@ -518,10 +516,10 @@ int __init fsl_spi_init(struct spi_board_info *board_infos, } ret = of_fsl_spi_probe(NULL, "fsl,spi", sysclk, board_infos, - num_board_infos, activate_cs, deactivate_cs); + num_board_infos, cs_control); if (!ret) of_fsl_spi_probe("spi", "fsl_spi", sysclk, board_infos, - num_board_infos, activate_cs, deactivate_cs); + num_board_infos, cs_control); return spi_register_board_info(board_infos, num_board_infos); } diff --git a/arch/powerpc/sysdev/fsl_soc.h b/arch/powerpc/sysdev/fsl_soc.h index 9c744e4285a0..b5f3456780b8 100644 --- a/arch/powerpc/sysdev/fsl_soc.h +++ b/arch/powerpc/sysdev/fsl_soc.h @@ -4,6 +4,8 @@ #include +struct spi_device; + extern phys_addr_t get_immrbase(void); #if defined(CONFIG_CPM2) || defined(CONFIG_QUICC_ENGINE) || defined(CONFIG_8xx) extern u32 get_brgfreq(void); @@ -19,8 +21,7 @@ struct device_node; extern int fsl_spi_init(struct spi_board_info *board_infos, unsigned int num_board_infos, - void (*activate_cs)(u8 cs, u8 polarity), - void (*deactivate_cs)(u8 cs, u8 polarity)); + void (*cs_control)(struct spi_device *spi, bool on)); extern void fsl_rstcr_restart(char *cmd); diff --git a/drivers/spi/spi_mpc83xx.c b/drivers/spi/spi_mpc83xx.c index df6420029004..b95085a46f90 100644 --- a/drivers/spi/spi_mpc83xx.c +++ b/drivers/spi/spi_mpc83xx.c @@ -89,9 +89,6 @@ struct mpc83xx_spi { bool qe_mode; - void (*activate_cs) (u8 cs, u8 polarity); - void (*deactivate_cs) (u8 cs, u8 polarity); - u8 busy; struct workqueue_struct *workqueue; @@ -153,15 +150,14 @@ MPC83XX_SPI_TX_BUF(u32) static void mpc83xx_spi_chipselect(struct spi_device *spi, int value) { - struct mpc83xx_spi *mpc83xx_spi; - u8 pol = spi->mode & SPI_CS_HIGH ? 1 : 0; + struct mpc83xx_spi *mpc83xx_spi = spi_master_get_devdata(spi->master); + struct fsl_spi_platform_data *pdata = spi->dev.parent->platform_data; + bool pol = spi->mode & SPI_CS_HIGH; struct spi_mpc83xx_cs *cs = spi->controller_state; - mpc83xx_spi = spi_master_get_devdata(spi->master); - if (value == BITBANG_CS_INACTIVE) { - if (mpc83xx_spi->deactivate_cs) - mpc83xx_spi->deactivate_cs(spi->chip_select, pol); + if (pdata->cs_control) + pdata->cs_control(spi, !pol); } if (value == BITBANG_CS_ACTIVE) { @@ -186,8 +182,8 @@ static void mpc83xx_spi_chipselect(struct spi_device *spi, int value) mpc83xx_spi_write_reg(mode, regval); local_irq_restore(flags); } - if (mpc83xx_spi->activate_cs) - mpc83xx_spi->activate_cs(spi->chip_select, pol); + if (pdata->cs_control) + pdata->cs_control(spi, pol); } } @@ -582,8 +578,6 @@ static int __init mpc83xx_spi_probe(struct platform_device *dev) master->cleanup = mpc83xx_spi_cleanup; mpc83xx_spi = spi_master_get_devdata(master); - mpc83xx_spi->activate_cs = pdata->activate_cs; - mpc83xx_spi->deactivate_cs = pdata->deactivate_cs; mpc83xx_spi->qe_mode = pdata->qe_mode; mpc83xx_spi->get_rx = mpc83xx_spi_rx_buf_u8; mpc83xx_spi->get_tx = mpc83xx_spi_tx_buf_u8; diff --git a/include/linux/fsl_devices.h b/include/linux/fsl_devices.h index d9051d717d27..7bc1b643d370 100644 --- a/include/linux/fsl_devices.h +++ b/include/linux/fsl_devices.h @@ -95,14 +95,15 @@ struct fsl_usb2_platform_data { #define FSL_USB2_PORT0_ENABLED 0x00000001 #define FSL_USB2_PORT1_ENABLED 0x00000002 +struct spi_device; + struct fsl_spi_platform_data { u32 initial_spmode; /* initial SPMODE value */ u16 bus_num; bool qe_mode; /* board specific information */ u16 max_chipselect; - void (*activate_cs)(u8 cs, u8 polarity); - void (*deactivate_cs)(u8 cs, u8 polarity); + void (*cs_control)(struct spi_device *spi, bool on); u32 sysclk; }; From 35b4b3c0c1265f1a7342574be393c157601401f0 Mon Sep 17 00:00:00 2001 From: Anton Vorontsov Date: Tue, 31 Mar 2009 15:24:37 -0700 Subject: [PATCH 390/478] spi_mpc83xx: add OF platform driver bindings Implement full support for OF SPI bindings. Now the driver can manage its own chip selects without any help from the board files and/or fsl_soc constructors. The "legacy" code is well isolated and could be removed as time goes by. Signed-off-by: Anton Vorontsov Cc: David Brownell Cc: Benjamin Herrenschmidt Cc: Kumar Gala Cc: Grant Likely Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/spi/spi_mpc83xx.c | 330 ++++++++++++++++++++++++++++++++---- include/linux/fsl_devices.h | 2 +- 2 files changed, 295 insertions(+), 37 deletions(-) diff --git a/drivers/spi/spi_mpc83xx.c b/drivers/spi/spi_mpc83xx.c index b95085a46f90..f4573a96af24 100644 --- a/drivers/spi/spi_mpc83xx.c +++ b/drivers/spi/spi_mpc83xx.c @@ -14,6 +14,8 @@ #include #include #include +#include +#include #include #include #include @@ -23,7 +25,13 @@ #include #include #include +#include +#include +#include +#include +#include +#include #include #include @@ -79,7 +87,7 @@ struct mpc83xx_spi { u32(*get_tx) (struct mpc83xx_spi *); unsigned int count; - int irq; + unsigned int irq; unsigned nsecs; /* (clock cycle time)/2 */ @@ -543,36 +551,23 @@ static void mpc83xx_spi_cleanup(struct spi_device *spi) kfree(spi->controller_state); } -static int __init mpc83xx_spi_probe(struct platform_device *dev) +static struct spi_master * __devinit +mpc83xx_spi_probe(struct device *dev, struct resource *mem, unsigned int irq) { + struct fsl_spi_platform_data *pdata = dev->platform_data; struct spi_master *master; struct mpc83xx_spi *mpc83xx_spi; - struct fsl_spi_platform_data *pdata; - struct resource *r; u32 regval; int ret = 0; - /* Get resources(memory, IRQ) associated with the device */ - master = spi_alloc_master(&dev->dev, sizeof(struct mpc83xx_spi)); - + master = spi_alloc_master(dev, sizeof(struct mpc83xx_spi)); if (master == NULL) { ret = -ENOMEM; goto err; } - platform_set_drvdata(dev, master); - pdata = dev->dev.platform_data; + dev_set_drvdata(dev, master); - if (pdata == NULL) { - ret = -ENODEV; - goto free_master; - } - - r = platform_get_resource(dev, IORESOURCE_MEM, 0); - if (r == NULL) { - ret = -ENODEV; - goto free_master; - } master->setup = mpc83xx_spi_setup; master->transfer = mpc83xx_spi_transfer; master->cleanup = mpc83xx_spi_cleanup; @@ -592,18 +587,13 @@ static int __init mpc83xx_spi_probe(struct platform_device *dev) init_completion(&mpc83xx_spi->done); - mpc83xx_spi->base = ioremap(r->start, r->end - r->start + 1); + mpc83xx_spi->base = ioremap(mem->start, mem->end - mem->start + 1); if (mpc83xx_spi->base == NULL) { ret = -ENOMEM; goto put_master; } - mpc83xx_spi->irq = platform_get_irq(dev, 0); - - if (mpc83xx_spi->irq < 0) { - ret = -ENXIO; - goto unmap_io; - } + mpc83xx_spi->irq = irq; /* Register for SPI Interrupt */ ret = request_irq(mpc83xx_spi->irq, mpc83xx_spi_irq, @@ -645,9 +635,9 @@ static int __init mpc83xx_spi_probe(struct platform_device *dev) printk(KERN_INFO "%s: MPC83xx SPI Controller driver at 0x%p (irq = %d)\n", - dev_name(&dev->dev), mpc83xx_spi->base, mpc83xx_spi->irq); + dev_name(dev), mpc83xx_spi->base, mpc83xx_spi->irq); - return ret; + return master; unreg_master: destroy_workqueue(mpc83xx_spi->workqueue); @@ -657,18 +647,16 @@ unmap_io: iounmap(mpc83xx_spi->base); put_master: spi_master_put(master); -free_master: - kfree(master); err: - return ret; + return ERR_PTR(ret); } -static int __exit mpc83xx_spi_remove(struct platform_device *dev) +static int __devexit mpc83xx_spi_remove(struct device *dev) { struct mpc83xx_spi *mpc83xx_spi; struct spi_master *master; - master = platform_get_drvdata(dev); + master = dev_get_drvdata(dev); mpc83xx_spi = spi_master_get_devdata(master); flush_workqueue(mpc83xx_spi->workqueue); @@ -681,23 +669,293 @@ static int __exit mpc83xx_spi_remove(struct platform_device *dev) return 0; } +struct mpc83xx_spi_probe_info { + struct fsl_spi_platform_data pdata; + int *gpios; + bool *alow_flags; +}; + +static struct mpc83xx_spi_probe_info * +to_of_pinfo(struct fsl_spi_platform_data *pdata) +{ + return container_of(pdata, struct mpc83xx_spi_probe_info, pdata); +} + +static void mpc83xx_spi_cs_control(struct spi_device *spi, bool on) +{ + struct device *dev = spi->dev.parent; + struct mpc83xx_spi_probe_info *pinfo = to_of_pinfo(dev->platform_data); + u16 cs = spi->chip_select; + int gpio = pinfo->gpios[cs]; + bool alow = pinfo->alow_flags[cs]; + + gpio_set_value(gpio, on ^ alow); +} + +static int of_mpc83xx_spi_get_chipselects(struct device *dev) +{ + struct device_node *np = dev_archdata_get_node(&dev->archdata); + struct fsl_spi_platform_data *pdata = dev->platform_data; + struct mpc83xx_spi_probe_info *pinfo = to_of_pinfo(pdata); + unsigned int ngpios; + int i = 0; + int ret; + + ngpios = of_gpio_count(np); + if (!ngpios) { + /* + * SPI w/o chip-select line. One SPI device is still permitted + * though. + */ + pdata->max_chipselect = 1; + return 0; + } + + pinfo->gpios = kmalloc(ngpios * sizeof(pinfo->gpios), GFP_KERNEL); + if (!pinfo->gpios) + return -ENOMEM; + memset(pinfo->gpios, -1, ngpios * sizeof(pinfo->gpios)); + + pinfo->alow_flags = kzalloc(ngpios * sizeof(pinfo->alow_flags), + GFP_KERNEL); + if (!pinfo->alow_flags) { + ret = -ENOMEM; + goto err_alloc_flags; + } + + for (; i < ngpios; i++) { + int gpio; + enum of_gpio_flags flags; + + gpio = of_get_gpio_flags(np, i, &flags); + if (!gpio_is_valid(gpio)) { + dev_err(dev, "invalid gpio #%d: %d\n", i, gpio); + goto err_loop; + } + + ret = gpio_request(gpio, dev_name(dev)); + if (ret) { + dev_err(dev, "can't request gpio #%d: %d\n", i, ret); + goto err_loop; + } + + pinfo->gpios[i] = gpio; + pinfo->alow_flags[i] = flags & OF_GPIO_ACTIVE_LOW; + + ret = gpio_direction_output(pinfo->gpios[i], + pinfo->alow_flags[i]); + if (ret) { + dev_err(dev, "can't set output direction for gpio " + "#%d: %d\n", i, ret); + goto err_loop; + } + } + + pdata->max_chipselect = ngpios; + pdata->cs_control = mpc83xx_spi_cs_control; + + return 0; + +err_loop: + while (i >= 0) { + if (gpio_is_valid(pinfo->gpios[i])) + gpio_free(pinfo->gpios[i]); + i--; + } + + kfree(pinfo->alow_flags); + pinfo->alow_flags = NULL; +err_alloc_flags: + kfree(pinfo->gpios); + pinfo->gpios = NULL; + return ret; +} + +static int of_mpc83xx_spi_free_chipselects(struct device *dev) +{ + struct fsl_spi_platform_data *pdata = dev->platform_data; + struct mpc83xx_spi_probe_info *pinfo = to_of_pinfo(pdata); + int i; + + if (!pinfo->gpios) + return 0; + + for (i = 0; i < pdata->max_chipselect; i++) { + if (gpio_is_valid(pinfo->gpios[i])) + gpio_free(pinfo->gpios[i]); + } + + kfree(pinfo->gpios); + kfree(pinfo->alow_flags); + return 0; +} + +static int __devinit of_mpc83xx_spi_probe(struct of_device *ofdev, + const struct of_device_id *ofid) +{ + struct device *dev = &ofdev->dev; + struct device_node *np = ofdev->node; + struct mpc83xx_spi_probe_info *pinfo; + struct fsl_spi_platform_data *pdata; + struct spi_master *master; + struct resource mem; + struct resource irq; + const void *prop; + int ret = -ENOMEM; + + pinfo = kzalloc(sizeof(*pinfo), GFP_KERNEL); + if (!pinfo) + return -ENOMEM; + + pdata = &pinfo->pdata; + dev->platform_data = pdata; + + /* Allocate bus num dynamically. */ + pdata->bus_num = -1; + + /* SPI controller is either clocked from QE or SoC clock. */ + pdata->sysclk = get_brgfreq(); + if (pdata->sysclk == -1) { + pdata->sysclk = fsl_get_sys_freq(); + if (pdata->sysclk == -1) { + ret = -ENODEV; + goto err_clk; + } + } + + prop = of_get_property(np, "mode", NULL); + if (prop && !strcmp(prop, "cpu-qe")) + pdata->qe_mode = 1; + + ret = of_mpc83xx_spi_get_chipselects(dev); + if (ret) + goto err; + + ret = of_address_to_resource(np, 0, &mem); + if (ret) + goto err; + + ret = of_irq_to_resource(np, 0, &irq); + if (!ret) { + ret = -EINVAL; + goto err; + } + + master = mpc83xx_spi_probe(dev, &mem, irq.start); + if (IS_ERR(master)) { + ret = PTR_ERR(master); + goto err; + } + + of_register_spi_devices(master, np); + + return 0; + +err: + of_mpc83xx_spi_free_chipselects(dev); +err_clk: + kfree(pinfo); + return ret; +} + +static int __devexit of_mpc83xx_spi_remove(struct of_device *ofdev) +{ + int ret; + + ret = mpc83xx_spi_remove(&ofdev->dev); + if (ret) + return ret; + of_mpc83xx_spi_free_chipselects(&ofdev->dev); + return 0; +} + +static const struct of_device_id of_mpc83xx_spi_match[] = { + { .compatible = "fsl,spi" }, + {}, +}; +MODULE_DEVICE_TABLE(of, of_mpc83xx_spi_match); + +static struct of_platform_driver of_mpc83xx_spi_driver = { + .name = "mpc83xx_spi", + .match_table = of_mpc83xx_spi_match, + .probe = of_mpc83xx_spi_probe, + .remove = __devexit_p(of_mpc83xx_spi_remove), +}; + +#ifdef CONFIG_MPC832x_RDB +/* + * XXX XXX XXX + * This is "legacy" platform driver, was used by the MPC8323E-RDB boards + * only. The driver should go away soon, since newer MPC8323E-RDB's device + * tree can work with OpenFirmware driver. But for now we support old trees + * as well. + */ +static int __devinit plat_mpc83xx_spi_probe(struct platform_device *pdev) +{ + struct resource *mem; + unsigned int irq; + struct spi_master *master; + + if (!pdev->dev.platform_data) + return -EINVAL; + + mem = platform_get_resource(pdev, IORESOURCE_MEM, 0); + if (!mem) + return -EINVAL; + + irq = platform_get_irq(pdev, 0); + if (!irq) + return -EINVAL; + + master = mpc83xx_spi_probe(&pdev->dev, mem, irq); + if (IS_ERR(master)) + return PTR_ERR(master); + return 0; +} + +static int __devexit plat_mpc83xx_spi_remove(struct platform_device *pdev) +{ + return mpc83xx_spi_remove(&pdev->dev); +} + MODULE_ALIAS("platform:mpc83xx_spi"); static struct platform_driver mpc83xx_spi_driver = { - .remove = __exit_p(mpc83xx_spi_remove), + .probe = plat_mpc83xx_spi_probe, + .remove = __exit_p(plat_mpc83xx_spi_remove), .driver = { .name = "mpc83xx_spi", .owner = THIS_MODULE, }, }; +static bool legacy_driver_failed; + +static void __init legacy_driver_register(void) +{ + legacy_driver_failed = platform_driver_register(&mpc83xx_spi_driver); +} + +static void __exit legacy_driver_unregister(void) +{ + if (legacy_driver_failed) + return; + platform_driver_unregister(&mpc83xx_spi_driver); +} +#else +static void __init legacy_driver_register(void) {} +static void __exit legacy_driver_unregister(void) {} +#endif /* CONFIG_MPC832x_RDB */ + static int __init mpc83xx_spi_init(void) { - return platform_driver_probe(&mpc83xx_spi_driver, mpc83xx_spi_probe); + legacy_driver_register(); + return of_register_platform_driver(&of_mpc83xx_spi_driver); } static void __exit mpc83xx_spi_exit(void) { - platform_driver_unregister(&mpc83xx_spi_driver); + of_unregister_platform_driver(&of_mpc83xx_spi_driver); + legacy_driver_unregister(); } module_init(mpc83xx_spi_init); diff --git a/include/linux/fsl_devices.h b/include/linux/fsl_devices.h index 7bc1b643d370..7ef1caf50269 100644 --- a/include/linux/fsl_devices.h +++ b/include/linux/fsl_devices.h @@ -99,7 +99,7 @@ struct spi_device; struct fsl_spi_platform_data { u32 initial_spmode; /* initial SPMODE value */ - u16 bus_num; + s16 bus_num; bool qe_mode; /* board specific information */ u16 max_chipselect; From 3f1c6ebf57b815ad709e89291e446935fee78f75 Mon Sep 17 00:00:00 2001 From: Anton Vorontsov Date: Tue, 31 Mar 2009 15:24:38 -0700 Subject: [PATCH 391/478] powerpc: add mmc-spi-slot bindings The bindings describes a case where MMC/SD/SDIO slot directly connected to a SPI bus. Such setups are widely used on embedded PowerPC boards. The patch also adds the mmc-spi-slot entry to the OpenFirmware modalias table. Signed-off-by: Anton Vorontsov Cc: David Brownell Cc: Benjamin Herrenschmidt Cc: Kumar Gala Cc: Grant Likely Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- .../powerpc/dts-bindings/mmc-spi-slot.txt | 23 +++++++++++++++++++ drivers/of/base.c | 1 + 2 files changed, 24 insertions(+) create mode 100644 Documentation/powerpc/dts-bindings/mmc-spi-slot.txt diff --git a/Documentation/powerpc/dts-bindings/mmc-spi-slot.txt b/Documentation/powerpc/dts-bindings/mmc-spi-slot.txt new file mode 100644 index 000000000000..c39ac2891951 --- /dev/null +++ b/Documentation/powerpc/dts-bindings/mmc-spi-slot.txt @@ -0,0 +1,23 @@ +MMC/SD/SDIO slot directly connected to a SPI bus + +Required properties: +- compatible : should be "mmc-spi-slot". +- reg : should specify SPI address (chip-select number). +- spi-max-frequency : maximum frequency for this device (Hz). +- voltage-ranges : two cells are required, first cell specifies minimum + slot voltage (mV), second cell specifies maximum slot voltage (mV). + Several ranges could be specified. +- gpios : (optional) may specify GPIOs in this order: Card-Detect GPIO, + Write-Protect GPIO. + +Example: + + mmc-slot@0 { + compatible = "fsl,mpc8323rdb-mmc-slot", + "mmc-spi-slot"; + reg = <0>; + gpios = <&qe_pio_d 14 1 + &qe_pio_d 15 0>; + voltage-ranges = <3300 3300>; + spi-max-frequency = <50000000>; + }; diff --git a/drivers/of/base.c b/drivers/of/base.c index cd17092b82bd..41c5dfd85358 100644 --- a/drivers/of/base.c +++ b/drivers/of/base.c @@ -446,6 +446,7 @@ struct of_modalias_table { }; static struct of_modalias_table of_modalias_table[] = { { "fsl,mcu-mpc8349emitx", "mcu-mpc8349emitx" }, + { "mmc-spi-slot", "mmc_spi" }, }; /** From 754582853120a9ec8b8293b5147b605b1c6a39f1 Mon Sep 17 00:00:00 2001 From: Anton Vorontsov Date: Tue, 31 Mar 2009 15:24:39 -0700 Subject: [PATCH 392/478] powerpc/83xx: add mmc-spi support via the device tree for MPC8323E-RDB - Add gpio-controller node to manage QE GPIO Bank D; - Add mmc-spi node; - Modify board file so that it won't use legacy SPI support with the new device trees. Signed-off-by: Anton Vorontsov Cc: David Brownell Cc: Benjamin Herrenschmidt Cc: Kumar Gala Cc: Grant Likely Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/boot/dts/mpc832x_rdb.dts | 24 +++++++++++++++++++++++ arch/powerpc/platforms/83xx/mpc832x_rdb.c | 6 ++++++ 2 files changed, 30 insertions(+) diff --git a/arch/powerpc/boot/dts/mpc832x_rdb.dts b/arch/powerpc/boot/dts/mpc832x_rdb.dts index dea30910c136..4319bd70a580 100644 --- a/arch/powerpc/boot/dts/mpc832x_rdb.dts +++ b/arch/powerpc/boot/dts/mpc832x_rdb.dts @@ -152,10 +152,21 @@ }; par_io@1400 { + #address-cells = <1>; + #size-cells = <1>; reg = <0x1400 0x100>; + ranges = <3 0x1448 0x18>; + compatible = "fsl,mpc8323-qe-pario"; device_type = "par_io"; num-ports = <7>; + qe_pio_d: gpio-controller@1448 { + #gpio-cells = <2>; + compatible = "fsl,mpc8323-qe-pario-bank"; + reg = <3 0x18>; + gpio-controller; + }; + ucc2pio:ucc_pin@02 { pio-map = < /* port pin dir open_drain assignment has_irq */ @@ -225,12 +236,25 @@ }; spi@4c0 { + #address-cells = <1>; + #size-cells = <0>; cell-index = <0>; compatible = "fsl,spi"; reg = <0x4c0 0x40>; interrupts = <2>; interrupt-parent = <&qeic>; + gpios = <&qe_pio_d 13 0>; mode = "cpu-qe"; + + mmc-slot@0 { + compatible = "fsl,mpc8323rdb-mmc-slot", + "mmc-spi-slot"; + reg = <0>; + gpios = <&qe_pio_d 14 1 + &qe_pio_d 15 0>; + voltage-ranges = <3300 3300>; + spi-max-frequency = <50000000>; + }; }; spi@500 { diff --git a/arch/powerpc/platforms/83xx/mpc832x_rdb.c b/arch/powerpc/platforms/83xx/mpc832x_rdb.c index 28e23cde64a1..e1e7eeb7d3a3 100644 --- a/arch/powerpc/platforms/83xx/mpc832x_rdb.c +++ b/arch/powerpc/platforms/83xx/mpc832x_rdb.c @@ -68,6 +68,12 @@ static int __init mpc832x_spi_init(void) par_io_config_pin(3, 14, 2, 0, 0, 0); /* SD_INSERT, I */ par_io_config_pin(3, 15, 2, 0, 0, 0); /* SD_PROTECT,I */ + /* + * Don't bother with legacy stuff when device tree contains + * mmc-spi-slot node. + */ + if (of_find_compatible_node(NULL, NULL, "mmc-spi-slot")) + return 0; return fsl_spi_init(&mpc832x_spi_boardinfo, 1, mpc83xx_spi_cs_control); } machine_device_initcall(mpc832x_rdb, mpc832x_spi_init); From e2801806de1c9c1d03b7d1bfcb2e01dd4d389e80 Mon Sep 17 00:00:00 2001 From: Anton Vorontsov Date: Tue, 31 Mar 2009 15:24:40 -0700 Subject: [PATCH 393/478] powerpc/fsl_soc: isolate legacy fsl_spi support to mpc832x_rdb boards The advantages of this: - Don't encourage legacy support; - Less external symbols, less code to compile-in for !MPC832x_RDB platforms. Signed-off-by: Anton Vorontsov Cc: David Brownell Cc: Benjamin Herrenschmidt Cc: Kumar Gala Cc: Grant Likely Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/platforms/83xx/mpc832x_rdb.c | 107 ++++++++++++++++++++++ arch/powerpc/sysdev/fsl_soc.c | 107 ---------------------- arch/powerpc/sysdev/fsl_soc.h | 4 - 3 files changed, 107 insertions(+), 111 deletions(-) diff --git a/arch/powerpc/platforms/83xx/mpc832x_rdb.c b/arch/powerpc/platforms/83xx/mpc832x_rdb.c index e1e7eeb7d3a3..567ded7c3b9b 100644 --- a/arch/powerpc/platforms/83xx/mpc832x_rdb.c +++ b/arch/powerpc/platforms/83xx/mpc832x_rdb.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -39,6 +40,112 @@ #endif #ifdef CONFIG_QUICC_ENGINE +static int __init of_fsl_spi_probe(char *type, char *compatible, u32 sysclk, + struct spi_board_info *board_infos, + unsigned int num_board_infos, + void (*cs_control)(struct spi_device *dev, + bool on)) +{ + struct device_node *np; + unsigned int i = 0; + + for_each_compatible_node(np, type, compatible) { + int ret; + unsigned int j; + const void *prop; + struct resource res[2]; + struct platform_device *pdev; + struct fsl_spi_platform_data pdata = { + .cs_control = cs_control, + }; + + memset(res, 0, sizeof(res)); + + pdata.sysclk = sysclk; + + prop = of_get_property(np, "reg", NULL); + if (!prop) + goto err; + pdata.bus_num = *(u32 *)prop; + + prop = of_get_property(np, "cell-index", NULL); + if (prop) + i = *(u32 *)prop; + + prop = of_get_property(np, "mode", NULL); + if (prop && !strcmp(prop, "cpu-qe")) + pdata.qe_mode = 1; + + for (j = 0; j < num_board_infos; j++) { + if (board_infos[j].bus_num == pdata.bus_num) + pdata.max_chipselect++; + } + + if (!pdata.max_chipselect) + continue; + + ret = of_address_to_resource(np, 0, &res[0]); + if (ret) + goto err; + + ret = of_irq_to_resource(np, 0, &res[1]); + if (ret == NO_IRQ) + goto err; + + pdev = platform_device_alloc("mpc83xx_spi", i); + if (!pdev) + goto err; + + ret = platform_device_add_data(pdev, &pdata, sizeof(pdata)); + if (ret) + goto unreg; + + ret = platform_device_add_resources(pdev, res, + ARRAY_SIZE(res)); + if (ret) + goto unreg; + + ret = platform_device_add(pdev); + if (ret) + goto unreg; + + goto next; +unreg: + platform_device_del(pdev); +err: + pr_err("%s: registration failed\n", np->full_name); +next: + i++; + } + + return i; +} + +static int __init fsl_spi_init(struct spi_board_info *board_infos, + unsigned int num_board_infos, + void (*cs_control)(struct spi_device *spi, + bool on)) +{ + u32 sysclk = -1; + int ret; + + /* SPI controller is either clocked from QE or SoC clock */ + sysclk = get_brgfreq(); + if (sysclk == -1) { + sysclk = fsl_get_sys_freq(); + if (sysclk == -1) + return -ENODEV; + } + + ret = of_fsl_spi_probe(NULL, "fsl,spi", sysclk, board_infos, + num_board_infos, cs_control); + if (!ret) + of_fsl_spi_probe("spi", "fsl_spi", sysclk, board_infos, + num_board_infos, cs_control); + + return spi_register_board_info(board_infos, num_board_infos); +} + static void mpc83xx_spi_cs_control(struct spi_device *spi, bool on) { pr_debug("%s %d %d\n", __func__, spi->chip_select, on); diff --git a/arch/powerpc/sysdev/fsl_soc.c b/arch/powerpc/sysdev/fsl_soc.c index a46c1c867930..afe8dbc964aa 100644 --- a/arch/powerpc/sysdev/fsl_soc.c +++ b/arch/powerpc/sysdev/fsl_soc.c @@ -417,113 +417,6 @@ err: arch_initcall(fsl_usb_of_init); -static int __init of_fsl_spi_probe(char *type, char *compatible, u32 sysclk, - struct spi_board_info *board_infos, - unsigned int num_board_infos, - void (*cs_control)(struct spi_device *dev, - bool on)) -{ - struct device_node *np; - unsigned int i = 0; - - for_each_compatible_node(np, type, compatible) { - int ret; - unsigned int j; - const void *prop; - struct resource res[2]; - struct platform_device *pdev; - struct fsl_spi_platform_data pdata = { - .cs_control = cs_control, - }; - - memset(res, 0, sizeof(res)); - - pdata.sysclk = sysclk; - - prop = of_get_property(np, "reg", NULL); - if (!prop) - goto err; - pdata.bus_num = *(u32 *)prop; - - prop = of_get_property(np, "cell-index", NULL); - if (prop) - i = *(u32 *)prop; - - prop = of_get_property(np, "mode", NULL); - if (prop && !strcmp(prop, "cpu-qe")) - pdata.qe_mode = 1; - - for (j = 0; j < num_board_infos; j++) { - if (board_infos[j].bus_num == pdata.bus_num) - pdata.max_chipselect++; - } - - if (!pdata.max_chipselect) - continue; - - ret = of_address_to_resource(np, 0, &res[0]); - if (ret) - goto err; - - ret = of_irq_to_resource(np, 0, &res[1]); - if (ret == NO_IRQ) - goto err; - - pdev = platform_device_alloc("mpc83xx_spi", i); - if (!pdev) - goto err; - - ret = platform_device_add_data(pdev, &pdata, sizeof(pdata)); - if (ret) - goto unreg; - - ret = platform_device_add_resources(pdev, res, - ARRAY_SIZE(res)); - if (ret) - goto unreg; - - ret = platform_device_add(pdev); - if (ret) - goto unreg; - - goto next; -unreg: - platform_device_del(pdev); -err: - pr_err("%s: registration failed\n", np->full_name); -next: - i++; - } - - return i; -} - -int __init fsl_spi_init(struct spi_board_info *board_infos, - unsigned int num_board_infos, - void (*cs_control)(struct spi_device *spi, bool on)) -{ - u32 sysclk = -1; - int ret; - -#ifdef CONFIG_QUICC_ENGINE - /* SPI controller is either clocked from QE or SoC clock */ - sysclk = get_brgfreq(); -#endif - if (sysclk == -1) { - sysclk = fsl_get_sys_freq(); - if (sysclk == -1) - return -ENODEV; - } - - ret = of_fsl_spi_probe(NULL, "fsl,spi", sysclk, board_infos, - num_board_infos, cs_control); - if (!ret) - of_fsl_spi_probe("spi", "fsl_spi", sysclk, board_infos, - num_board_infos, cs_control); - - return spi_register_board_info(board_infos, num_board_infos); -} - #if defined(CONFIG_PPC_85xx) || defined(CONFIG_PPC_86xx) static __be32 __iomem *rstcr; diff --git a/arch/powerpc/sysdev/fsl_soc.h b/arch/powerpc/sysdev/fsl_soc.h index b5f3456780b8..42381bb6cd51 100644 --- a/arch/powerpc/sysdev/fsl_soc.h +++ b/arch/powerpc/sysdev/fsl_soc.h @@ -19,10 +19,6 @@ extern u32 fsl_get_sys_freq(void); struct spi_board_info; struct device_node; -extern int fsl_spi_init(struct spi_board_info *board_infos, - unsigned int num_board_infos, - void (*cs_control)(struct spi_device *spi, bool on)); - extern void fsl_rstcr_restart(char *cmd); #if defined(CONFIG_FB_FSL_DIU) || defined(CONFIG_FB_FSL_DIU_MODULE) From 00fcf2cb6f6bb421851c3ba062c0a36760ea6e53 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 31 Mar 2009 15:24:42 -0700 Subject: [PATCH 394/478] ecryptfs: use kzfree() Use kzfree() instead of memset() + kfree(). Signed-off-by: Johannes Weiner Reviewed-by: Pekka Enberg Acked-by: Tyler Hicks Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ecryptfs/keystore.c | 3 +-- fs/ecryptfs/messaging.c | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c index e4a6223c3145..af737bb56cb7 100644 --- a/fs/ecryptfs/keystore.c +++ b/fs/ecryptfs/keystore.c @@ -740,8 +740,7 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes, out_release_free_unlock: crypto_free_hash(s->hash_desc.tfm); out_free_unlock: - memset(s->block_aligned_filename, 0, s->block_aligned_filename_size); - kfree(s->block_aligned_filename); + kzfree(s->block_aligned_filename); out_unlock: mutex_unlock(s->tfm_mutex); out: diff --git a/fs/ecryptfs/messaging.c b/fs/ecryptfs/messaging.c index 96ef51489e01..295e7fa56755 100644 --- a/fs/ecryptfs/messaging.c +++ b/fs/ecryptfs/messaging.c @@ -291,8 +291,7 @@ int ecryptfs_exorcise_daemon(struct ecryptfs_daemon *daemon) if (daemon->user_ns) put_user_ns(daemon->user_ns); mutex_unlock(&daemon->mux); - memset(daemon, 0, sizeof(*daemon)); - kfree(daemon); + kzfree(daemon); out: return rc; } From 56fcef75117a153f298b3fe54af31053f53997dd Mon Sep 17 00:00:00 2001 From: Ian Kent Date: Tue, 31 Mar 2009 15:24:43 -0700 Subject: [PATCH 395/478] autofs4: cleanup expire code duplication A significant portion of the autofs_dev_ioctl_expire() and autofs4_expire_multi() functions is duplicated code. This patch cleans that up. Signed-off-by: Ian Kent Signed-off-by: Jeff Moyer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/autofs4/autofs_i.h | 2 ++ fs/autofs4/dev-ioctl.c | 29 +---------------------------- fs/autofs4/expire.c | 27 +++++++++++++++++---------- 3 files changed, 20 insertions(+), 38 deletions(-) diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h index a76803108d06..b7ff33c63101 100644 --- a/fs/autofs4/autofs_i.h +++ b/fs/autofs4/autofs_i.h @@ -186,6 +186,8 @@ int autofs4_expire_wait(struct dentry *dentry); int autofs4_expire_run(struct super_block *, struct vfsmount *, struct autofs_sb_info *, struct autofs_packet_expire __user *); +int autofs4_do_expire_multi(struct super_block *sb, struct vfsmount *mnt, + struct autofs_sb_info *sbi, int when); int autofs4_expire_multi(struct super_block *, struct vfsmount *, struct autofs_sb_info *, int __user *); struct dentry *autofs4_expire_direct(struct super_block *sb, diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c index 025e105bffea..9e5ae8a4f5c8 100644 --- a/fs/autofs4/dev-ioctl.c +++ b/fs/autofs4/dev-ioctl.c @@ -525,40 +525,13 @@ static int autofs_dev_ioctl_expire(struct file *fp, struct autofs_sb_info *sbi, struct autofs_dev_ioctl *param) { - struct dentry *dentry; struct vfsmount *mnt; - int err = -EAGAIN; int how; how = param->expire.how; mnt = fp->f_path.mnt; - if (autofs_type_trigger(sbi->type)) - dentry = autofs4_expire_direct(sbi->sb, mnt, sbi, how); - else - dentry = autofs4_expire_indirect(sbi->sb, mnt, sbi, how); - - if (dentry) { - struct autofs_info *ino = autofs4_dentry_ino(dentry); - - /* - * This is synchronous because it makes the daemon a - * little easier - */ - err = autofs4_wait(sbi, dentry, NFY_EXPIRE); - - spin_lock(&sbi->fs_lock); - if (ino->flags & AUTOFS_INF_MOUNTPOINT) { - ino->flags &= ~AUTOFS_INF_MOUNTPOINT; - sbi->sb->s_root->d_mounted++; - } - ino->flags &= ~AUTOFS_INF_EXPIRING; - complete_all(&ino->expire_complete); - spin_unlock(&sbi->fs_lock); - dput(dentry); - } - - return err; + return autofs4_do_expire_multi(sbi->sb, mnt, sbi, how); } /* Check if autofs mount point is in use */ diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c index e3bd50776f9e..75f7ddacf7d6 100644 --- a/fs/autofs4/expire.c +++ b/fs/autofs4/expire.c @@ -478,22 +478,16 @@ int autofs4_expire_run(struct super_block *sb, return ret; } -/* Call repeatedly until it returns -EAGAIN, meaning there's nothing - more to be done */ -int autofs4_expire_multi(struct super_block *sb, struct vfsmount *mnt, - struct autofs_sb_info *sbi, int __user *arg) +int autofs4_do_expire_multi(struct super_block *sb, struct vfsmount *mnt, + struct autofs_sb_info *sbi, int when) { struct dentry *dentry; int ret = -EAGAIN; - int do_now = 0; - - if (arg && get_user(do_now, arg)) - return -EFAULT; if (autofs_type_trigger(sbi->type)) - dentry = autofs4_expire_direct(sb, mnt, sbi, do_now); + dentry = autofs4_expire_direct(sb, mnt, sbi, when); else - dentry = autofs4_expire_indirect(sb, mnt, sbi, do_now); + dentry = autofs4_expire_indirect(sb, mnt, sbi, when); if (dentry) { struct autofs_info *ino = autofs4_dentry_ino(dentry); @@ -516,3 +510,16 @@ int autofs4_expire_multi(struct super_block *sb, struct vfsmount *mnt, return ret; } +/* Call repeatedly until it returns -EAGAIN, meaning there's nothing + more to be done */ +int autofs4_expire_multi(struct super_block *sb, struct vfsmount *mnt, + struct autofs_sb_info *sbi, int __user *arg) +{ + int do_now = 0; + + if (arg && get_user(do_now, arg)) + return -EFAULT; + + return autofs4_do_expire_multi(sb, mnt, sbi, do_now); +} + From 8f63aaa8b9239475fc580d4450f1141496655305 Mon Sep 17 00:00:00 2001 From: Ian Kent Date: Tue, 31 Mar 2009 15:24:45 -0700 Subject: [PATCH 396/478] autofs4: fix lookup deadlock A deadlock can occur when user space uses a signal (autofs version 4 uses SIGCHLD for this) to effect expire completion. The order of events is: Expire process completes, but before being able to send SIGCHLD to it's parent ... Another process walks onto a different mount point and drops the directory inode semaphore prior to sending the request to the daemon as it must ... A third process does an lstat on on the expired mount point causing it to wait on expire completion (unfortunately) holding the directory semaphore. The mount request then arrives at the daemon which does an lstat and, deadlock. For some time I was concerned about releasing the directory semaphore around the expire wait in autofs4_lookup as well as for the mount call back. I finally realized that the last round of changes in this function made the expiring dentry and the lookup dentry separate and distinct so the check and possible wait can be done anywhere prior to the mount call back. This patch moves the check to just before the mount call back and inside the directory inode mutex release. Signed-off-by: Ian Kent Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/autofs4/root.c | 41 +++++++++++++++++++++-------------------- 1 file changed, 21 insertions(+), 20 deletions(-) diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c index 74b1469a9504..e383bf0334f1 100644 --- a/fs/autofs4/root.c +++ b/fs/autofs4/root.c @@ -485,22 +485,6 @@ static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, s DPRINTK("pid = %u, pgrp = %u, catatonic = %d, oz_mode = %d", current->pid, task_pgrp_nr(current), sbi->catatonic, oz_mode); - expiring = autofs4_lookup_expiring(sbi, dentry->d_parent, &dentry->d_name); - if (expiring) { - /* - * If we are racing with expire the request might not - * be quite complete but the directory has been removed - * so it must have been successful, so just wait for it. - */ - ino = autofs4_dentry_ino(expiring); - autofs4_expire_wait(expiring); - spin_lock(&sbi->lookup_lock); - if (!list_empty(&ino->expiring)) - list_del_init(&ino->expiring); - spin_unlock(&sbi->lookup_lock); - dput(expiring); - } - unhashed = autofs4_lookup_active(sbi, dentry->d_parent, &dentry->d_name); if (unhashed) dentry = unhashed; @@ -538,14 +522,31 @@ static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, s } if (!oz_mode) { + mutex_unlock(&dir->i_mutex); + expiring = autofs4_lookup_expiring(sbi, + dentry->d_parent, + &dentry->d_name); + if (expiring) { + /* + * If we are racing with expire the request might not + * be quite complete but the directory has been removed + * so it must have been successful, so just wait for it. + */ + ino = autofs4_dentry_ino(expiring); + autofs4_expire_wait(expiring); + spin_lock(&sbi->lookup_lock); + if (!list_empty(&ino->expiring)) + list_del_init(&ino->expiring); + spin_unlock(&sbi->lookup_lock); + dput(expiring); + } + spin_lock(&dentry->d_lock); dentry->d_flags |= DCACHE_AUTOFS_PENDING; spin_unlock(&dentry->d_lock); - if (dentry->d_op && dentry->d_op->d_revalidate) { - mutex_unlock(&dir->i_mutex); + if (dentry->d_op && dentry->d_op->d_revalidate) (dentry->d_op->d_revalidate)(dentry, nd); - mutex_lock(&dir->i_mutex); - } + mutex_lock(&dir->i_mutex); } /* From 79955898f961a870cbcc58f6ae13f3741a909da5 Mon Sep 17 00:00:00 2001 From: Ian Kent Date: Tue, 31 Mar 2009 15:24:45 -0700 Subject: [PATCH 397/478] autofs4: fix kernel includes autofs_dev-ioctl.h is included by both the kernel module and user space tools and it includes two kernel header files. Compiles work if the kernel headers are installed but fail otherwise. Signed-off-by: Ian Kent Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/auto_dev-ioctl.h | 7 ++++++- include/linux/auto_fs.h | 8 +++++--- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/include/linux/auto_dev-ioctl.h b/include/linux/auto_dev-ioctl.h index 91a773993a5c..850f39b33e74 100644 --- a/include/linux/auto_dev-ioctl.h +++ b/include/linux/auto_dev-ioctl.h @@ -10,8 +10,13 @@ #ifndef _LINUX_AUTO_DEV_IOCTL_H #define _LINUX_AUTO_DEV_IOCTL_H +#include + +#ifdef __KERNEL__ #include -#include +#else +#include +#endif /* __KERNEL__ */ #define AUTOFS_DEVICE_NAME "autofs" diff --git a/include/linux/auto_fs.h b/include/linux/auto_fs.h index c21e5972a3e8..63265852b7d1 100644 --- a/include/linux/auto_fs.h +++ b/include/linux/auto_fs.h @@ -17,10 +17,12 @@ #ifdef __KERNEL__ #include #include -#include -#endif /* __KERNEL__ */ - +#include #include +#else +#include +#include +#endif /* __KERNEL__ */ /* This file describes autofs v3 */ #define AUTOFS_PROTO_VERSION 3 From 47367a3ba425d70467af0009782098235ddbf204 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 31 Mar 2009 15:24:46 -0700 Subject: [PATCH 398/478] rtc: convert wm8350 use new alarm and update operations These are the only two ioctls so the ioctl() function is also removed. Signed-off-by: Mark Brown Cc: Acked-by: Alessandro Zummo Cc: David Brownell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rtc/rtc-wm8350.c | 39 ++++++++++++++++++--------------------- 1 file changed, 18 insertions(+), 21 deletions(-) diff --git a/drivers/rtc/rtc-wm8350.c b/drivers/rtc/rtc-wm8350.c index 5c5e3aa91385..616630d1e317 100644 --- a/drivers/rtc/rtc-wm8350.c +++ b/drivers/rtc/rtc-wm8350.c @@ -236,6 +236,17 @@ static int wm8350_rtc_start_alarm(struct wm8350 *wm8350) return 0; } +static int wm8350_rtc_alarm_irq_enable(struct device *dev, + unsigned int enabled) +{ + struct wm8350 *wm8350 = dev_get_drvdata(dev); + + if (enabled) + return wm8350_rtc_start_alarm(wm8350); + else + return wm8350_rtc_stop_alarm(wm8350); +} + static int wm8350_rtc_setalarm(struct device *dev, struct rtc_wkalrm *alrm) { struct wm8350 *wm8350 = dev_get_drvdata(dev); @@ -291,30 +302,15 @@ static int wm8350_rtc_setalarm(struct device *dev, struct rtc_wkalrm *alrm) return ret; } -/* - * Handle commands from user-space - */ -static int wm8350_rtc_ioctl(struct device *dev, unsigned int cmd, - unsigned long arg) +static int wm8350_rtc_update_irq_enable(struct device *dev, + unsigned int enabled) { struct wm8350 *wm8350 = dev_get_drvdata(dev); - switch (cmd) { - case RTC_AIE_OFF: - return wm8350_rtc_stop_alarm(wm8350); - case RTC_AIE_ON: - return wm8350_rtc_start_alarm(wm8350); - - case RTC_UIE_OFF: - wm8350_mask_irq(wm8350, WM8350_IRQ_RTC_SEC); - break; - case RTC_UIE_ON: + if (enabled) wm8350_unmask_irq(wm8350, WM8350_IRQ_RTC_SEC); - break; - - default: - return -ENOIOCTLCMD; - } + else + wm8350_mask_irq(wm8350, WM8350_IRQ_RTC_SEC); return 0; } @@ -345,11 +341,12 @@ static void wm8350_rtc_update_handler(struct wm8350 *wm8350, int irq, } static const struct rtc_class_ops wm8350_rtc_ops = { - .ioctl = wm8350_rtc_ioctl, .read_time = wm8350_rtc_readtime, .set_time = wm8350_rtc_settime, .read_alarm = wm8350_rtc_readalarm, .set_alarm = wm8350_rtc_setalarm, + .alarm_irq_enable = wm8350_rtc_alarm_irq_enable, + .update_irq_enable = wm8350_rtc_update_irq_enable, }; #ifdef CONFIG_PM From 78d89ef40c2ff7265df077e20c4d76be7d415204 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 31 Mar 2009 15:24:48 -0700 Subject: [PATCH 399/478] rtc: convert LEAP_YEAR into an inline - the LEAP_YEAR macro is buggy - it references its arg multiple times. Fix this by turning it into a C function. - give it a more approriate name - Move it to rtc.h so that other .c files can use it, instead of copying it. Cc: dann frazier Acked-by: Alessandro Zummo Cc: stephane eranian Cc: "Luck, Tony" Cc: David Brownell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rtc/rtc-lib.c | 7 +++---- include/linux/rtc.h | 6 ++++++ 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/drivers/rtc/rtc-lib.c b/drivers/rtc/rtc-lib.c index dd70bf73ce9d..773851f338b8 100644 --- a/drivers/rtc/rtc-lib.c +++ b/drivers/rtc/rtc-lib.c @@ -26,14 +26,13 @@ static const unsigned short rtc_ydays[2][13] = { }; #define LEAPS_THRU_END_OF(y) ((y)/4 - (y)/100 + (y)/400) -#define LEAP_YEAR(year) ((!(year % 4) && (year % 100)) || !(year % 400)) /* * The number of days in the month. */ int rtc_month_days(unsigned int month, unsigned int year) { - return rtc_days_in_month[month] + (LEAP_YEAR(year) && month == 1); + return rtc_days_in_month[month] + (is_leap_year(year) && month == 1); } EXPORT_SYMBOL(rtc_month_days); @@ -42,7 +41,7 @@ EXPORT_SYMBOL(rtc_month_days); */ int rtc_year_days(unsigned int day, unsigned int month, unsigned int year) { - return rtc_ydays[LEAP_YEAR(year)][month] + day-1; + return rtc_ydays[is_leap_year(year)][month] + day-1; } EXPORT_SYMBOL(rtc_year_days); @@ -66,7 +65,7 @@ void rtc_time_to_tm(unsigned long time, struct rtc_time *tm) - LEAPS_THRU_END_OF(1970 - 1); if (days < 0) { year -= 1; - days += 365 + LEAP_YEAR(year); + days += 365 + is_leap_year(year); } tm->tm_year = year - 1900; tm->tm_yday = days + 1; diff --git a/include/linux/rtc.h b/include/linux/rtc.h index 4046b75563c1..60f88a7fb13d 100644 --- a/include/linux/rtc.h +++ b/include/linux/rtc.h @@ -99,6 +99,7 @@ struct rtc_pll_info { #ifdef __KERNEL__ +#include #include extern int rtc_month_days(unsigned int month, unsigned int year); @@ -232,6 +233,11 @@ int rtc_register(rtc_task_t *task); int rtc_unregister(rtc_task_t *task); int rtc_control(rtc_task_t *t, unsigned int cmd, unsigned long arg); +static inline bool is_leap_year(unsigned int year) +{ + return (!(year % 4) && (year % 100)) || !(year % 400); +} + #endif /* __KERNEL__ */ #endif /* _LINUX_RTC_H_ */ From 5e3fd9e5810f141c9c70c36992d4ed72b3aa1fed Mon Sep 17 00:00:00 2001 From: dann frazier Date: Tue, 31 Mar 2009 15:24:48 -0700 Subject: [PATCH 400/478] rtc: add platform driver for EFI Munge Stephane Eranian's efirtc.c code into an rtc platform driver [akpm@linux-foundation.org: use is_leap_year()] Signed-off-by: dann frazier Cc: Alessandro Zummo Cc: stephane eranian Cc: "Luck, Tony" Cc: David Brownell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/kernel/time.c | 16 +++ drivers/rtc/Kconfig | 10 ++ drivers/rtc/Makefile | 1 + drivers/rtc/rtc-efi.c | 235 ++++++++++++++++++++++++++++++++++++++++ 4 files changed, 262 insertions(+) create mode 100644 drivers/rtc/rtc-efi.c diff --git a/arch/ia64/kernel/time.c b/arch/ia64/kernel/time.c index f0ebb342409d..d6747bae52d8 100644 --- a/arch/ia64/kernel/time.c +++ b/arch/ia64/kernel/time.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -405,6 +406,21 @@ static struct irqaction timer_irqaction = { .name = "timer" }; +static struct platform_device rtc_efi_dev = { + .name = "rtc-efi", + .id = -1, +}; + +static int __init rtc_init(void) +{ + if (platform_device_register(&rtc_efi_dev) < 0) + printk(KERN_ERR "unable to register rtc device...\n"); + + /* not necessarily an error */ + return 0; +} +module_init(rtc_init); + void __init time_init (void) { diff --git a/drivers/rtc/Kconfig b/drivers/rtc/Kconfig index 81450fbd8b12..d669b9169278 100644 --- a/drivers/rtc/Kconfig +++ b/drivers/rtc/Kconfig @@ -440,6 +440,16 @@ config RTC_DRV_DS1742 This driver can also be built as a module. If so, the module will be called rtc-ds1742. +config RTC_DRV_EFI + tristate "EFI RTC" + depends on IA64 + help + If you say yes here you will get support for the EFI + Real Time Clock. + + This driver can also be built as a module. If so, the module + will be called rtc-efi. + config RTC_DRV_STK17TA8 tristate "Simtek STK17TA8" depends on RTC_CLASS diff --git a/drivers/rtc/Makefile b/drivers/rtc/Makefile index 0e697aa51caa..e7b09986d26e 100644 --- a/drivers/rtc/Makefile +++ b/drivers/rtc/Makefile @@ -36,6 +36,7 @@ obj-$(CONFIG_RTC_DRV_DS1553) += rtc-ds1553.o obj-$(CONFIG_RTC_DRV_DS1672) += rtc-ds1672.o obj-$(CONFIG_RTC_DRV_DS1742) += rtc-ds1742.o obj-$(CONFIG_RTC_DRV_DS3234) += rtc-ds3234.o +obj-$(CONFIG_RTC_DRV_EFI) += rtc-efi.o obj-$(CONFIG_RTC_DRV_EP93XX) += rtc-ep93xx.o obj-$(CONFIG_RTC_DRV_FM3130) += rtc-fm3130.o obj-$(CONFIG_RTC_DRV_ISL1208) += rtc-isl1208.o diff --git a/drivers/rtc/rtc-efi.c b/drivers/rtc/rtc-efi.c new file mode 100644 index 000000000000..550292304b0f --- /dev/null +++ b/drivers/rtc/rtc-efi.c @@ -0,0 +1,235 @@ +/* + * rtc-efi: RTC Class Driver for EFI-based systems + * + * Copyright (C) 2009 Hewlett-Packard Development Company, L.P. + * + * Author: dann frazier + * Based on efirtc.c by Stephane Eranian + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + */ + +#include +#include +#include +#include +#include +#include + +#define EFI_ISDST (EFI_TIME_ADJUST_DAYLIGHT|EFI_TIME_IN_DAYLIGHT) +/* + * EFI Epoch is 1/1/1998 + */ +#define EFI_RTC_EPOCH 1998 + +/* + * returns day of the year [0-365] + */ +static inline int +compute_yday(efi_time_t *eft) +{ + /* efi_time_t.month is in the [1-12] so, we need -1 */ + return rtc_year_days(eft->day - 1, eft->month - 1, eft->year); +} +/* + * returns day of the week [0-6] 0=Sunday + * + * Don't try to provide a year that's before 1998, please ! + */ +static int +compute_wday(efi_time_t *eft) +{ + int y; + int ndays = 0; + + if (eft->year < 1998) { + printk(KERN_ERR "efirtc: EFI year < 1998, invalid date\n"); + return -1; + } + + for (y = EFI_RTC_EPOCH; y < eft->year; y++) + ndays += 365 + (is_leap_year(y) ? 1 : 0); + + ndays += compute_yday(eft); + + /* + * 4=1/1/1998 was a Thursday + */ + return (ndays + 4) % 7; +} + +static void +convert_to_efi_time(struct rtc_time *wtime, efi_time_t *eft) +{ + eft->year = wtime->tm_year + 1900; + eft->month = wtime->tm_mon + 1; + eft->day = wtime->tm_mday; + eft->hour = wtime->tm_hour; + eft->minute = wtime->tm_min; + eft->second = wtime->tm_sec; + eft->nanosecond = 0; + eft->daylight = wtime->tm_isdst ? EFI_ISDST : 0; + eft->timezone = EFI_UNSPECIFIED_TIMEZONE; +} + +static void +convert_from_efi_time(efi_time_t *eft, struct rtc_time *wtime) +{ + memset(wtime, 0, sizeof(*wtime)); + wtime->tm_sec = eft->second; + wtime->tm_min = eft->minute; + wtime->tm_hour = eft->hour; + wtime->tm_mday = eft->day; + wtime->tm_mon = eft->month - 1; + wtime->tm_year = eft->year - 1900; + + /* day of the week [0-6], Sunday=0 */ + wtime->tm_wday = compute_wday(eft); + + /* day in the year [1-365]*/ + wtime->tm_yday = compute_yday(eft); + + + switch (eft->daylight & EFI_ISDST) { + case EFI_ISDST: + wtime->tm_isdst = 1; + break; + case EFI_TIME_ADJUST_DAYLIGHT: + wtime->tm_isdst = 0; + break; + default: + wtime->tm_isdst = -1; + } +} + +static int efi_read_alarm(struct device *dev, struct rtc_wkalrm *wkalrm) +{ + efi_time_t eft; + efi_status_t status; + + /* + * As of EFI v1.10, this call always returns an unsupported status + */ + status = efi.get_wakeup_time((efi_bool_t *)&wkalrm->enabled, + (efi_bool_t *)&wkalrm->pending, &eft); + + if (status != EFI_SUCCESS) + return -EINVAL; + + convert_from_efi_time(&eft, &wkalrm->time); + + return rtc_valid_tm(&wkalrm->time); +} + +static int efi_set_alarm(struct device *dev, struct rtc_wkalrm *wkalrm) +{ + efi_time_t eft; + efi_status_t status; + + convert_to_efi_time(&wkalrm->time, &eft); + + /* + * XXX Fixme: + * As of EFI 0.92 with the firmware I have on my + * machine this call does not seem to work quite + * right + * + * As of v1.10, this call always returns an unsupported status + */ + status = efi.set_wakeup_time((efi_bool_t)wkalrm->enabled, &eft); + + printk(KERN_WARNING "write status is %d\n", (int)status); + + return status == EFI_SUCCESS ? 0 : -EINVAL; +} + +static int efi_read_time(struct device *dev, struct rtc_time *tm) +{ + efi_status_t status; + efi_time_t eft; + efi_time_cap_t cap; + + status = efi.get_time(&eft, &cap); + + if (status != EFI_SUCCESS) { + /* should never happen */ + printk(KERN_ERR "efitime: can't read time\n"); + return -EINVAL; + } + + convert_from_efi_time(&eft, tm); + + return rtc_valid_tm(tm); +} + +static int efi_set_time(struct device *dev, struct rtc_time *tm) +{ + efi_status_t status; + efi_time_t eft; + + convert_to_efi_time(tm, &eft); + + status = efi.set_time(&eft); + + return status == EFI_SUCCESS ? 0 : -EINVAL; +} + +static const struct rtc_class_ops efi_rtc_ops = { + .read_time = efi_read_time, + .set_time = efi_set_time, + .read_alarm = efi_read_alarm, + .set_alarm = efi_set_alarm, +}; + +static int __init efi_rtc_probe(struct platform_device *dev) +{ + struct rtc_device *rtc; + + rtc = rtc_device_register("rtc-efi", &dev->dev, &efi_rtc_ops, + THIS_MODULE); + if (IS_ERR(rtc)) + return PTR_ERR(rtc); + + platform_set_drvdata(dev, rtc); + + return 0; +} + +static int __exit efi_rtc_remove(struct platform_device *dev) +{ + struct rtc_device *rtc = platform_get_drvdata(dev); + + rtc_device_unregister(rtc); + + return 0; +} + +static struct platform_driver efi_rtc_driver = { + .driver = { + .name = "rtc-efi", + .owner = THIS_MODULE, + }, + .probe = efi_rtc_probe, + .remove = __exit_p(efi_rtc_remove), +}; + +static int __init efi_rtc_init(void) +{ + return platform_driver_probe(&efi_rtc_driver, efi_rtc_probe); +} + +static void __exit efi_rtc_exit(void) +{ + platform_driver_unregister(&efi_rtc_driver); +} + +module_init(efi_rtc_init); +module_exit(efi_rtc_exit); + +MODULE_AUTHOR("dann frazier "); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("EFI RTC driver"); From 93d456d9802a40859ecc3d67be8c759b03aa487d Mon Sep 17 00:00:00 2001 From: dann frazier Date: Tue, 31 Mar 2009 15:24:49 -0700 Subject: [PATCH 401/478] rtc-parisc: add a missing include for linux/rtc.h Signed-off-by: dann frazier Cc: Alessandro Zummo Cc: Kyle McMartin Cc: Grant Grundler Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rtc/rtc-parisc.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/rtc/rtc-parisc.c b/drivers/rtc/rtc-parisc.c index c6bfa6fe1a2a..319bb5d445ea 100644 --- a/drivers/rtc/rtc-parisc.c +++ b/drivers/rtc/rtc-parisc.c @@ -7,6 +7,7 @@ #include #include #include +#include #include From 05439f1f89aebbdb791c49e980f0f31652e4055b Mon Sep 17 00:00:00 2001 From: dann frazier Date: Tue, 31 Mar 2009 15:24:50 -0700 Subject: [PATCH 402/478] rtc-parisc: remove redundant locking The RTC subsystem proides ops locking, no need to implement our own Signed-off-by: dann frazier Cc: Alessandro Zummo Cc: Kyle McMartin Cc: Grant Grundler Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rtc/rtc-parisc.c | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/drivers/rtc/rtc-parisc.c b/drivers/rtc/rtc-parisc.c index 319bb5d445ea..cb087ad407f6 100644 --- a/drivers/rtc/rtc-parisc.c +++ b/drivers/rtc/rtc-parisc.c @@ -14,17 +14,13 @@ /* as simple as can be, and no simpler. */ struct parisc_rtc { struct rtc_device *rtc; - spinlock_t lock; }; static int parisc_get_time(struct device *dev, struct rtc_time *tm) { - struct parisc_rtc *p = dev_get_drvdata(dev); - unsigned long flags, ret; + unsigned long ret; - spin_lock_irqsave(&p->lock, flags); ret = get_rtc_time(tm); - spin_unlock_irqrestore(&p->lock, flags); if (ret & RTC_BATT_BAD) return -EOPNOTSUPP; @@ -34,13 +30,9 @@ static int parisc_get_time(struct device *dev, struct rtc_time *tm) static int parisc_set_time(struct device *dev, struct rtc_time *tm) { - struct parisc_rtc *p = dev_get_drvdata(dev); - unsigned long flags; int ret; - spin_lock_irqsave(&p->lock, flags); ret = set_rtc_time(tm); - spin_unlock_irqrestore(&p->lock, flags); if (ret < 0) return -EOPNOTSUPP; @@ -61,8 +53,6 @@ static int __devinit parisc_rtc_probe(struct platform_device *dev) if (!p) return -ENOMEM; - spin_lock_init(&p->lock); - p->rtc = rtc_device_register("rtc-parisc", &dev->dev, &parisc_rtc_ops, THIS_MODULE); if (IS_ERR(p->rtc)) { From 6b318f66dca829a72c974083cc10fd5556eec6f1 Mon Sep 17 00:00:00 2001 From: dann frazier Date: Tue, 31 Mar 2009 15:24:51 -0700 Subject: [PATCH 403/478] rtc-parisc: remove struct parisc_rtc parisc_rtc now only includes an rtc_device pointer, so let's just use the rtc_device type directly. Signed-off-by: dann frazier Cc: Alessandro Zummo Cc: Kyle McMartin Cc: Grant Grundler Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rtc/rtc-parisc.c | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/drivers/rtc/rtc-parisc.c b/drivers/rtc/rtc-parisc.c index cb087ad407f6..ee4e9a3fb583 100644 --- a/drivers/rtc/rtc-parisc.c +++ b/drivers/rtc/rtc-parisc.c @@ -11,11 +11,6 @@ #include -/* as simple as can be, and no simpler. */ -struct parisc_rtc { - struct rtc_device *rtc; -}; - static int parisc_get_time(struct device *dev, struct rtc_time *tm) { unsigned long ret; @@ -47,16 +42,16 @@ static const struct rtc_class_ops parisc_rtc_ops = { static int __devinit parisc_rtc_probe(struct platform_device *dev) { - struct parisc_rtc *p; + struct rtc_device *p; p = kzalloc(sizeof (*p), GFP_KERNEL); if (!p) return -ENOMEM; - p->rtc = rtc_device_register("rtc-parisc", &dev->dev, &parisc_rtc_ops, - THIS_MODULE); - if (IS_ERR(p->rtc)) { - int err = PTR_ERR(p->rtc); + p = rtc_device_register("rtc-parisc", &dev->dev, &parisc_rtc_ops, + THIS_MODULE); + if (IS_ERR(p)) { + int err = PTR_ERR(p); kfree(p); return err; } @@ -68,9 +63,9 @@ static int __devinit parisc_rtc_probe(struct platform_device *dev) static int __devexit parisc_rtc_remove(struct platform_device *dev) { - struct parisc_rtc *p = platform_get_drvdata(dev); + struct rtc_device *p = platform_get_drvdata(dev); - rtc_device_unregister(p->rtc); + rtc_device_unregister(p); kfree(p); return 0; From f62bacd4d48a1a6b8931a0140fb2324a06dd89fe Mon Sep 17 00:00:00 2001 From: dann frazier Date: Tue, 31 Mar 2009 15:24:52 -0700 Subject: [PATCH 404/478] rtc-parisc: use rtc_valid_tm() in parisc_get_time Use the return value of rtc_valid_tm() instead of just returning 0. Signed-off-by: dann frazier Cc: Alessandro Zummo Cc: Kyle McMartin Cc: Grant Grundler Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rtc/rtc-parisc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/rtc/rtc-parisc.c b/drivers/rtc/rtc-parisc.c index ee4e9a3fb583..0477cc129c6d 100644 --- a/drivers/rtc/rtc-parisc.c +++ b/drivers/rtc/rtc-parisc.c @@ -20,7 +20,7 @@ static int parisc_get_time(struct device *dev, struct rtc_time *tm) if (ret & RTC_BATT_BAD) return -EOPNOTSUPP; - return 0; + return rtc_valid_tm(tm); } static int parisc_set_time(struct device *dev, struct rtc_time *tm) From 2b93cff4dc184bf7b4858dc7a9bd2e8d33c1a3eb Mon Sep 17 00:00:00 2001 From: dann frazier Date: Tue, 31 Mar 2009 15:24:52 -0700 Subject: [PATCH 405/478] rtc-parisc: use platform_driver_probe This isn't a hotpluggable device, so call platform_driver_probe directly in parisc_rtc_init Signed-off-by: dann frazier Cc: Alessandro Zummo Cc: Kyle McMartin Cc: Grant Grundler Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rtc/rtc-parisc.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/drivers/rtc/rtc-parisc.c b/drivers/rtc/rtc-parisc.c index 0477cc129c6d..a2ca07ad42cb 100644 --- a/drivers/rtc/rtc-parisc.c +++ b/drivers/rtc/rtc-parisc.c @@ -40,19 +40,14 @@ static const struct rtc_class_ops parisc_rtc_ops = { .set_time = parisc_set_time, }; -static int __devinit parisc_rtc_probe(struct platform_device *dev) +static int __init parisc_rtc_probe(struct platform_device *dev) { struct rtc_device *p; - p = kzalloc(sizeof (*p), GFP_KERNEL); - if (!p) - return -ENOMEM; - p = rtc_device_register("rtc-parisc", &dev->dev, &parisc_rtc_ops, THIS_MODULE); if (IS_ERR(p)) { int err = PTR_ERR(p); - kfree(p); return err; } @@ -61,12 +56,11 @@ static int __devinit parisc_rtc_probe(struct platform_device *dev) return 0; } -static int __devexit parisc_rtc_remove(struct platform_device *dev) +static int __exit parisc_rtc_remove(struct platform_device *dev) { struct rtc_device *p = platform_get_drvdata(dev); rtc_device_unregister(p); - kfree(p); return 0; } @@ -82,7 +76,7 @@ static struct platform_driver parisc_rtc_driver = { static int __init parisc_rtc_init(void) { - return platform_driver_register(&parisc_rtc_driver); + return platform_driver_probe(&parisc_rtc_driver, parisc_rtc_probe); } static void __exit parisc_rtc_fini(void) From d09c091b6a8b2b73381e514d68c73580f2294b03 Mon Sep 17 00:00:00 2001 From: dann frazier Date: Tue, 31 Mar 2009 15:24:53 -0700 Subject: [PATCH 406/478] rtc-parisc: declare rtc_parisc_dev as static Signed-off-by: dann frazier Cc: Alessandro Zummo Cc: Kyle McMartin Cc: Grant Grundler Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/parisc/kernel/time.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/parisc/kernel/time.c b/arch/parisc/kernel/time.c index 9d46c43a4152..a479d08d5ed3 100644 --- a/arch/parisc/kernel/time.c +++ b/arch/parisc/kernel/time.c @@ -216,7 +216,7 @@ void __init start_cpu_itimer(void) per_cpu(cpu_data, cpu).it_value = next_tick; } -struct platform_device rtc_parisc_dev = { +static struct platform_device rtc_parisc_dev = { .name = "rtc-parisc", .id = -1, }; From cd875d4767f821dabd0feb668623a42e9d48158a Mon Sep 17 00:00:00 2001 From: dann frazier Date: Tue, 31 Mar 2009 15:24:54 -0700 Subject: [PATCH 407/478] rtc-parisc: remove unnecessary ret variable Signed-off-by: dann frazier Cc: Alessandro Zummo Cc: Kyle McMartin Cc: Grant Grundler Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/parisc/kernel/time.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/arch/parisc/kernel/time.c b/arch/parisc/kernel/time.c index a479d08d5ed3..e75cae6072c5 100644 --- a/arch/parisc/kernel/time.c +++ b/arch/parisc/kernel/time.c @@ -223,10 +223,7 @@ static struct platform_device rtc_parisc_dev = { static int __init rtc_init(void) { - int ret; - - ret = platform_device_register(&rtc_parisc_dev); - if (ret < 0) + if (platform_device_register(&rtc_parisc_dev) < 0) printk(KERN_ERR "unable to register rtc device...\n"); /* not necessarily an error */ From a8c20cd3f7e2e223898c53adfb74420db5d9ac47 Mon Sep 17 00:00:00 2001 From: dann frazier Date: Tue, 31 Mar 2009 15:24:55 -0700 Subject: [PATCH 408/478] rtc-parisc: remove a couple unnecessary variables Signed-off-by: dann frazier Reviewed-by: Grant Grundler Cc: Alessandro Zummo Cc: Kyle McMartin Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rtc/rtc-parisc.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/drivers/rtc/rtc-parisc.c b/drivers/rtc/rtc-parisc.c index a2ca07ad42cb..c29e91814c9a 100644 --- a/drivers/rtc/rtc-parisc.c +++ b/drivers/rtc/rtc-parisc.c @@ -25,11 +25,7 @@ static int parisc_get_time(struct device *dev, struct rtc_time *tm) static int parisc_set_time(struct device *dev, struct rtc_time *tm) { - int ret; - - ret = set_rtc_time(tm); - - if (ret < 0) + if (set_rtc_time(tm) < 0) return -EOPNOTSUPP; return 0; @@ -46,10 +42,8 @@ static int __init parisc_rtc_probe(struct platform_device *dev) p = rtc_device_register("rtc-parisc", &dev->dev, &parisc_rtc_ops, THIS_MODULE); - if (IS_ERR(p)) { - int err = PTR_ERR(p); - return err; - } + if (IS_ERR(p)) + return PTR_ERR(p); platform_set_drvdata(dev, p); From b250c96ea9d7bc0b9ac3ff6e878b254b0b0b6abc Mon Sep 17 00:00:00 2001 From: dann frazier Date: Tue, 31 Mar 2009 15:24:55 -0700 Subject: [PATCH 409/478] rtc-parisc: rename p pointer to rtc Signed-off-by: dann frazier Cc: Alessandro Zummo Cc: Kyle McMartin Cc: Grant Grundler Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rtc/rtc-parisc.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/rtc/rtc-parisc.c b/drivers/rtc/rtc-parisc.c index c29e91814c9a..b966f56da976 100644 --- a/drivers/rtc/rtc-parisc.c +++ b/drivers/rtc/rtc-parisc.c @@ -38,23 +38,23 @@ static const struct rtc_class_ops parisc_rtc_ops = { static int __init parisc_rtc_probe(struct platform_device *dev) { - struct rtc_device *p; + struct rtc_device *rtc; - p = rtc_device_register("rtc-parisc", &dev->dev, &parisc_rtc_ops, - THIS_MODULE); - if (IS_ERR(p)) - return PTR_ERR(p); + rtc = rtc_device_register("rtc-parisc", &dev->dev, &parisc_rtc_ops, + THIS_MODULE); + if (IS_ERR(rtc)) + return PTR_ERR(rtc); - platform_set_drvdata(dev, p); + platform_set_drvdata(dev, rtc); return 0; } static int __exit parisc_rtc_remove(struct platform_device *dev) { - struct rtc_device *p = platform_get_drvdata(dev); + struct rtc_device *rtc = platform_get_drvdata(dev); - rtc_device_unregister(p); + rtc_device_unregister(rtc); return 0; } From 30e7b039b1f9a6d5a4e50df5469a4f347ea1aa77 Mon Sep 17 00:00:00 2001 From: Ed Swierk Date: Tue, 31 Mar 2009 15:24:56 -0700 Subject: [PATCH 410/478] rtc-ds1307: true SMBus compatibility Allow the rtc-ds1307 driver to work with SMBus controllers like nforce2 that do not support i2c block transfers. Signed-off-by: Ed Swierk Acked-by: Jean Delvare Acked-by: David Brownell Cc: Alessandro Zummo Cc: BARRE Sebastien Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rtc/rtc-ds1307.c | 109 ++++++++++++++++++++++++++++++++++----- 1 file changed, 97 insertions(+), 12 deletions(-) diff --git a/drivers/rtc/rtc-ds1307.c b/drivers/rtc/rtc-ds1307.c index 7e5155e88ac7..1c975e6439b3 100644 --- a/drivers/rtc/rtc-ds1307.c +++ b/drivers/rtc/rtc-ds1307.c @@ -94,6 +94,10 @@ struct ds1307 { struct i2c_client *client; struct rtc_device *rtc; struct work_struct work; + s32 (*read_block_data)(struct i2c_client *client, u8 command, + u8 length, u8 *values); + s32 (*write_block_data)(struct i2c_client *client, u8 command, + u8 length, const u8 *values); }; struct chip_desc { @@ -132,6 +136,79 @@ MODULE_DEVICE_TABLE(i2c, ds1307_id); /*----------------------------------------------------------------------*/ +#define BLOCK_DATA_MAX_TRIES 10 + +static s32 ds1307_read_block_data_once(struct i2c_client *client, u8 command, + u8 length, u8 *values) +{ + s32 i, data; + + for (i = 0; i < length; i++) { + data = i2c_smbus_read_byte_data(client, command + i); + if (data < 0) + return data; + values[i] = data; + } + return i; +} + +static s32 ds1307_read_block_data(struct i2c_client *client, u8 command, + u8 length, u8 *values) +{ + u8 oldvalues[I2C_SMBUS_BLOCK_MAX]; + s32 ret; + int tries = 0; + + dev_dbg(&client->dev, "ds1307_read_block_data (length=%d)\n", length); + ret = ds1307_read_block_data_once(client, command, length, values); + if (ret < 0) + return ret; + do { + if (++tries > BLOCK_DATA_MAX_TRIES) { + dev_err(&client->dev, + "ds1307_read_block_data failed\n"); + return -EIO; + } + memcpy(oldvalues, values, length); + ret = ds1307_read_block_data_once(client, command, length, + values); + if (ret < 0) + return ret; + } while (memcmp(oldvalues, values, length)); + return length; +} + +static s32 ds1307_write_block_data(struct i2c_client *client, u8 command, + u8 length, const u8 *values) +{ + u8 currvalues[I2C_SMBUS_BLOCK_MAX]; + int tries = 0; + + dev_dbg(&client->dev, "ds1307_write_block_data (length=%d)\n", length); + do { + s32 i, ret; + + if (++tries > BLOCK_DATA_MAX_TRIES) { + dev_err(&client->dev, + "ds1307_write_block_data failed\n"); + return -EIO; + } + for (i = 0; i < length; i++) { + ret = i2c_smbus_write_byte_data(client, command + i, + values[i]); + if (ret < 0) + return ret; + } + ret = ds1307_read_block_data_once(client, command, length, + currvalues); + if (ret < 0) + return ret; + } while (memcmp(currvalues, values, length)); + return length; +} + +/*----------------------------------------------------------------------*/ + /* * The IRQ logic includes a "real" handler running in IRQ context just * long enough to schedule this workqueue entry. We need a task context @@ -202,7 +279,7 @@ static int ds1307_get_time(struct device *dev, struct rtc_time *t) int tmp; /* read the RTC date and time registers all at once */ - tmp = i2c_smbus_read_i2c_block_data(ds1307->client, + tmp = ds1307->read_block_data(ds1307->client, DS1307_REG_SECS, 7, ds1307->regs); if (tmp != 7) { dev_err(dev, "%s error %d\n", "read", tmp); @@ -279,7 +356,7 @@ static int ds1307_set_time(struct device *dev, struct rtc_time *t) "write", buf[0], buf[1], buf[2], buf[3], buf[4], buf[5], buf[6]); - result = i2c_smbus_write_i2c_block_data(ds1307->client, 0, 7, buf); + result = ds1307->write_block_data(ds1307->client, 0, 7, buf); if (result < 0) { dev_err(dev, "%s error %d\n", "write", result); return result; @@ -297,7 +374,7 @@ static int ds1337_read_alarm(struct device *dev, struct rtc_wkalrm *t) return -EINVAL; /* read all ALARM1, ALARM2, and status registers at once */ - ret = i2c_smbus_read_i2c_block_data(client, + ret = ds1307->read_block_data(client, DS1339_REG_ALARM1_SECS, 9, ds1307->regs); if (ret != 9) { dev_err(dev, "%s error %d\n", "alarm read", ret); @@ -356,7 +433,7 @@ static int ds1337_set_alarm(struct device *dev, struct rtc_wkalrm *t) t->enabled, t->pending); /* read current status of both alarms and the chip */ - ret = i2c_smbus_read_i2c_block_data(client, + ret = ds1307->read_block_data(client, DS1339_REG_ALARM1_SECS, 9, buf); if (ret != 9) { dev_err(dev, "%s error %d\n", "alarm write", ret); @@ -391,7 +468,7 @@ static int ds1337_set_alarm(struct device *dev, struct rtc_wkalrm *t) } buf[8] = status & ~(DS1337_BIT_A1I | DS1337_BIT_A2I); - ret = i2c_smbus_write_i2c_block_data(client, + ret = ds1307->write_block_data(client, DS1339_REG_ALARM1_SECS, 9, buf); if (ret < 0) { dev_err(dev, "can't set alarm time\n"); @@ -479,7 +556,7 @@ ds1307_nvram_read(struct kobject *kobj, struct bin_attribute *attr, if (unlikely(!count)) return count; - result = i2c_smbus_read_i2c_block_data(client, 8 + off, count, buf); + result = ds1307->read_block_data(client, 8 + off, count, buf); if (result < 0) dev_err(&client->dev, "%s error %d\n", "nvram read", result); return result; @@ -490,9 +567,11 @@ ds1307_nvram_write(struct kobject *kobj, struct bin_attribute *attr, char *buf, loff_t off, size_t count) { struct i2c_client *client; + struct ds1307 *ds1307; int result; client = kobj_to_i2c_client(kobj); + ds1307 = i2c_get_clientdata(client); if (unlikely(off >= NVRAM_SIZE)) return -EFBIG; @@ -501,7 +580,7 @@ ds1307_nvram_write(struct kobject *kobj, struct bin_attribute *attr, if (unlikely(!count)) return count; - result = i2c_smbus_write_i2c_block_data(client, 8 + off, count, buf); + result = ds1307->write_block_data(client, 8 + off, count, buf); if (result < 0) { dev_err(&client->dev, "%s error %d\n", "nvram write", result); return result; @@ -535,9 +614,8 @@ static int __devinit ds1307_probe(struct i2c_client *client, int want_irq = false; unsigned char *buf; - if (!i2c_check_functionality(adapter, - I2C_FUNC_SMBUS_WRITE_BYTE_DATA | - I2C_FUNC_SMBUS_I2C_BLOCK)) + if (!i2c_check_functionality(adapter, I2C_FUNC_SMBUS_BYTE_DATA) + && !i2c_check_functionality(adapter, I2C_FUNC_SMBUS_I2C_BLOCK)) return -EIO; if (!(ds1307 = kzalloc(sizeof(struct ds1307), GFP_KERNEL))) @@ -547,6 +625,13 @@ static int __devinit ds1307_probe(struct i2c_client *client, i2c_set_clientdata(client, ds1307); ds1307->type = id->driver_data; buf = ds1307->regs; + if (i2c_check_functionality(adapter, I2C_FUNC_SMBUS_I2C_BLOCK)) { + ds1307->read_block_data = i2c_smbus_read_i2c_block_data; + ds1307->write_block_data = i2c_smbus_write_i2c_block_data; + } else { + ds1307->read_block_data = ds1307_read_block_data; + ds1307->write_block_data = ds1307_write_block_data; + } switch (ds1307->type) { case ds_1337: @@ -557,7 +642,7 @@ static int __devinit ds1307_probe(struct i2c_client *client, want_irq = true; } /* get registers that the "rtc" read below won't read... */ - tmp = i2c_smbus_read_i2c_block_data(ds1307->client, + tmp = ds1307->read_block_data(ds1307->client, DS1337_REG_CONTROL, 2, buf); if (tmp != 2) { pr_debug("read error %d\n", tmp); @@ -595,7 +680,7 @@ static int __devinit ds1307_probe(struct i2c_client *client, read_rtc: /* read RTC registers */ - tmp = i2c_smbus_read_i2c_block_data(ds1307->client, 0, 8, buf); + tmp = ds1307->read_block_data(ds1307->client, 0, 8, buf); if (tmp != 8) { pr_debug("read error %d\n", tmp); err = -EIO; From a216685818a54b4f15235068b53908f954850251 Mon Sep 17 00:00:00 2001 From: Matthias Fuchs Date: Tue, 31 Mar 2009 15:24:58 -0700 Subject: [PATCH 411/478] rtc: add EPSON RX8025 support to DS1307 RTC driver Add support for the EPSON RX8025 RTC. The date/time registers of this chip are compatible with the DS1307. Signed-off-by: Matthias Fuchs Signed-off-by: Alessandro Zummo Cc: David Brownell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rtc/Kconfig | 7 ++-- drivers/rtc/rtc-ds1307.c | 80 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 84 insertions(+), 3 deletions(-) diff --git a/drivers/rtc/Kconfig b/drivers/rtc/Kconfig index d669b9169278..09d5cd33a3f6 100644 --- a/drivers/rtc/Kconfig +++ b/drivers/rtc/Kconfig @@ -129,13 +129,14 @@ comment "I2C RTC drivers" if I2C config RTC_DRV_DS1307 - tristate "Dallas/Maxim DS1307/37/38/39/40, ST M41T00" + tristate "Dallas/Maxim DS1307/37/38/39/40, ST M41T00, EPSON RX-8025" help If you say yes here you get support for various compatible RTC chips (often with battery backup) connected with I2C. This driver should handle DS1307, DS1337, DS1338, DS1339, DS1340, ST M41T00, - and probably other chips. In some cases the RTC must already - have been initialized (by manufacturing or a bootloader). + EPSON RX-8025 and probably other chips. In some cases the RTC + must already have been initialized (by manufacturing or a + bootloader). The first seven registers on these chips hold an RTC, and other registers may add features such as NVRAM, a trickle charger for diff --git a/drivers/rtc/rtc-ds1307.c b/drivers/rtc/rtc-ds1307.c index 1c975e6439b3..2c4a65302a9d 100644 --- a/drivers/rtc/rtc-ds1307.c +++ b/drivers/rtc/rtc-ds1307.c @@ -3,6 +3,7 @@ * * Copyright (C) 2005 James Chapman (ds1337 core) * Copyright (C) 2006 David Brownell + * Copyright (C) 2009 Matthias Fuchs (rx8025 support) * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -31,6 +32,7 @@ enum ds_type { ds_1339, ds_1340, m41t00, + rx_8025, // rs5c372 too? different address... }; @@ -83,6 +85,12 @@ enum ds_type { #define DS1339_REG_ALARM1_SECS 0x07 #define DS1339_REG_TRICKLE 0x10 +#define RX8025_REG_CTRL1 0x0e +# define RX8025_BIT_2412 0x20 +#define RX8025_REG_CTRL2 0x0f +# define RX8025_BIT_PON 0x10 +# define RX8025_BIT_VDET 0x40 +# define RX8025_BIT_XST 0x20 struct ds1307 { @@ -121,6 +129,8 @@ static const struct chip_desc chips[] = { [ds_1340] = { }, [m41t00] = { +}, +[rx_8025] = { }, }; static const struct i2c_device_id ds1307_id[] = { @@ -130,6 +140,7 @@ static const struct i2c_device_id ds1307_id[] = { { "ds1339", ds_1339 }, { "ds1340", ds_1340 }, { "m41t00", m41t00 }, + { "rx8025", rx_8025 }, { } }; MODULE_DEVICE_TABLE(i2c, ds1307_id); @@ -674,6 +685,72 @@ static int __devinit ds1307_probe(struct i2c_client *client, dev_warn(&client->dev, "SET TIME!\n"); } break; + + case rx_8025: + tmp = i2c_smbus_read_i2c_block_data(ds1307->client, + RX8025_REG_CTRL1 << 4 | 0x08, 2, buf); + if (tmp != 2) { + pr_debug("read error %d\n", tmp); + err = -EIO; + goto exit_free; + } + + /* oscillator off? turn it on, so clock can tick. */ + if (!(ds1307->regs[1] & RX8025_BIT_XST)) { + ds1307->regs[1] |= RX8025_BIT_XST; + i2c_smbus_write_byte_data(client, + RX8025_REG_CTRL2 << 4 | 0x08, + ds1307->regs[1]); + dev_warn(&client->dev, + "oscillator stop detected - SET TIME!\n"); + } + + if (ds1307->regs[1] & RX8025_BIT_PON) { + ds1307->regs[1] &= ~RX8025_BIT_PON; + i2c_smbus_write_byte_data(client, + RX8025_REG_CTRL2 << 4 | 0x08, + ds1307->regs[1]); + dev_warn(&client->dev, "power-on detected\n"); + } + + if (ds1307->regs[1] & RX8025_BIT_VDET) { + ds1307->regs[1] &= ~RX8025_BIT_VDET; + i2c_smbus_write_byte_data(client, + RX8025_REG_CTRL2 << 4 | 0x08, + ds1307->regs[1]); + dev_warn(&client->dev, "voltage drop detected\n"); + } + + /* make sure we are running in 24hour mode */ + if (!(ds1307->regs[0] & RX8025_BIT_2412)) { + u8 hour; + + /* switch to 24 hour mode */ + i2c_smbus_write_byte_data(client, + RX8025_REG_CTRL1 << 4 | 0x08, + ds1307->regs[0] | + RX8025_BIT_2412); + + tmp = i2c_smbus_read_i2c_block_data(ds1307->client, + RX8025_REG_CTRL1 << 4 | 0x08, 2, buf); + if (tmp != 2) { + pr_debug("read error %d\n", tmp); + err = -EIO; + goto exit_free; + } + + /* correct hour */ + hour = bcd2bin(ds1307->regs[DS1307_REG_HOUR]); + if (hour == 12) + hour = 0; + if (ds1307->regs[DS1307_REG_HOUR] & DS1307_BIT_PM) + hour += 12; + + i2c_smbus_write_byte_data(client, + DS1307_REG_HOUR << 4 | 0x08, + hour); + } + break; default: break; } @@ -734,6 +811,7 @@ read_rtc: dev_warn(&client->dev, "SET TIME!\n"); } break; + case rx_8025: case ds_1337: case ds_1339: break; @@ -747,6 +825,8 @@ read_rtc: * systems that will run through year 2100. */ break; + case rx_8025: + break; default: if (!(tmp & DS1307_BIT_12HR)) break; From 62da659a7057f7227a99a42eea6aa606b09c1e8c Mon Sep 17 00:00:00 2001 From: Roel Kluin Date: Tue, 31 Mar 2009 15:24:59 -0700 Subject: [PATCH 412/478] rtc-wm8350: retries will reach -1 With a postfix decrement retries will reach -1 rather than 0, so the warning and error-out will not occur. Signed-off-by: Roel Kluin Acked-by: Mark Brown Cc: Alessandro Zummo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rtc/rtc-wm8350.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/rtc/rtc-wm8350.c b/drivers/rtc/rtc-wm8350.c index 616630d1e317..c91edc572eb6 100644 --- a/drivers/rtc/rtc-wm8350.c +++ b/drivers/rtc/rtc-wm8350.c @@ -122,7 +122,7 @@ static int wm8350_rtc_settime(struct device *dev, struct rtc_time *tm) do { rtc_ctrl = wm8350_reg_read(wm8350, WM8350_RTC_TIME_CONTROL); schedule_timeout_uninterruptible(msecs_to_jiffies(1)); - } while (retries-- && !(rtc_ctrl & WM8350_RTC_STS)); + } while (--retries && !(rtc_ctrl & WM8350_RTC_STS)); if (!retries) { dev_err(dev, "timed out on set confirmation\n"); @@ -437,7 +437,7 @@ static int wm8350_rtc_probe(struct platform_device *pdev) do { timectl = wm8350_reg_read(wm8350, WM8350_RTC_TIME_CONTROL); - } while (timectl & WM8350_RTC_STS && retries--); + } while (timectl & WM8350_RTC_STS && --retries); if (retries == 0) { dev_err(&pdev->dev, "failed to start: timeout\n"); From c08cf9daf66844c60ebe9f89885d3a3e1893e61f Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Tue, 31 Mar 2009 15:24:59 -0700 Subject: [PATCH 413/478] rtc-v3020: coding style cleanup Signed-off-by: Mike Rapoport Acked-by: Alessandro Zummo Cc: David Brownell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rtc/rtc-v3020.c | 40 ++++++++++++++++++---------------------- 1 file changed, 18 insertions(+), 22 deletions(-) diff --git a/drivers/rtc/rtc-v3020.c b/drivers/rtc/rtc-v3020.c index 14d4f036a768..66955cc9c746 100644 --- a/drivers/rtc/rtc-v3020.c +++ b/drivers/rtc/rtc-v3020.c @@ -28,7 +28,7 @@ #include #include -#include +#include #undef DEBUG @@ -63,7 +63,7 @@ static void v3020_set_reg(struct v3020 *chip, unsigned char address, static unsigned char v3020_get_reg(struct v3020 *chip, unsigned char address) { - unsigned int data=0; + unsigned int data = 0; int i; for (i = 0; i < 4; i++) { @@ -106,16 +106,14 @@ static int v3020_read_time(struct device *dev, struct rtc_time *dt) tmp = v3020_get_reg(chip, V3020_YEAR); dt->tm_year = bcd2bin(tmp)+100; -#ifdef DEBUG - printk("\n%s : Read RTC values\n",__func__); - printk("tm_hour: %i\n",dt->tm_hour); - printk("tm_min : %i\n",dt->tm_min); - printk("tm_sec : %i\n",dt->tm_sec); - printk("tm_year: %i\n",dt->tm_year); - printk("tm_mon : %i\n",dt->tm_mon); - printk("tm_mday: %i\n",dt->tm_mday); - printk("tm_wday: %i\n",dt->tm_wday); -#endif + dev_dbg(dev, "\n%s : Read RTC values\n", __func__); + dev_dbg(dev, "tm_hour: %i\n", dt->tm_hour); + dev_dbg(dev, "tm_min : %i\n", dt->tm_min); + dev_dbg(dev, "tm_sec : %i\n", dt->tm_sec); + dev_dbg(dev, "tm_year: %i\n", dt->tm_year); + dev_dbg(dev, "tm_mon : %i\n", dt->tm_mon); + dev_dbg(dev, "tm_mday: %i\n", dt->tm_mday); + dev_dbg(dev, "tm_wday: %i\n", dt->tm_wday); return 0; } @@ -125,15 +123,13 @@ static int v3020_set_time(struct device *dev, struct rtc_time *dt) { struct v3020 *chip = dev_get_drvdata(dev); -#ifdef DEBUG - printk("\n%s : Setting RTC values\n",__func__); - printk("tm_sec : %i\n",dt->tm_sec); - printk("tm_min : %i\n",dt->tm_min); - printk("tm_hour: %i\n",dt->tm_hour); - printk("tm_mday: %i\n",dt->tm_mday); - printk("tm_wday: %i\n",dt->tm_wday); - printk("tm_year: %i\n",dt->tm_year); -#endif + dev_dbg(dev, "\n%s : Setting RTC values\n", __func__); + dev_dbg(dev, "tm_sec : %i\n", dt->tm_sec); + dev_dbg(dev, "tm_min : %i\n", dt->tm_min); + dev_dbg(dev, "tm_hour: %i\n", dt->tm_hour); + dev_dbg(dev, "tm_mday: %i\n", dt->tm_mday); + dev_dbg(dev, "tm_wday: %i\n", dt->tm_wday); + dev_dbg(dev, "tm_year: %i\n", dt->tm_year); /* Write all the values to ram... */ v3020_set_reg(chip, V3020_SECONDS, bin2bcd(dt->tm_sec)); @@ -191,7 +187,7 @@ static int rtc_probe(struct platform_device *pdev) /* Test chip by doing a write/read sequence * to the chip ram */ v3020_set_reg(chip, V3020_SECONDS, 0x33); - if(v3020_get_reg(chip, V3020_SECONDS) != 0x33) { + if (v3020_get_reg(chip, V3020_SECONDS) != 0x33) { retval = -ENODEV; goto err_io; } From fa7af8b1bb6dfca7a0c8541683a9bfffbc8dd345 Mon Sep 17 00:00:00 2001 From: Roel Kluin Date: Tue, 31 Mar 2009 15:25:00 -0700 Subject: [PATCH 414/478] rtc: test before subtraction on unsigned new_alarm is unsigned so test before the subtraction. [akpm@linux-foundation.org: time-wrapping fix] Signed-off-by: Roel Kluin Cc: David Brownell Cc: Alessandro Zummo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rtc/rtc-ds1374.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/rtc/rtc-ds1374.c b/drivers/rtc/rtc-ds1374.c index a5b0fc09f0c6..4d32e328f6cd 100644 --- a/drivers/rtc/rtc-ds1374.c +++ b/drivers/rtc/rtc-ds1374.c @@ -222,16 +222,16 @@ static int ds1374_set_alarm(struct device *dev, struct rtc_wkalrm *alarm) rtc_tm_to_time(&alarm->time, &new_alarm); rtc_tm_to_time(&now, &itime); - new_alarm -= itime; - /* This can happen due to races, in addition to dates that are * truly in the past. To avoid requiring the caller to check for * races, dates in the past are assumed to be in the recent past * (i.e. not something that we'd rather the caller know about via * an error), and the alarm is set to go off as soon as possible. */ - if (new_alarm <= 0) + if (time_before_eq(new_alarm, itime)) new_alarm = 1; + else + new_alarm -= itime; mutex_lock(&ds1374->mutex); From 6e6fe42227e23a379d3c70f6ff257131399e4075 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 31 Mar 2009 15:25:01 -0700 Subject: [PATCH 415/478] drivers/video/uvesafb.c: don't use gfp_any() GFP_KERNEL is legal here - we don't need to use gfp_any(). Cc: Evgeniy Polyakov Cc: "David S. Miller" Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/video/uvesafb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/video/uvesafb.c b/drivers/video/uvesafb.c index 74ae75899009..398fd25e0810 100644 --- a/drivers/video/uvesafb.c +++ b/drivers/video/uvesafb.c @@ -189,7 +189,7 @@ static int uvesafb_exec(struct uvesafb_ktask *task) uvfb_tasks[seq] = task; mutex_unlock(&uvfb_lock); - err = cn_netlink_send(m, 0, gfp_any()); + err = cn_netlink_send(m, 0, GFP_KERNEL); if (err == -ESRCH) { /* * Try to start the userspace helper if sending From d5cb78feee7b6631f578e12bda1e86eea7923637 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 31 Mar 2009 15:25:02 -0700 Subject: [PATCH 416/478] atyfb: fix header file trailing whitespace Fix trailing whitespace because quilt complained about it. Signed-off-by: Randy Dunlap Cc: Benjamin Herrenschmidt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/video/aty128.h | 2 +- include/video/radeon.h | 564 ++++++++++++++++++++--------------------- 2 files changed, 283 insertions(+), 283 deletions(-) diff --git a/include/video/aty128.h b/include/video/aty128.h index 51ac69f05bdc..f0851e3bb7cc 100644 --- a/include/video/aty128.h +++ b/include/video/aty128.h @@ -415,7 +415,7 @@ #define PWR_MGT_SLOWDOWN_MCLK 0x00002000 #define PMI_PMSCR_REG 0x60 - + /* used by ATI bug fix for hardware ROM */ #define RAGE128_MPP_TB_CONFIG 0x01c0 diff --git a/include/video/radeon.h b/include/video/radeon.h index e072b16b39ab..56b188abfb54 100644 --- a/include/video/radeon.h +++ b/include/video/radeon.h @@ -5,12 +5,12 @@ #define RADEON_REGSIZE 0x4000 -#define MM_INDEX 0x0000 -#define MM_DATA 0x0004 -#define BUS_CNTL 0x0030 -#define HI_STAT 0x004C +#define MM_INDEX 0x0000 +#define MM_DATA 0x0004 +#define BUS_CNTL 0x0030 +#define HI_STAT 0x004C #define BUS_CNTL1 0x0034 -#define I2C_CNTL_1 0x0094 +#define I2C_CNTL_1 0x0094 #define CNFG_CNTL 0x00E0 #define CNFG_MEMSIZE 0x00F8 #define CNFG_APER_0_BASE 0x0100 @@ -18,8 +18,8 @@ #define CNFG_APER_SIZE 0x0108 #define CNFG_REG_1_BASE 0x010C #define CNFG_REG_APER_SIZE 0x0110 -#define PAD_AGPINPUT_DELAY 0x0164 -#define PAD_CTLR_STRENGTH 0x0168 +#define PAD_AGPINPUT_DELAY 0x0164 +#define PAD_CTLR_STRENGTH 0x0168 #define PAD_CTLR_UPDATE 0x016C #define PAD_CTLR_MISC 0x0aa0 #define AGP_CNTL 0x0174 @@ -27,171 +27,171 @@ #define CAP0_TRIG_CNTL 0x0950 #define CAP1_TRIG_CNTL 0x09c0 #define VIPH_CONTROL 0x0C40 -#define VENDOR_ID 0x0F00 -#define DEVICE_ID 0x0F02 -#define COMMAND 0x0F04 -#define STATUS 0x0F06 -#define REVISION_ID 0x0F08 -#define REGPROG_INF 0x0F09 -#define SUB_CLASS 0x0F0A -#define BASE_CODE 0x0F0B -#define CACHE_LINE 0x0F0C -#define LATENCY 0x0F0D -#define HEADER 0x0F0E -#define BIST 0x0F0F -#define REG_MEM_BASE 0x0F10 -#define REG_IO_BASE 0x0F14 +#define VENDOR_ID 0x0F00 +#define DEVICE_ID 0x0F02 +#define COMMAND 0x0F04 +#define STATUS 0x0F06 +#define REVISION_ID 0x0F08 +#define REGPROG_INF 0x0F09 +#define SUB_CLASS 0x0F0A +#define BASE_CODE 0x0F0B +#define CACHE_LINE 0x0F0C +#define LATENCY 0x0F0D +#define HEADER 0x0F0E +#define BIST 0x0F0F +#define REG_MEM_BASE 0x0F10 +#define REG_IO_BASE 0x0F14 #define REG_REG_BASE 0x0F18 #define ADAPTER_ID 0x0F2C #define BIOS_ROM 0x0F30 -#define CAPABILITIES_PTR 0x0F34 -#define INTERRUPT_LINE 0x0F3C -#define INTERRUPT_PIN 0x0F3D -#define MIN_GRANT 0x0F3E -#define MAX_LATENCY 0x0F3F -#define ADAPTER_ID_W 0x0F4C -#define PMI_CAP_ID 0x0F50 -#define PMI_NXT_CAP_PTR 0x0F51 -#define PMI_PMC_REG 0x0F52 -#define PM_STATUS 0x0F54 -#define PMI_DATA 0x0F57 -#define AGP_CAP_ID 0x0F58 -#define AGP_STATUS 0x0F5C -#define AGP_COMMAND 0x0F60 +#define CAPABILITIES_PTR 0x0F34 +#define INTERRUPT_LINE 0x0F3C +#define INTERRUPT_PIN 0x0F3D +#define MIN_GRANT 0x0F3E +#define MAX_LATENCY 0x0F3F +#define ADAPTER_ID_W 0x0F4C +#define PMI_CAP_ID 0x0F50 +#define PMI_NXT_CAP_PTR 0x0F51 +#define PMI_PMC_REG 0x0F52 +#define PM_STATUS 0x0F54 +#define PMI_DATA 0x0F57 +#define AGP_CAP_ID 0x0F58 +#define AGP_STATUS 0x0F5C +#define AGP_COMMAND 0x0F60 #define AIC_CTRL 0x01D0 #define AIC_STAT 0x01D4 #define AIC_PT_BASE 0x01D8 -#define AIC_LO_ADDR 0x01DC -#define AIC_HI_ADDR 0x01E0 -#define AIC_TLB_ADDR 0x01E4 -#define AIC_TLB_DATA 0x01E8 -#define DAC_CNTL 0x0058 +#define AIC_LO_ADDR 0x01DC +#define AIC_HI_ADDR 0x01E0 +#define AIC_TLB_ADDR 0x01E4 +#define AIC_TLB_DATA 0x01E8 +#define DAC_CNTL 0x0058 #define DAC_CNTL2 0x007c -#define CRTC_GEN_CNTL 0x0050 -#define MEM_CNTL 0x0140 +#define CRTC_GEN_CNTL 0x0050 +#define MEM_CNTL 0x0140 #define MC_CNTL 0x0140 -#define EXT_MEM_CNTL 0x0144 +#define EXT_MEM_CNTL 0x0144 #define MC_TIMING_CNTL 0x0144 -#define MC_AGP_LOCATION 0x014C -#define MEM_IO_CNTL_A0 0x0178 +#define MC_AGP_LOCATION 0x014C +#define MEM_IO_CNTL_A0 0x0178 #define MEM_REFRESH_CNTL 0x0178 -#define MEM_INIT_LATENCY_TIMER 0x0154 +#define MEM_INIT_LATENCY_TIMER 0x0154 #define MC_INIT_GFX_LAT_TIMER 0x0154 -#define MEM_SDRAM_MODE_REG 0x0158 -#define AGP_BASE 0x0170 -#define MEM_IO_CNTL_A1 0x017C +#define MEM_SDRAM_MODE_REG 0x0158 +#define AGP_BASE 0x0170 +#define MEM_IO_CNTL_A1 0x017C #define MC_READ_CNTL_AB 0x017C #define MEM_IO_CNTL_B0 0x0180 #define MC_INIT_MISC_LAT_TIMER 0x0180 #define MEM_IO_CNTL_B1 0x0184 #define MC_IOPAD_CNTL 0x0184 #define MC_DEBUG 0x0188 -#define MC_STATUS 0x0150 -#define MEM_IO_OE_CNTL 0x018C +#define MC_STATUS 0x0150 +#define MEM_IO_OE_CNTL 0x018C #define MC_CHIP_IO_OE_CNTL_AB 0x018C -#define MC_FB_LOCATION 0x0148 -#define HOST_PATH_CNTL 0x0130 -#define MEM_VGA_WP_SEL 0x0038 -#define MEM_VGA_RP_SEL 0x003C -#define HDP_DEBUG 0x0138 +#define MC_FB_LOCATION 0x0148 +#define HOST_PATH_CNTL 0x0130 +#define MEM_VGA_WP_SEL 0x0038 +#define MEM_VGA_RP_SEL 0x003C +#define HDP_DEBUG 0x0138 #define SW_SEMAPHORE 0x013C -#define CRTC2_GEN_CNTL 0x03f8 +#define CRTC2_GEN_CNTL 0x03f8 #define CRTC2_DISPLAY_BASE_ADDR 0x033c -#define SURFACE_CNTL 0x0B00 -#define SURFACE0_LOWER_BOUND 0x0B04 -#define SURFACE1_LOWER_BOUND 0x0B14 -#define SURFACE2_LOWER_BOUND 0x0B24 -#define SURFACE3_LOWER_BOUND 0x0B34 -#define SURFACE4_LOWER_BOUND 0x0B44 +#define SURFACE_CNTL 0x0B00 +#define SURFACE0_LOWER_BOUND 0x0B04 +#define SURFACE1_LOWER_BOUND 0x0B14 +#define SURFACE2_LOWER_BOUND 0x0B24 +#define SURFACE3_LOWER_BOUND 0x0B34 +#define SURFACE4_LOWER_BOUND 0x0B44 #define SURFACE5_LOWER_BOUND 0x0B54 #define SURFACE6_LOWER_BOUND 0x0B64 #define SURFACE7_LOWER_BOUND 0x0B74 -#define SURFACE0_UPPER_BOUND 0x0B08 -#define SURFACE1_UPPER_BOUND 0x0B18 -#define SURFACE2_UPPER_BOUND 0x0B28 -#define SURFACE3_UPPER_BOUND 0x0B38 -#define SURFACE4_UPPER_BOUND 0x0B48 -#define SURFACE5_UPPER_BOUND 0x0B58 -#define SURFACE6_UPPER_BOUND 0x0B68 -#define SURFACE7_UPPER_BOUND 0x0B78 -#define SURFACE0_INFO 0x0B0C -#define SURFACE1_INFO 0x0B1C -#define SURFACE2_INFO 0x0B2C -#define SURFACE3_INFO 0x0B3C -#define SURFACE4_INFO 0x0B4C -#define SURFACE5_INFO 0x0B5C +#define SURFACE0_UPPER_BOUND 0x0B08 +#define SURFACE1_UPPER_BOUND 0x0B18 +#define SURFACE2_UPPER_BOUND 0x0B28 +#define SURFACE3_UPPER_BOUND 0x0B38 +#define SURFACE4_UPPER_BOUND 0x0B48 +#define SURFACE5_UPPER_BOUND 0x0B58 +#define SURFACE6_UPPER_BOUND 0x0B68 +#define SURFACE7_UPPER_BOUND 0x0B78 +#define SURFACE0_INFO 0x0B0C +#define SURFACE1_INFO 0x0B1C +#define SURFACE2_INFO 0x0B2C +#define SURFACE3_INFO 0x0B3C +#define SURFACE4_INFO 0x0B4C +#define SURFACE5_INFO 0x0B5C #define SURFACE6_INFO 0x0B6C #define SURFACE7_INFO 0x0B7C #define SURFACE_ACCESS_FLAGS 0x0BF8 -#define SURFACE_ACCESS_CLR 0x0BFC -#define GEN_INT_CNTL 0x0040 -#define GEN_INT_STATUS 0x0044 +#define SURFACE_ACCESS_CLR 0x0BFC +#define GEN_INT_CNTL 0x0040 +#define GEN_INT_STATUS 0x0044 #define CRTC_EXT_CNTL 0x0054 -#define RB3D_CNTL 0x1C3C -#define WAIT_UNTIL 0x1720 -#define ISYNC_CNTL 0x1724 -#define RBBM_GUICNTL 0x172C -#define RBBM_STATUS 0x0E40 -#define RBBM_STATUS_alt_1 0x1740 -#define RBBM_CNTL 0x00EC -#define RBBM_CNTL_alt_1 0x0E44 -#define RBBM_SOFT_RESET 0x00F0 -#define RBBM_SOFT_RESET_alt_1 0x0E48 -#define NQWAIT_UNTIL 0x0E50 +#define RB3D_CNTL 0x1C3C +#define WAIT_UNTIL 0x1720 +#define ISYNC_CNTL 0x1724 +#define RBBM_GUICNTL 0x172C +#define RBBM_STATUS 0x0E40 +#define RBBM_STATUS_alt_1 0x1740 +#define RBBM_CNTL 0x00EC +#define RBBM_CNTL_alt_1 0x0E44 +#define RBBM_SOFT_RESET 0x00F0 +#define RBBM_SOFT_RESET_alt_1 0x0E48 +#define NQWAIT_UNTIL 0x0E50 #define RBBM_DEBUG 0x0E6C #define RBBM_CMDFIFO_ADDR 0x0E70 #define RBBM_CMDFIFO_DATAL 0x0E74 -#define RBBM_CMDFIFO_DATAH 0x0E78 -#define RBBM_CMDFIFO_STAT 0x0E7C -#define CRTC_STATUS 0x005C -#define GPIO_VGA_DDC 0x0060 -#define GPIO_DVI_DDC 0x0064 -#define GPIO_MONID 0x0068 +#define RBBM_CMDFIFO_DATAH 0x0E78 +#define RBBM_CMDFIFO_STAT 0x0E7C +#define CRTC_STATUS 0x005C +#define GPIO_VGA_DDC 0x0060 +#define GPIO_DVI_DDC 0x0064 +#define GPIO_MONID 0x0068 #define GPIO_CRT2_DDC 0x006c -#define PALETTE_INDEX 0x00B0 -#define PALETTE_DATA 0x00B4 -#define PALETTE_30_DATA 0x00B8 -#define CRTC_H_TOTAL_DISP 0x0200 -#define CRTC_H_SYNC_STRT_WID 0x0204 -#define CRTC_V_TOTAL_DISP 0x0208 -#define CRTC_V_SYNC_STRT_WID 0x020C -#define CRTC_VLINE_CRNT_VLINE 0x0210 +#define PALETTE_INDEX 0x00B0 +#define PALETTE_DATA 0x00B4 +#define PALETTE_30_DATA 0x00B8 +#define CRTC_H_TOTAL_DISP 0x0200 +#define CRTC_H_SYNC_STRT_WID 0x0204 +#define CRTC_V_TOTAL_DISP 0x0208 +#define CRTC_V_SYNC_STRT_WID 0x020C +#define CRTC_VLINE_CRNT_VLINE 0x0210 #define CRTC_CRNT_FRAME 0x0214 #define CRTC_GUI_TRIG_VLINE 0x0218 #define CRTC_DEBUG 0x021C -#define CRTC_OFFSET_RIGHT 0x0220 -#define CRTC_OFFSET 0x0224 -#define CRTC_OFFSET_CNTL 0x0228 -#define CRTC_PITCH 0x022C -#define OVR_CLR 0x0230 -#define OVR_WID_LEFT_RIGHT 0x0234 -#define OVR_WID_TOP_BOTTOM 0x0238 -#define DISPLAY_BASE_ADDR 0x023C -#define SNAPSHOT_VH_COUNTS 0x0240 -#define SNAPSHOT_F_COUNT 0x0244 -#define N_VIF_COUNT 0x0248 -#define SNAPSHOT_VIF_COUNT 0x024C -#define FP_CRTC_H_TOTAL_DISP 0x0250 -#define FP_CRTC_V_TOTAL_DISP 0x0254 +#define CRTC_OFFSET_RIGHT 0x0220 +#define CRTC_OFFSET 0x0224 +#define CRTC_OFFSET_CNTL 0x0228 +#define CRTC_PITCH 0x022C +#define OVR_CLR 0x0230 +#define OVR_WID_LEFT_RIGHT 0x0234 +#define OVR_WID_TOP_BOTTOM 0x0238 +#define DISPLAY_BASE_ADDR 0x023C +#define SNAPSHOT_VH_COUNTS 0x0240 +#define SNAPSHOT_F_COUNT 0x0244 +#define N_VIF_COUNT 0x0248 +#define SNAPSHOT_VIF_COUNT 0x024C +#define FP_CRTC_H_TOTAL_DISP 0x0250 +#define FP_CRTC_V_TOTAL_DISP 0x0254 #define CRT_CRTC_H_SYNC_STRT_WID 0x0258 #define CRT_CRTC_V_SYNC_STRT_WID 0x025C #define CUR_OFFSET 0x0260 -#define CUR_HORZ_VERT_POSN 0x0264 -#define CUR_HORZ_VERT_OFF 0x0268 -#define CUR_CLR0 0x026C -#define CUR_CLR1 0x0270 -#define FP_HORZ_VERT_ACTIVE 0x0278 -#define CRTC_MORE_CNTL 0x027C +#define CUR_HORZ_VERT_POSN 0x0264 +#define CUR_HORZ_VERT_OFF 0x0268 +#define CUR_CLR0 0x026C +#define CUR_CLR1 0x0270 +#define FP_HORZ_VERT_ACTIVE 0x0278 +#define CRTC_MORE_CNTL 0x027C #define CRTC_H_CUTOFF_ACTIVE_EN (1<<4) #define CRTC_V_CUTOFF_ACTIVE_EN (1<<5) -#define DAC_EXT_CNTL 0x0280 -#define FP_GEN_CNTL 0x0284 -#define FP_HORZ_STRETCH 0x028C -#define FP_VERT_STRETCH 0x0290 -#define FP_H_SYNC_STRT_WID 0x02C4 -#define FP_V_SYNC_STRT_WID 0x02C8 -#define AUX_WINDOW_HORZ_CNTL 0x02D8 -#define AUX_WINDOW_VERT_CNTL 0x02DC +#define DAC_EXT_CNTL 0x0280 +#define FP_GEN_CNTL 0x0284 +#define FP_HORZ_STRETCH 0x028C +#define FP_VERT_STRETCH 0x0290 +#define FP_H_SYNC_STRT_WID 0x02C4 +#define FP_V_SYNC_STRT_WID 0x02C8 +#define AUX_WINDOW_HORZ_CNTL 0x02D8 +#define AUX_WINDOW_VERT_CNTL 0x02DC //#define DDA_CONFIG 0x02e0 //#define DDA_ON_OFF 0x02e4 #define DVI_I2C_CNTL_1 0x02e4 @@ -199,192 +199,192 @@ #define GRPH2_BUFFER_CNTL 0x03F0 #define VGA_BUFFER_CNTL 0x02F4 #define OV0_Y_X_START 0x0400 -#define OV0_Y_X_END 0x0404 -#define OV0_PIPELINE_CNTL 0x0408 -#define OV0_REG_LOAD_CNTL 0x0410 -#define OV0_SCALE_CNTL 0x0420 -#define OV0_V_INC 0x0424 -#define OV0_P1_V_ACCUM_INIT 0x0428 -#define OV0_P23_V_ACCUM_INIT 0x042C -#define OV0_P1_BLANK_LINES_AT_TOP 0x0430 -#define OV0_P23_BLANK_LINES_AT_TOP 0x0434 -#define OV0_BASE_ADDR 0x043C -#define OV0_VID_BUF0_BASE_ADRS 0x0440 -#define OV0_VID_BUF1_BASE_ADRS 0x0444 -#define OV0_VID_BUF2_BASE_ADRS 0x0448 -#define OV0_VID_BUF3_BASE_ADRS 0x044C +#define OV0_Y_X_END 0x0404 +#define OV0_PIPELINE_CNTL 0x0408 +#define OV0_REG_LOAD_CNTL 0x0410 +#define OV0_SCALE_CNTL 0x0420 +#define OV0_V_INC 0x0424 +#define OV0_P1_V_ACCUM_INIT 0x0428 +#define OV0_P23_V_ACCUM_INIT 0x042C +#define OV0_P1_BLANK_LINES_AT_TOP 0x0430 +#define OV0_P23_BLANK_LINES_AT_TOP 0x0434 +#define OV0_BASE_ADDR 0x043C +#define OV0_VID_BUF0_BASE_ADRS 0x0440 +#define OV0_VID_BUF1_BASE_ADRS 0x0444 +#define OV0_VID_BUF2_BASE_ADRS 0x0448 +#define OV0_VID_BUF3_BASE_ADRS 0x044C #define OV0_VID_BUF4_BASE_ADRS 0x0450 #define OV0_VID_BUF5_BASE_ADRS 0x0454 #define OV0_VID_BUF_PITCH0_VALUE 0x0460 -#define OV0_VID_BUF_PITCH1_VALUE 0x0464 -#define OV0_AUTO_FLIP_CNTRL 0x0470 -#define OV0_DEINTERLACE_PATTERN 0x0474 -#define OV0_SUBMIT_HISTORY 0x0478 -#define OV0_H_INC 0x0480 -#define OV0_STEP_BY 0x0484 -#define OV0_P1_H_ACCUM_INIT 0x0488 -#define OV0_P23_H_ACCUM_INIT 0x048C -#define OV0_P1_X_START_END 0x0494 -#define OV0_P2_X_START_END 0x0498 -#define OV0_P3_X_START_END 0x049C -#define OV0_FILTER_CNTL 0x04A0 -#define OV0_FOUR_TAP_COEF_0 0x04B0 -#define OV0_FOUR_TAP_COEF_1 0x04B4 +#define OV0_VID_BUF_PITCH1_VALUE 0x0464 +#define OV0_AUTO_FLIP_CNTRL 0x0470 +#define OV0_DEINTERLACE_PATTERN 0x0474 +#define OV0_SUBMIT_HISTORY 0x0478 +#define OV0_H_INC 0x0480 +#define OV0_STEP_BY 0x0484 +#define OV0_P1_H_ACCUM_INIT 0x0488 +#define OV0_P23_H_ACCUM_INIT 0x048C +#define OV0_P1_X_START_END 0x0494 +#define OV0_P2_X_START_END 0x0498 +#define OV0_P3_X_START_END 0x049C +#define OV0_FILTER_CNTL 0x04A0 +#define OV0_FOUR_TAP_COEF_0 0x04B0 +#define OV0_FOUR_TAP_COEF_1 0x04B4 #define OV0_FOUR_TAP_COEF_2 0x04B8 #define OV0_FOUR_TAP_COEF_3 0x04BC #define OV0_FOUR_TAP_COEF_4 0x04C0 -#define OV0_FLAG_CNTRL 0x04DC -#define OV0_SLICE_CNTL 0x04E0 -#define OV0_VID_KEY_CLR_LOW 0x04E4 -#define OV0_VID_KEY_CLR_HIGH 0x04E8 -#define OV0_GRPH_KEY_CLR_LOW 0x04EC -#define OV0_GRPH_KEY_CLR_HIGH 0x04F0 -#define OV0_KEY_CNTL 0x04F4 -#define OV0_TEST 0x04F8 -#define SUBPIC_CNTL 0x0540 -#define SUBPIC_DEFCOLCON 0x0544 -#define SUBPIC_Y_X_START 0x054C -#define SUBPIC_Y_X_END 0x0550 -#define SUBPIC_V_INC 0x0554 -#define SUBPIC_H_INC 0x0558 +#define OV0_FLAG_CNTRL 0x04DC +#define OV0_SLICE_CNTL 0x04E0 +#define OV0_VID_KEY_CLR_LOW 0x04E4 +#define OV0_VID_KEY_CLR_HIGH 0x04E8 +#define OV0_GRPH_KEY_CLR_LOW 0x04EC +#define OV0_GRPH_KEY_CLR_HIGH 0x04F0 +#define OV0_KEY_CNTL 0x04F4 +#define OV0_TEST 0x04F8 +#define SUBPIC_CNTL 0x0540 +#define SUBPIC_DEFCOLCON 0x0544 +#define SUBPIC_Y_X_START 0x054C +#define SUBPIC_Y_X_END 0x0550 +#define SUBPIC_V_INC 0x0554 +#define SUBPIC_H_INC 0x0558 #define SUBPIC_BUF0_OFFSET 0x055C #define SUBPIC_BUF1_OFFSET 0x0560 #define SUBPIC_LC0_OFFSET 0x0564 -#define SUBPIC_LC1_OFFSET 0x0568 -#define SUBPIC_PITCH 0x056C -#define SUBPIC_BTN_HLI_COLCON 0x0570 -#define SUBPIC_BTN_HLI_Y_X_START 0x0574 -#define SUBPIC_BTN_HLI_Y_X_END 0x0578 -#define SUBPIC_PALETTE_INDEX 0x057C -#define SUBPIC_PALETTE_DATA 0x0580 -#define SUBPIC_H_ACCUM_INIT 0x0584 -#define SUBPIC_V_ACCUM_INIT 0x0588 -#define DISP_MISC_CNTL 0x0D00 -#define DAC_MACRO_CNTL 0x0D04 -#define DISP_PWR_MAN 0x0D08 -#define DISP_TEST_DEBUG_CNTL 0x0D10 -#define DISP_HW_DEBUG 0x0D14 +#define SUBPIC_LC1_OFFSET 0x0568 +#define SUBPIC_PITCH 0x056C +#define SUBPIC_BTN_HLI_COLCON 0x0570 +#define SUBPIC_BTN_HLI_Y_X_START 0x0574 +#define SUBPIC_BTN_HLI_Y_X_END 0x0578 +#define SUBPIC_PALETTE_INDEX 0x057C +#define SUBPIC_PALETTE_DATA 0x0580 +#define SUBPIC_H_ACCUM_INIT 0x0584 +#define SUBPIC_V_ACCUM_INIT 0x0588 +#define DISP_MISC_CNTL 0x0D00 +#define DAC_MACRO_CNTL 0x0D04 +#define DISP_PWR_MAN 0x0D08 +#define DISP_TEST_DEBUG_CNTL 0x0D10 +#define DISP_HW_DEBUG 0x0D14 #define DAC_CRC_SIG1 0x0D18 #define DAC_CRC_SIG2 0x0D1C #define OV0_LIN_TRANS_A 0x0D20 -#define OV0_LIN_TRANS_B 0x0D24 -#define OV0_LIN_TRANS_C 0x0D28 -#define OV0_LIN_TRANS_D 0x0D2C -#define OV0_LIN_TRANS_E 0x0D30 -#define OV0_LIN_TRANS_F 0x0D34 -#define OV0_GAMMA_0_F 0x0D40 -#define OV0_GAMMA_10_1F 0x0D44 -#define OV0_GAMMA_20_3F 0x0D48 -#define OV0_GAMMA_40_7F 0x0D4C -#define OV0_GAMMA_380_3BF 0x0D50 -#define OV0_GAMMA_3C0_3FF 0x0D54 -#define DISP_MERGE_CNTL 0x0D60 -#define DISP_OUTPUT_CNTL 0x0D64 -#define DISP_LIN_TRANS_GRPH_A 0x0D80 +#define OV0_LIN_TRANS_B 0x0D24 +#define OV0_LIN_TRANS_C 0x0D28 +#define OV0_LIN_TRANS_D 0x0D2C +#define OV0_LIN_TRANS_E 0x0D30 +#define OV0_LIN_TRANS_F 0x0D34 +#define OV0_GAMMA_0_F 0x0D40 +#define OV0_GAMMA_10_1F 0x0D44 +#define OV0_GAMMA_20_3F 0x0D48 +#define OV0_GAMMA_40_7F 0x0D4C +#define OV0_GAMMA_380_3BF 0x0D50 +#define OV0_GAMMA_3C0_3FF 0x0D54 +#define DISP_MERGE_CNTL 0x0D60 +#define DISP_OUTPUT_CNTL 0x0D64 +#define DISP_LIN_TRANS_GRPH_A 0x0D80 #define DISP_LIN_TRANS_GRPH_B 0x0D84 #define DISP_LIN_TRANS_GRPH_C 0x0D88 #define DISP_LIN_TRANS_GRPH_D 0x0D8C -#define DISP_LIN_TRANS_GRPH_E 0x0D90 -#define DISP_LIN_TRANS_GRPH_F 0x0D94 -#define DISP_LIN_TRANS_VID_A 0x0D98 -#define DISP_LIN_TRANS_VID_B 0x0D9C -#define DISP_LIN_TRANS_VID_C 0x0DA0 -#define DISP_LIN_TRANS_VID_D 0x0DA4 -#define DISP_LIN_TRANS_VID_E 0x0DA8 -#define DISP_LIN_TRANS_VID_F 0x0DAC -#define RMX_HORZ_FILTER_0TAP_COEF 0x0DB0 -#define RMX_HORZ_FILTER_1TAP_COEF 0x0DB4 -#define RMX_HORZ_FILTER_2TAP_COEF 0x0DB8 -#define RMX_HORZ_PHASE 0x0DBC -#define DAC_EMBEDDED_SYNC_CNTL 0x0DC0 -#define DAC_BROAD_PULSE 0x0DC4 +#define DISP_LIN_TRANS_GRPH_E 0x0D90 +#define DISP_LIN_TRANS_GRPH_F 0x0D94 +#define DISP_LIN_TRANS_VID_A 0x0D98 +#define DISP_LIN_TRANS_VID_B 0x0D9C +#define DISP_LIN_TRANS_VID_C 0x0DA0 +#define DISP_LIN_TRANS_VID_D 0x0DA4 +#define DISP_LIN_TRANS_VID_E 0x0DA8 +#define DISP_LIN_TRANS_VID_F 0x0DAC +#define RMX_HORZ_FILTER_0TAP_COEF 0x0DB0 +#define RMX_HORZ_FILTER_1TAP_COEF 0x0DB4 +#define RMX_HORZ_FILTER_2TAP_COEF 0x0DB8 +#define RMX_HORZ_PHASE 0x0DBC +#define DAC_EMBEDDED_SYNC_CNTL 0x0DC0 +#define DAC_BROAD_PULSE 0x0DC4 #define DAC_SKEW_CLKS 0x0DC8 #define DAC_INCR 0x0DCC #define DAC_NEG_SYNC_LEVEL 0x0DD0 -#define DAC_POS_SYNC_LEVEL 0x0DD4 -#define DAC_BLANK_LEVEL 0x0DD8 -#define CLOCK_CNTL_INDEX 0x0008 -#define CLOCK_CNTL_DATA 0x000C -#define CP_RB_CNTL 0x0704 -#define CP_RB_BASE 0x0700 -#define CP_RB_RPTR_ADDR 0x070C -#define CP_RB_RPTR 0x0710 -#define CP_RB_WPTR 0x0714 -#define CP_RB_WPTR_DELAY 0x0718 -#define CP_IB_BASE 0x0738 -#define CP_IB_BUFSZ 0x073C -#define SCRATCH_REG0 0x15E0 -#define GUI_SCRATCH_REG0 0x15E0 -#define SCRATCH_REG1 0x15E4 -#define GUI_SCRATCH_REG1 0x15E4 +#define DAC_POS_SYNC_LEVEL 0x0DD4 +#define DAC_BLANK_LEVEL 0x0DD8 +#define CLOCK_CNTL_INDEX 0x0008 +#define CLOCK_CNTL_DATA 0x000C +#define CP_RB_CNTL 0x0704 +#define CP_RB_BASE 0x0700 +#define CP_RB_RPTR_ADDR 0x070C +#define CP_RB_RPTR 0x0710 +#define CP_RB_WPTR 0x0714 +#define CP_RB_WPTR_DELAY 0x0718 +#define CP_IB_BASE 0x0738 +#define CP_IB_BUFSZ 0x073C +#define SCRATCH_REG0 0x15E0 +#define GUI_SCRATCH_REG0 0x15E0 +#define SCRATCH_REG1 0x15E4 +#define GUI_SCRATCH_REG1 0x15E4 #define SCRATCH_REG2 0x15E8 #define GUI_SCRATCH_REG2 0x15E8 #define SCRATCH_REG3 0x15EC -#define GUI_SCRATCH_REG3 0x15EC -#define SCRATCH_REG4 0x15F0 -#define GUI_SCRATCH_REG4 0x15F0 -#define SCRATCH_REG5 0x15F4 -#define GUI_SCRATCH_REG5 0x15F4 -#define SCRATCH_UMSK 0x0770 -#define SCRATCH_ADDR 0x0774 -#define DP_BRUSH_FRGD_CLR 0x147C +#define GUI_SCRATCH_REG3 0x15EC +#define SCRATCH_REG4 0x15F0 +#define GUI_SCRATCH_REG4 0x15F0 +#define SCRATCH_REG5 0x15F4 +#define GUI_SCRATCH_REG5 0x15F4 +#define SCRATCH_UMSK 0x0770 +#define SCRATCH_ADDR 0x0774 +#define DP_BRUSH_FRGD_CLR 0x147C #define DP_BRUSH_BKGD_CLR 0x1478 #define DST_LINE_START 0x1600 -#define DST_LINE_END 0x1604 -#define SRC_OFFSET 0x15AC +#define DST_LINE_END 0x1604 +#define SRC_OFFSET 0x15AC #define SRC_PITCH 0x15B0 #define SRC_TILE 0x1704 #define SRC_PITCH_OFFSET 0x1428 -#define SRC_X 0x1414 -#define SRC_Y 0x1418 -#define SRC_X_Y 0x1590 -#define SRC_Y_X 0x1434 +#define SRC_X 0x1414 +#define SRC_Y 0x1418 +#define SRC_X_Y 0x1590 +#define SRC_Y_X 0x1434 #define DST_Y_X 0x1438 #define DST_WIDTH_HEIGHT 0x1598 #define DST_HEIGHT_WIDTH 0x143c #define DST_OFFSET 0x1404 -#define SRC_CLUT_ADDRESS 0x1780 -#define SRC_CLUT_DATA 0x1784 -#define SRC_CLUT_DATA_RD 0x1788 -#define HOST_DATA0 0x17C0 -#define HOST_DATA1 0x17C4 -#define HOST_DATA2 0x17C8 -#define HOST_DATA3 0x17CC -#define HOST_DATA4 0x17D0 -#define HOST_DATA5 0x17D4 -#define HOST_DATA6 0x17D8 +#define SRC_CLUT_ADDRESS 0x1780 +#define SRC_CLUT_DATA 0x1784 +#define SRC_CLUT_DATA_RD 0x1788 +#define HOST_DATA0 0x17C0 +#define HOST_DATA1 0x17C4 +#define HOST_DATA2 0x17C8 +#define HOST_DATA3 0x17CC +#define HOST_DATA4 0x17D0 +#define HOST_DATA5 0x17D4 +#define HOST_DATA6 0x17D8 #define HOST_DATA7 0x17DC #define HOST_DATA_LAST 0x17E0 #define DP_SRC_ENDIAN 0x15D4 -#define DP_SRC_FRGD_CLR 0x15D8 -#define DP_SRC_BKGD_CLR 0x15DC -#define SC_LEFT 0x1640 -#define SC_RIGHT 0x1644 -#define SC_TOP 0x1648 -#define SC_BOTTOM 0x164C -#define SRC_SC_RIGHT 0x1654 -#define SRC_SC_BOTTOM 0x165C -#define DP_CNTL 0x16C0 -#define DP_CNTL_XDIR_YDIR_YMAJOR 0x16D0 -#define DP_DATATYPE 0x16C4 -#define DP_MIX 0x16C8 -#define DP_WRITE_MSK 0x16CC -#define DP_XOP 0x17F8 +#define DP_SRC_FRGD_CLR 0x15D8 +#define DP_SRC_BKGD_CLR 0x15DC +#define SC_LEFT 0x1640 +#define SC_RIGHT 0x1644 +#define SC_TOP 0x1648 +#define SC_BOTTOM 0x164C +#define SRC_SC_RIGHT 0x1654 +#define SRC_SC_BOTTOM 0x165C +#define DP_CNTL 0x16C0 +#define DP_CNTL_XDIR_YDIR_YMAJOR 0x16D0 +#define DP_DATATYPE 0x16C4 +#define DP_MIX 0x16C8 +#define DP_WRITE_MSK 0x16CC +#define DP_XOP 0x17F8 #define CLR_CMP_CLR_SRC 0x15C4 #define CLR_CMP_CLR_DST 0x15C8 #define CLR_CMP_CNTL 0x15C0 -#define CLR_CMP_MSK 0x15CC -#define DSTCACHE_MODE 0x1710 -#define DSTCACHE_CTLSTAT 0x1714 -#define DEFAULT_PITCH_OFFSET 0x16E0 -#define DEFAULT_SC_BOTTOM_RIGHT 0x16E8 +#define CLR_CMP_MSK 0x15CC +#define DSTCACHE_MODE 0x1710 +#define DSTCACHE_CTLSTAT 0x1714 +#define DEFAULT_PITCH_OFFSET 0x16E0 +#define DEFAULT_SC_BOTTOM_RIGHT 0x16E8 #define DEFAULT_SC_TOP_LEFT 0x16EC #define SRC_PITCH_OFFSET 0x1428 #define DST_PITCH_OFFSET 0x142C -#define DP_GUI_MASTER_CNTL 0x146C -#define SC_TOP_LEFT 0x16EC -#define SC_BOTTOM_RIGHT 0x16F0 -#define SRC_SC_BOTTOM_RIGHT 0x16F4 +#define DP_GUI_MASTER_CNTL 0x146C +#define SC_TOP_LEFT 0x16EC +#define SC_BOTTOM_RIGHT 0x16F0 +#define SRC_SC_BOTTOM_RIGHT 0x16F4 #define RB2D_DSTCACHE_MODE 0x3428 #define RB2D_DSTCACHE_CTLSTAT_broken 0x342C /* do not use */ #define LVDS_GEN_CNTL 0x02d0 @@ -686,7 +686,7 @@ #define VERT_FP_LOOP_STRETCH (0x7 << 28) #define VERT_STRETCH_RESERVED 0xf1000000 -/* DAC_CNTL bit constants */ +/* DAC_CNTL bit constants */ #define DAC_8BIT_EN 0x00000100 #define DAC_4BPP_PIX_ORDER 0x00000200 #define DAC_CRC_EN 0x00080000 @@ -700,7 +700,7 @@ #define DAC_CMP_EN (1 << 3) #define DAC_CMP_OUTPUT (1 << 7) -/* DAC_CNTL2 bit constants */ +/* DAC_CNTL2 bit constants */ #define DAC2_EXPAND_MODE (1 << 14) #define DAC2_CMP_EN (1 << 7) #define DAC2_PALETTE_ACCESS_CNTL (1 << 5) From 75ed3a17a5bc0ecff5c256cfb81ed06f8a6fbb54 Mon Sep 17 00:00:00 2001 From: Krzysztof Helt Date: Tue, 31 Mar 2009 15:25:03 -0700 Subject: [PATCH 417/478] cirrusfb: convert printks to dev_foo Convert all printks to dev_dbg, dev_info or dev_err. Kill some excessive debug information and code in the process. [akpm@linux-foundation.org: printk fixes] [akpm@linux-foundation.org: cleanups] Signed-off-by: Krzysztof Helt Cc: Geert Uytterhoeven Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/video/cirrusfb.c | 455 ++++++++++++++------------------------- 1 file changed, 158 insertions(+), 297 deletions(-) diff --git a/drivers/video/cirrusfb.c b/drivers/video/cirrusfb.c index a2aa6ddffbe2..3b4b0f1e0615 100644 --- a/drivers/video/cirrusfb.c +++ b/drivers/video/cirrusfb.c @@ -34,8 +34,6 @@ * */ -#define CIRRUSFB_VERSION "2.0-pre2" - #include #include #include @@ -72,20 +70,9 @@ * */ -/* enable debug output? */ -/* #define CIRRUSFB_DEBUG 1 */ - /* disable runtime assertions? */ /* #define CIRRUSFB_NDEBUG */ -/* debug output */ -#ifdef CIRRUSFB_DEBUG -#define DPRINTK(fmt, args...) \ - printk(KERN_DEBUG "%s: " fmt, __func__ , ## args) -#else -#define DPRINTK(fmt, args...) -#endif - /* debugging assertions */ #ifndef CIRRUSFB_NDEBUG #define assert(expr) \ @@ -150,7 +137,7 @@ static const struct cirrusfb_board_info_rec { .maxclock = { /* guess */ /* the SD64/P4 have a higher max. videoclock */ - 140000, 140000, 140000, 140000, 140000, + 135100, 135100, 85500, 85500, 0 }, .init_sr07 = true, .init_sr1f = true, @@ -426,11 +413,10 @@ static void cirrusfb_RectFill(u8 __iomem *regbase, int bits_per_pixel, static void bestclock(long freq, int *nom, int *den, int *div); #ifdef CIRRUSFB_DEBUG -static void cirrusfb_dump(void); -static void cirrusfb_dbg_reg_dump(caddr_t regbase); -static void cirrusfb_dbg_print_regs(caddr_t regbase, +static void cirrusfb_dbg_reg_dump(struct fb_info *info, caddr_t regbase); +static void cirrusfb_dbg_print_regs(struct fb_info *info, + caddr_t regbase, enum cirrusfb_dbg_reg_class reg_class, ...); -static void cirrusfb_dbg_print_byte(const char *name, unsigned char val); #endif /* CIRRUSFB_DEBUG */ /*** END PROTOTYPES ********************************************************/ @@ -460,23 +446,24 @@ static int cirrusfb_release(struct fb_info *info, int user) /**** BEGIN Hardware specific Routines **************************************/ /* Check if the MCLK is not a better clock source */ -static int cirrusfb_check_mclk(struct cirrusfb_info *cinfo, long freq) +static int cirrusfb_check_mclk(struct fb_info *info, long freq) { + struct cirrusfb_info *cinfo = info->par; long mclk = vga_rseq(cinfo->regbase, CL_SEQR1F) & 0x3f; /* Read MCLK value */ mclk = (14318 * mclk) >> 3; - DPRINTK("Read MCLK of %ld kHz\n", mclk); + dev_dbg(info->device, "Read MCLK of %ld kHz\n", mclk); /* Determine if we should use MCLK instead of VCLK, and if so, what we * should divide it by to get VCLK */ if (abs(freq - mclk) < 250) { - DPRINTK("Using VCLK = MCLK\n"); + dev_dbg(info->device, "Using VCLK = MCLK\n"); return 1; } else if (abs(freq - (mclk / 2)) < 250) { - DPRINTK("Using VCLK = MCLK/2\n"); + dev_dbg(info->device, "Using VCLK = MCLK/2\n"); return 2; } @@ -490,56 +477,6 @@ static int cirrusfb_check_var(struct fb_var_screeninfo *var, /* memory size in pixels */ unsigned pixels = info->screen_size * 8 / var->bits_per_pixel; - switch (var->bits_per_pixel) { - case 1: - pixels /= 4; - break; /* 8 pixel per byte, only 1/4th of mem usable */ - case 8: - case 16: - case 32: - break; /* 1 pixel == 1 byte */ - default: - printk(KERN_ERR "cirrusfb: mode %dx%dx%d rejected..." - "color depth not supported.\n", - var->xres, var->yres, var->bits_per_pixel); - DPRINTK("EXIT - EINVAL error\n"); - return -EINVAL; - } - - if (var->xres_virtual < var->xres) - var->xres_virtual = var->xres; - /* use highest possible virtual resolution */ - if (var->yres_virtual == -1) { - var->yres_virtual = pixels / var->xres_virtual; - - printk(KERN_INFO "cirrusfb: virtual resolution set to " - "maximum of %dx%d\n", var->xres_virtual, - var->yres_virtual); - } - if (var->yres_virtual < var->yres) - var->yres_virtual = var->yres; - - if (var->xres_virtual * var->yres_virtual > pixels) { - printk(KERN_ERR "cirrusfb: mode %dx%dx%d rejected... " - "virtual resolution too high to fit into video memory!\n", - var->xres_virtual, var->yres_virtual, - var->bits_per_pixel); - DPRINTK("EXIT - EINVAL error\n"); - return -EINVAL; - } - - - if (var->xoffset < 0) - var->xoffset = 0; - if (var->yoffset < 0) - var->yoffset = 0; - - /* truncate xoffset and yoffset to maximum if too high */ - if (var->xoffset > var->xres_virtual - var->xres) - var->xoffset = var->xres_virtual - var->xres - 1; - if (var->yoffset > var->yres_virtual - var->yres) - var->yoffset = var->yres_virtual - var->yres - 1; - switch (var->bits_per_pixel) { case 1: var->red.offset = 0; @@ -586,12 +523,46 @@ static int cirrusfb_check_var(struct fb_var_screeninfo *var, break; default: - DPRINTK("Unsupported bpp size: %d\n", var->bits_per_pixel); + dev_dbg(info->device, + "Unsupported bpp size: %d\n", var->bits_per_pixel); assert(false); /* should never occur */ break; } + if (var->xres_virtual < var->xres) + var->xres_virtual = var->xres; + /* use highest possible virtual resolution */ + if (var->yres_virtual == -1) { + var->yres_virtual = pixels / var->xres_virtual; + + dev_info(info->device, + "virtual resolution set to maximum of %dx%d\n", + var->xres_virtual, var->yres_virtual); + } + if (var->yres_virtual < var->yres) + var->yres_virtual = var->yres; + + if (var->xres_virtual * var->yres_virtual > pixels) { + dev_err(info->device, "mode %dx%dx%d rejected... " + "virtual resolution too high to fit into video memory!\n", + var->xres_virtual, var->yres_virtual, + var->bits_per_pixel); + return -EINVAL; + } + + + if (var->xoffset < 0) + var->xoffset = 0; + if (var->yoffset < 0) + var->yoffset = 0; + + /* truncate xoffset and yoffset to maximum if too high */ + if (var->xoffset > var->xres_virtual - var->xres) + var->xoffset = var->xres_virtual - var->xres - 1; + if (var->yoffset > var->yres_virtual - var->yres) + var->yoffset = var->yres_virtual - var->yres - 1; + var->red.msb_right = var->green.msb_right = var->blue.msb_right = @@ -606,9 +577,8 @@ static int cirrusfb_check_var(struct fb_var_screeninfo *var, yres = (yres + 1) / 2; if (yres >= 1280) { - printk(KERN_ERR "cirrusfb: ERROR: VerticalTotal >= 1280; " + dev_err(info->device, "ERROR: VerticalTotal >= 1280; " "special treatment required! (TODO)\n"); - DPRINTK("EXIT - EINVAL error\n"); return -EINVAL; } @@ -642,7 +612,8 @@ static int cirrusfb_decode_var(const struct fb_var_screeninfo *var, break; default: - DPRINTK("Unsupported bpp size: %d\n", var->bits_per_pixel); + dev_dbg(info->device, + "Unsupported bpp size: %d\n", var->bits_per_pixel); assert(false); /* should never occur */ break; @@ -653,7 +624,7 @@ static int cirrusfb_decode_var(const struct fb_var_screeninfo *var, /* convert from ps to kHz */ freq = PICOS2KHZ(var->pixclock); - DPRINTK("desired pixclock: %ld kHz\n", freq); + dev_dbg(info->device, "desired pixclock: %ld kHz\n", freq); maxclock = cirrusfb_board_info[cinfo->btype].maxclock[maxclockidx]; regs->multiplexing = 0; @@ -668,9 +639,9 @@ static int cirrusfb_decode_var(const struct fb_var_screeninfo *var, break; default: - printk(KERN_ERR "cirrusfb: Frequency greater " - "than maxclock (%ld kHz)\n", maxclock); - DPRINTK("EXIT - return -EINVAL\n"); + dev_err(info->device, + "Frequency greater than maxclock (%ld kHz)\n", + maxclock); return -EINVAL; } } @@ -689,16 +660,17 @@ static int cirrusfb_decode_var(const struct fb_var_screeninfo *var, return 0; } -static void cirrusfb_set_mclk_as_source(const struct cirrusfb_info *cinfo, - int div) +static void cirrusfb_set_mclk_as_source(const struct fb_info *info, int div) { + struct cirrusfb_info *cinfo = info->par; unsigned char old1f, old1e; + assert(cinfo != NULL); old1f = vga_rseq(cinfo->regbase, CL_SEQR1F) & ~0x40; if (div) { - DPRINTK("Set %s as pixclock source.\n", - (div == 2) ? "MCLK/2" : "MCLK"); + dev_dbg(info->device, "Set %s as pixclock source.\n", + (div == 2) ? "MCLK/2" : "MCLK"); old1f |= 0x40; old1e = vga_rseq(cinfo->regbase, CL_SEQR1E) & ~0x1; if (div == 2) @@ -728,17 +700,16 @@ static int cirrusfb_set_par_foo(struct fb_info *info) long freq; int nom, den, div; - DPRINTK("ENTER\n"); - DPRINTK("Requested mode: %dx%dx%d\n", + dev_dbg(info->device, "Requested mode: %dx%dx%d\n", var->xres, var->yres, var->bits_per_pixel); - DPRINTK("pixclock: %d\n", var->pixclock); + dev_dbg(info->device, "pixclock: %d\n", var->pixclock); init_vgachip(info); err = cirrusfb_decode_var(var, ®s, info); if (err) { /* should never happen */ - DPRINTK("mode change aborted. invalid var.\n"); + dev_dbg(info->device, "mode change aborted. invalid var.\n"); return -EINVAL; } @@ -789,30 +760,30 @@ static int cirrusfb_set_par_foo(struct fb_info *info) vga_wcrt(regbase, VGA_CRTC_V_SYNC_END, 0x20); /* previously: 0x00) */ /* if debugging is enabled, all parameters get output before writing */ - DPRINTK("CRT0: %d\n", htotal); + dev_dbg(info->device, "CRT0: %d\n", htotal); vga_wcrt(regbase, VGA_CRTC_H_TOTAL, htotal); - DPRINTK("CRT1: %d\n", hdispend); + dev_dbg(info->device, "CRT1: %d\n", hdispend); vga_wcrt(regbase, VGA_CRTC_H_DISP, hdispend); - DPRINTK("CRT2: %d\n", var->xres / 8); + dev_dbg(info->device, "CRT2: %d\n", var->xres / 8); vga_wcrt(regbase, VGA_CRTC_H_BLANK_START, var->xres / 8); /* + 128: Compatible read */ - DPRINTK("CRT3: 128+%d\n", (htotal + 5) % 32); + dev_dbg(info->device, "CRT3: 128+%d\n", (htotal + 5) % 32); vga_wcrt(regbase, VGA_CRTC_H_BLANK_END, 128 + ((htotal + 5) % 32)); - DPRINTK("CRT4: %d\n", hsyncstart); + dev_dbg(info->device, "CRT4: %d\n", hsyncstart); vga_wcrt(regbase, VGA_CRTC_H_SYNC_START, hsyncstart); tmp = hsyncend % 32; if ((htotal + 5) & 32) tmp += 128; - DPRINTK("CRT5: %d\n", tmp); + dev_dbg(info->device, "CRT5: %d\n", tmp); vga_wcrt(regbase, VGA_CRTC_H_SYNC_END, tmp); - DPRINTK("CRT6: %d\n", vtotal & 0xff); + dev_dbg(info->device, "CRT6: %d\n", vtotal & 0xff); vga_wcrt(regbase, VGA_CRTC_V_TOTAL, vtotal & 0xff); tmp = 16; /* LineCompare bit #9 */ @@ -830,7 +801,7 @@ static int cirrusfb_set_par_foo(struct fb_info *info) tmp |= 64; if (vsyncstart & 512) tmp |= 128; - DPRINTK("CRT7: %d\n", tmp); + dev_dbg(info->device, "CRT7: %d\n", tmp); vga_wcrt(regbase, VGA_CRTC_OVERFLOW, tmp); tmp = 0x40; /* LineCompare bit #8 */ @@ -838,25 +809,25 @@ static int cirrusfb_set_par_foo(struct fb_info *info) tmp |= 0x20; if (var->vmode & FB_VMODE_DOUBLE) tmp |= 0x80; - DPRINTK("CRT9: %d\n", tmp); + dev_dbg(info->device, "CRT9: %d\n", tmp); vga_wcrt(regbase, VGA_CRTC_MAX_SCAN, tmp); - DPRINTK("CRT10: %d\n", vsyncstart & 0xff); + dev_dbg(info->device, "CRT10: %d\n", vsyncstart & 0xff); vga_wcrt(regbase, VGA_CRTC_V_SYNC_START, vsyncstart & 0xff); - DPRINTK("CRT11: 64+32+%d\n", vsyncend % 16); + dev_dbg(info->device, "CRT11: 64+32+%d\n", vsyncend % 16); vga_wcrt(regbase, VGA_CRTC_V_SYNC_END, vsyncend % 16 + 64 + 32); - DPRINTK("CRT12: %d\n", vdispend & 0xff); + dev_dbg(info->device, "CRT12: %d\n", vdispend & 0xff); vga_wcrt(regbase, VGA_CRTC_V_DISP_END, vdispend & 0xff); - DPRINTK("CRT15: %d\n", (vdispend + 1) & 0xff); + dev_dbg(info->device, "CRT15: %d\n", (vdispend + 1) & 0xff); vga_wcrt(regbase, VGA_CRTC_V_BLANK_START, (vdispend + 1) & 0xff); - DPRINTK("CRT16: %d\n", vtotal & 0xff); + dev_dbg(info->device, "CRT16: %d\n", vtotal & 0xff); vga_wcrt(regbase, VGA_CRTC_V_BLANK_END, vtotal & 0xff); - DPRINTK("CRT18: 0xff\n"); + dev_dbg(info->device, "CRT18: 0xff\n"); vga_wcrt(regbase, VGA_CRTC_LINE_COMPARE, 0xff); tmp = 0; @@ -871,12 +842,15 @@ static int cirrusfb_set_par_foo(struct fb_info *info) if (vtotal & 512) tmp |= 128; - DPRINTK("CRT1a: %d\n", tmp); + dev_dbg(info->device, "CRT1a: %d\n", tmp); vga_wcrt(regbase, CL_CRT1A, tmp); freq = PICOS2KHZ(var->pixclock); bestclock(freq, &nom, &den, &div); + dev_dbg(info->device, "VCLK freq: %ld kHz nom: %d den: %d div: %d\n", + freq, nom, den, div); + /* set VCLK0 */ /* hardware RefClock: 14.31818 MHz */ /* formula: VClk = (OSC * N) / (D * (1+P)) */ @@ -886,10 +860,10 @@ static int cirrusfb_set_par_foo(struct fb_info *info) /* if freq is close to mclk or mclk/2 select mclk * as clock source */ - int divMCLK = cirrusfb_check_mclk(cinfo, freq); + int divMCLK = cirrusfb_check_mclk(info, freq); if (divMCLK) { nom = 0; - cirrusfb_set_mclk_as_source(cinfo, divMCLK); + cirrusfb_set_mclk_as_source(info, divMCLK); } } if (nom) { @@ -904,7 +878,7 @@ static int cirrusfb_set_par_foo(struct fb_info *info) (cinfo->btype == BT_GD5480)) tmp |= 0x80; - DPRINTK("CL_SEQR1B: %ld\n", (long) tmp); + dev_dbg(info->device, "CL_SEQR1B: %ld\n", (long) tmp); vga_wseq(regbase, CL_SEQR1B, tmp); } @@ -952,7 +926,7 @@ static int cirrusfb_set_par_foo(struct fb_info *info) /* programming for different color depths */ if (var->bits_per_pixel == 1) { - DPRINTK("cirrusfb: preparing for 1 bit deep display\n"); + dev_dbg(info->device, "preparing for 1 bit deep display\n"); vga_wgfx(regbase, VGA_GFX_MODE, 0); /* mode register */ /* SR07 */ @@ -964,20 +938,18 @@ static int cirrusfb_set_par_foo(struct fb_info *info) case BT_PICASSO4: case BT_ALPINE: case BT_GD5480: - DPRINTK(" (for GD54xx)\n"); vga_wseq(regbase, CL_SEQR7, - regs.multiplexing ? + regs.multiplexing ? bi->sr07_1bpp_mux : bi->sr07_1bpp); break; case BT_LAGUNA: - DPRINTK(" (for GD546x)\n"); vga_wseq(regbase, CL_SEQR7, vga_rseq(regbase, CL_SEQR7) & ~0x01); break; default: - printk(KERN_WARNING "cirrusfb: unknown Board\n"); + dev_warn(info->device, "unknown Board\n"); break; } @@ -987,14 +959,12 @@ static int cirrusfb_set_par_foo(struct fb_info *info) /* setting the SEQRF on SD64 is not necessary * (only during init) */ - DPRINTK("(for SD64)\n"); /* MCLK select */ vga_wseq(regbase, CL_SEQR1F, 0x1a); break; case BT_PICCOLO: case BT_SPECTRUM: - DPRINTK("(for Piccolo/Spectrum)\n"); /* ### ueberall 0x22? */ /* ##vorher 1c MCLK select */ vga_wseq(regbase, CL_SEQR1F, 0x22); @@ -1003,7 +973,6 @@ static int cirrusfb_set_par_foo(struct fb_info *info) break; case BT_PICASSO: - DPRINTK("(for Picasso)\n"); /* ##vorher 22 MCLK select */ vga_wseq(regbase, CL_SEQR1F, 0x22); /* ## vorher d0 avoid FIFO underruns..? */ @@ -1014,12 +983,11 @@ static int cirrusfb_set_par_foo(struct fb_info *info) case BT_ALPINE: case BT_GD5480: case BT_LAGUNA: - DPRINTK(" (for GD54xx)\n"); /* do nothing */ break; default: - printk(KERN_WARNING "cirrusfb: unknown Board\n"); + dev_warn(info->device, "unknown Board\n"); break; } @@ -1045,7 +1013,7 @@ static int cirrusfb_set_par_foo(struct fb_info *info) */ else if (var->bits_per_pixel == 8) { - DPRINTK("cirrusfb: preparing for 8 bit deep display\n"); + dev_dbg(info->device, "preparing for 8 bit deep display\n"); switch (cinfo->btype) { case BT_SD64: case BT_PICCOLO: @@ -1054,20 +1022,18 @@ static int cirrusfb_set_par_foo(struct fb_info *info) case BT_PICASSO4: case BT_ALPINE: case BT_GD5480: - DPRINTK(" (for GD54xx)\n"); vga_wseq(regbase, CL_SEQR7, regs.multiplexing ? bi->sr07_8bpp_mux : bi->sr07_8bpp); break; case BT_LAGUNA: - DPRINTK(" (for GD546x)\n"); vga_wseq(regbase, CL_SEQR7, vga_rseq(regbase, CL_SEQR7) | 0x01); break; default: - printk(KERN_WARNING "cirrusfb: unknown Board\n"); + dev_warn(info->device, "unknown Board\n"); break; } @@ -1095,18 +1061,16 @@ static int cirrusfb_set_par_foo(struct fb_info *info) break; case BT_ALPINE: - DPRINTK(" (for GD543x)\n"); /* We already set SRF and SR1F */ break; case BT_GD5480: case BT_LAGUNA: - DPRINTK(" (for GD54xx)\n"); /* do nothing */ break; default: - printk(KERN_WARNING "cirrusfb: unknown Board\n"); + dev_warn(info->device, "unknown board\n"); break; } @@ -1134,7 +1098,7 @@ static int cirrusfb_set_par_foo(struct fb_info *info) */ else if (var->bits_per_pixel == 16) { - DPRINTK("cirrusfb: preparing for 16 bit deep display\n"); + dev_dbg(info->device, "preparing for 16 bit deep display\n"); switch (cinfo->btype) { case BT_SD64: /* Extended Sequencer Mode: 256c col. mode */ @@ -1166,24 +1130,21 @@ static int cirrusfb_set_par_foo(struct fb_info *info) break; case BT_ALPINE: - DPRINTK(" (for GD543x)\n"); vga_wseq(regbase, CL_SEQR7, 0xa7); break; case BT_GD5480: - DPRINTK(" (for GD5480)\n"); vga_wseq(regbase, CL_SEQR7, 0x17); /* We already set SRF and SR1F */ break; case BT_LAGUNA: - DPRINTK(" (for GD546x)\n"); vga_wseq(regbase, CL_SEQR7, vga_rseq(regbase, CL_SEQR7) & ~0x01); break; default: - printk(KERN_WARNING "CIRRUSFB: unknown Board\n"); + dev_warn(info->device, "unknown Board\n"); break; } @@ -1211,7 +1172,7 @@ static int cirrusfb_set_par_foo(struct fb_info *info) */ else if (var->bits_per_pixel == 32) { - DPRINTK("cirrusfb: preparing for 32 bit deep display\n"); + dev_dbg(info->device, "preparing for 32 bit deep display\n"); switch (cinfo->btype) { case BT_SD64: /* Extended Sequencer Mode: 256c col. mode */ @@ -1243,24 +1204,21 @@ static int cirrusfb_set_par_foo(struct fb_info *info) break; case BT_ALPINE: - DPRINTK(" (for GD543x)\n"); vga_wseq(regbase, CL_SEQR7, 0xa9); break; case BT_GD5480: - DPRINTK(" (for GD5480)\n"); vga_wseq(regbase, CL_SEQR7, 0x19); /* We already set SRF and SR1F */ break; case BT_LAGUNA: - DPRINTK(" (for GD546x)\n"); vga_wseq(regbase, CL_SEQR7, vga_rseq(regbase, CL_SEQR7) & ~0x01); break; default: - printk(KERN_WARNING "cirrusfb: unknown Board\n"); + dev_warn(info->device, "unknown Board\n"); break; } @@ -1284,8 +1242,8 @@ static int cirrusfb_set_par_foo(struct fb_info *info) */ else - printk(KERN_ERR "cirrusfb: What's this?? " - " requested color depth == %d.\n", + dev_err(info->device, + "What's this? requested color depth == %d.\n", var->bits_per_pixel); vga_wcrt(regbase, VGA_CRTC_OFFSET, offset & 0xff); @@ -1355,7 +1313,7 @@ static int cirrusfb_set_par_foo(struct fb_info *info) */ vga_wseq(regbase, VGA_SEQ_CLOCK_MODE, tmp); - DPRINTK("CL_SEQR1: %d\n", tmp); + dev_dbg(info->device, "CL_SEQR1: %d\n", tmp); cinfo->currentmode = regs; @@ -1363,10 +1321,9 @@ static int cirrusfb_set_par_foo(struct fb_info *info) cirrusfb_pan_display(var, info); #ifdef CIRRUSFB_DEBUG - cirrusfb_dump(); + cirrusfb_dbg_reg_dump(info, NULL); #endif - DPRINTK("EXIT\n"); return 0; } @@ -1424,8 +1381,8 @@ static int cirrusfb_pan_display(struct fb_var_screeninfo *var, unsigned char tmp = 0, tmp2 = 0, xpix; struct cirrusfb_info *cinfo = info->par; - DPRINTK("ENTER\n"); - DPRINTK("virtual offset: (%d,%d)\n", var->xoffset, var->yoffset); + dev_dbg(info->device, + "virtual offset: (%d,%d)\n", var->xoffset, var->yoffset); /* no range checks for xoffset and yoffset, */ /* as fb_pan_display has already done this */ @@ -1481,7 +1438,6 @@ static int cirrusfb_pan_display(struct fb_var_screeninfo *var, cirrusfb_WaitBLT(cinfo->regbase); - DPRINTK("EXIT\n"); return 0; } @@ -1502,11 +1458,11 @@ static int cirrusfb_blank(int blank_mode, struct fb_info *info) struct cirrusfb_info *cinfo = info->par; int current_mode = cinfo->blank_mode; - DPRINTK("ENTER, blank mode = %d\n", blank_mode); + dev_dbg(info->device, "ENTER, blank mode = %d\n", blank_mode); if (info->state != FBINFO_STATE_RUNNING || current_mode == blank_mode) { - DPRINTK("EXIT, returning 0\n"); + dev_dbg(info->device, "EXIT, returning 0\n"); return 0; } @@ -1543,12 +1499,12 @@ static int cirrusfb_blank(int blank_mode, struct fb_info *info) vga_wgfx(cinfo->regbase, CL_GRE, 0x06); break; default: - DPRINTK("EXIT, returning 1\n"); + dev_dbg(info->device, "EXIT, returning 1\n"); return 1; } cinfo->blank_mode = blank_mode; - DPRINTK("EXIT, returning 0\n"); + dev_dbg(info->device, "EXIT, returning 0\n"); /* Let fbcon do a soft blank for us */ return (blank_mode == FB_BLANK_NORMAL) ? 1 : 0; @@ -1562,8 +1518,6 @@ static void init_vgachip(struct fb_info *info) struct cirrusfb_info *cinfo = info->par; const struct cirrusfb_board_info_rec *bi; - DPRINTK("ENTER\n"); - assert(cinfo != NULL); bi = &cirrusfb_board_info[cinfo->btype]; @@ -1609,7 +1563,7 @@ static void init_vgachip(struct fb_info *info) break; default: - printk(KERN_ERR "cirrusfb: Warning: Unknown board type\n"); + dev_err(info->device, "Warning: Unknown board type\n"); break; } @@ -1798,8 +1752,6 @@ static void init_vgachip(struct fb_info *info) /* misc... */ WHDR(cinfo, 0); /* Hidden DAC register: - */ - - DPRINTK("EXIT\n"); return; } @@ -1808,8 +1760,6 @@ static void switch_monitor(struct cirrusfb_info *cinfo, int on) #ifdef CONFIG_ZORRO /* only works on Zorro boards */ static int IsOn = 0; /* XXX not ok for multiple boards */ - DPRINTK("ENTER\n"); - if (cinfo->btype == BT_PICASSO4) return; /* nothing to switch */ if (cinfo->btype == BT_ALPINE) @@ -1819,8 +1769,6 @@ static void switch_monitor(struct cirrusfb_info *cinfo, int on) if (cinfo->btype == BT_PICASSO) { if ((on && !IsOn) || (!on && IsOn)) WSFR(cinfo, 0xff); - - DPRINTK("EXIT\n"); return; } if (on) { @@ -1847,11 +1795,10 @@ static void switch_monitor(struct cirrusfb_info *cinfo, int on) case BT_SPECTRUM: WSFR(cinfo, 0x4f); break; - default: /* do nothing */ break; + default: /* do nothing */ + break; } } - - DPRINTK("EXIT\n"); #endif /* CONFIG_ZORRO */ } @@ -1953,12 +1900,8 @@ static void cirrusfb_imageblit(struct fb_info *info, #define PREP_IO_BASE ((volatile unsigned char *) 0x80000000) static void get_prep_addrs(unsigned long *display, unsigned long *registers) { - DPRINTK("ENTER\n"); - *display = PREP_VIDEO_BASE; *registers = (unsigned long) PREP_IO_BASE; - - DPRINTK("EXIT\n"); } #endif /* CONFIG_PPC_PREP */ @@ -1970,13 +1913,12 @@ static int release_io_ports; * based on the DRAM bandwidth bit and DRAM bank switching bit. This * works with 1MB, 2MB and 4MB configurations (which the Motorola boards * seem to have. */ -static unsigned int __devinit cirrusfb_get_memsize(u8 __iomem *regbase) +static unsigned int __devinit cirrusfb_get_memsize(struct fb_info *info, + u8 __iomem *regbase) { unsigned long mem; unsigned char SRF; - DPRINTK("ENTER\n"); - SRF = vga_rseq(regbase, CL_SEQRF); switch ((SRF & 0x18)) { case 0x08: @@ -1992,7 +1934,7 @@ static unsigned int __devinit cirrusfb_get_memsize(u8 __iomem *regbase) mem = 2048 * 1024; break; default: - printk(KERN_WARNING "CLgenfb: Unknown memory size!\n"); + dev_warn(info->device, "CLgenfb: Unknown memory size!\n"); mem = 1024 * 1024; } if (SRF & 0x80) @@ -2002,8 +1944,6 @@ static unsigned int __devinit cirrusfb_get_memsize(u8 __iomem *regbase) mem *= 2; /* TODO: Handling of GD5446/5480 (see XF86 sources ...) */ - - DPRINTK("EXIT\n"); return mem; } @@ -2014,8 +1954,6 @@ static void get_pci_addrs(const struct pci_dev *pdev, assert(display != NULL); assert(registers != NULL); - DPRINTK("ENTER\n"); - *display = 0; *registers = 0; @@ -2030,8 +1968,6 @@ static void get_pci_addrs(const struct pci_dev *pdev, } assert(*display != 0); - - DPRINTK("EXIT\n"); } static void cirrusfb_pci_unmap(struct fb_info *info) @@ -2117,11 +2053,6 @@ static int __devinit cirrusfb_register(struct fb_info *info) int err; enum cirrus_board btype; - DPRINTK("ENTER\n"); - - printk(KERN_INFO "cirrusfb: Driver for Cirrus Logic based " - "graphic boards, v" CIRRUSFB_VERSION "\n"); - btype = cinfo->btype; /* sanity checks */ @@ -2130,11 +2061,11 @@ static int __devinit cirrusfb_register(struct fb_info *info) /* set all the vital stuff */ cirrusfb_set_fbinfo(info); - DPRINTK("cirrusfb: (RAM start set to: 0x%p)\n", info->screen_base); + dev_dbg(info->device, "(RAM start set to: 0x%p)\n", info->screen_base); err = fb_find_mode(&info->var, info, mode_option, NULL, 0, NULL, 8); if (!err) { - DPRINTK("wrong initial video mode\n"); + dev_dbg(info->device, "wrong initial video mode\n"); err = -EINVAL; goto err_dealloc_cmap; } @@ -2144,18 +2075,18 @@ static int __devinit cirrusfb_register(struct fb_info *info) err = cirrusfb_decode_var(&info->var, &cinfo->currentmode, info); if (err < 0) { /* should never happen */ - DPRINTK("choking on default var... umm, no good.\n"); + dev_dbg(info->device, + "choking on default var... umm, no good.\n"); goto err_dealloc_cmap; } err = register_framebuffer(info); if (err < 0) { - printk(KERN_ERR "cirrusfb: could not register " - "fb device; err = %d!\n", err); + dev_err(info->device, + "could not register fb device; err = %d!\n", err); goto err_dealloc_cmap; } - DPRINTK("EXIT, returning 0\n"); return 0; err_dealloc_cmap: @@ -2168,17 +2099,13 @@ err_dealloc_cmap: static void __devexit cirrusfb_cleanup(struct fb_info *info) { struct cirrusfb_info *cinfo = info->par; - DPRINTK("ENTER\n"); switch_monitor(cinfo, 0); - unregister_framebuffer(info); fb_dealloc_cmap(&info->cmap); - printk("Framebuffer unregistered\n"); + dev_dbg(info->device, "Framebuffer unregistered\n"); cinfo->unmap(info); framebuffer_release(info); - - DPRINTK("EXIT\n"); } #ifdef CONFIG_PCI @@ -2207,9 +2134,11 @@ static int __devinit cirrusfb_pci_register(struct pci_dev *pdev, cinfo = info->par; cinfo->btype = btype = (enum cirrus_board) ent->driver_data; - DPRINTK(" Found PCI device, base address 0 is 0x%x, btype set to %d\n", - pdev->resource[0].start, btype); - DPRINTK(" base address 1 is 0x%x\n", pdev->resource[1].start); + dev_dbg(info->device, + " Found PCI device, base address 0 is 0x%Lx, btype set to %d\n", + (unsigned long long)pdev->resource[0].start, btype); + dev_dbg(info->device, " base address 1 is 0x%Lx\n", + (unsigned long long)pdev->resource[1].start); if (isPReP) { pci_write_config_dword(pdev, PCI_BASE_ADDRESS_0, 0x00000000); @@ -2219,30 +2148,29 @@ static int __devinit cirrusfb_pci_register(struct pci_dev *pdev, /* PReP dies if we ioremap the IO registers, but it works w/out... */ cinfo->regbase = (char __iomem *) info->fix.mmio_start; } else { - DPRINTK("Attempt to get PCI info for Cirrus Graphics Card\n"); + dev_dbg(info->device, + "Attempt to get PCI info for Cirrus Graphics Card\n"); get_pci_addrs(pdev, &board_addr, &info->fix.mmio_start); /* FIXME: this forces VGA. alternatives? */ cinfo->regbase = NULL; } - DPRINTK("Board address: 0x%lx, register address: 0x%lx\n", + dev_dbg(info->device, "Board address: 0x%lx, register address: 0x%lx\n", board_addr, info->fix.mmio_start); board_size = (btype == BT_GD5480) ? - 32 * MB_ : cirrusfb_get_memsize(cinfo->regbase); + 32 * MB_ : cirrusfb_get_memsize(info, cinfo->regbase); ret = pci_request_regions(pdev, "cirrusfb"); if (ret < 0) { - printk(KERN_ERR "cirrusfb: cannot reserve region 0x%lx, " - "abort\n", - board_addr); + dev_err(info->device, "cannot reserve region 0x%lx, abort\n", + board_addr); goto err_release_fb; } #if 0 /* if the system didn't claim this region, we would... */ if (!request_mem_region(0xA0000, 65535, "cirrusfb")) { - printk(KERN_ERR "cirrusfb: cannot reserve region 0x%lx, abort\n" -, - 0xA0000L); + dev_err(info->device, "cannot reserve region 0x%lx, abort\n", + 0xA0000L); ret = -EBUSY; goto err_release_regions; } @@ -2260,9 +2188,9 @@ static int __devinit cirrusfb_pci_register(struct pci_dev *pdev, info->screen_size = board_size; cinfo->unmap = cirrusfb_pci_unmap; - printk(KERN_INFO "RAM (%lu kB) at 0x%lx, Cirrus " - "Logic chipset on PCI bus\n", - info->screen_size >> 10, board_addr); + dev_info(info->device, + "Cirrus Logic chipset on PCI bus, RAM (%lu kB) at 0x%lx\n", + info->screen_size >> 10, board_addr); pci_set_drvdata(pdev, info); ret = cirrusfb_register(info); @@ -2288,11 +2216,8 @@ err_out: static void __devexit cirrusfb_pci_unregister(struct pci_dev *pdev) { struct fb_info *info = pci_get_drvdata(pdev); - DPRINTK("ENTER\n"); cirrusfb_cleanup(info); - - DPRINTK("EXIT\n"); } static struct pci_driver cirrusfb_pci_driver = { @@ -2324,8 +2249,6 @@ static int __devinit cirrusfb_zorro_register(struct zorro_dev *z, if (cirrusfb_zorro_table2[btype].id2) z2 = zorro_find_device(cirrusfb_zorro_table2[btype].id2, NULL); size = cirrusfb_zorro_table2[btype].size; - printk(KERN_INFO "cirrusfb: %s board detected; ", - cirrusfb_board_info[btype].name); info = framebuffer_alloc(sizeof(struct cirrusfb_info), &z->dev); if (!info) { @@ -2334,6 +2257,9 @@ static int __devinit cirrusfb_zorro_register(struct zorro_dev *z, goto err_out; } + dev_info(info->device, "%s board detected\n", + cirrusfb_board_info[btype].name); + cinfo = info->par; cinfo->btype = btype; @@ -2345,19 +2271,16 @@ static int __devinit cirrusfb_zorro_register(struct zorro_dev *z, info->screen_size = size; if (!zorro_request_device(z, "cirrusfb")) { - printk(KERN_ERR "cirrusfb: cannot reserve region 0x%lx, " - "abort\n", - board_addr); + dev_err(info->device, "cannot reserve region 0x%lx, abort\n", + board_addr); ret = -EBUSY; goto err_release_fb; } - printk(" RAM (%lu MB) at $%lx, ", board_size / MB_, board_addr); - ret = -EIO; if (btype == BT_PICASSO4) { - printk(KERN_INFO " REG at $%lx\n", board_addr + 0x600000); + dev_info(info->device, " REG at $%lx\n", board_addr + 0x600000); /* To be precise, for the P4 this is not the */ /* begin of the board, but the begin of RAM. */ @@ -2367,7 +2290,7 @@ static int __devinit cirrusfb_zorro_register(struct zorro_dev *z, if (!cinfo->regbase) goto err_release_region; - DPRINTK("cirrusfb: Virtual address for board set to: $%p\n", + dev_dbg(info->device, "Virtual address for board set to: $%p\n", cinfo->regbase); cinfo->regbase += 0x600000; info->fix.mmio_start = board_addr + 0x600000; @@ -2377,8 +2300,8 @@ static int __devinit cirrusfb_zorro_register(struct zorro_dev *z, if (!info->screen_base) goto err_unmap_regbase; } else { - printk(KERN_INFO " REG at $%lx\n", - (unsigned long) z2->resource.start); + dev_info(info->device, " REG at $%lx\n", + (unsigned long) z2->resource.start); info->fix.smem_start = board_addr; if (board_addr > 0x01000000) @@ -2392,12 +2315,15 @@ static int __devinit cirrusfb_zorro_register(struct zorro_dev *z, cinfo->regbase = (caddr_t) ZTWO_VADDR(z2->resource.start); info->fix.mmio_start = z2->resource.start; - DPRINTK("cirrusfb: Virtual address for board set to: $%p\n", + dev_dbg(info->device, "Virtual address for board set to: $%p\n", cinfo->regbase); } cinfo->unmap = cirrusfb_zorro_unmap; - printk(KERN_INFO "Cirrus Logic chipset on Zorro bus\n"); + dev_info(info->device, + "Cirrus Logic chipset on Zorro bus, RAM (%lu MB) at $%lx\n", + board_size / MB_, board_addr); + zorro_set_drvdata(z, info); ret = cirrusfb_register(info); @@ -2424,11 +2350,8 @@ err_out: void __devexit cirrusfb_zorro_unregister(struct zorro_dev *z) { struct fb_info *info = zorro_get_drvdata(z); - DPRINTK("ENTER\n"); cirrusfb_cleanup(info); - - DPRINTK("EXIT\n"); } static struct zorro_driver cirrusfb_zorro_driver = { @@ -2461,11 +2384,10 @@ static int __init cirrusfb_init(void) } #ifndef MODULE -static int __init cirrusfb_setup(char *options) { +static int __init cirrusfb_setup(char *options) +{ char *this_opt; - DPRINTK("ENTER\n"); - if (!options || !*options) return 0; @@ -2473,8 +2395,6 @@ static int __init cirrusfb_setup(char *options) { if (!*this_opt) continue; - DPRINTK("cirrusfb_setup: option '%s'\n", this_opt); - if (!strcmp(this_opt, "noaccel")) noaccel = 1; else if (!strncmp(this_opt, "mode:", 5)) @@ -2560,8 +2480,6 @@ static void AttrOn(const struct cirrusfb_info *cinfo) { assert(cinfo != NULL); - DPRINTK("ENTER\n"); - if (vga_rcrt(cinfo->regbase, CL_CRT24) & 0x80) { /* if we're just in "write value" mode, write back the */ /* same value as before to not modify anything */ @@ -2574,8 +2492,6 @@ static void AttrOn(const struct cirrusfb_info *cinfo) /* dummy write on Reg0 to be on "write index" mode next time */ vga_w(cinfo->regbase, VGA_ATT_IW, 0x00); - - DPRINTK("EXIT\n"); } /*** WHDR() - write into the Hidden DAC register ***/ @@ -2723,8 +2639,6 @@ static void cirrusfb_BitBLT(u8 __iomem *regbase, int bits_per_pixel, u_long nsrc, ndest; u_char bltmode; - DPRINTK("ENTER\n"); - nwidth = width - 1; nheight = height - 1; @@ -2813,8 +2727,6 @@ static void cirrusfb_BitBLT(u8 __iomem *regbase, int bits_per_pixel, /* and finally: GO! */ vga_wgfx(regbase, CL_GR31, 0x02); /* BLT Start/status */ - - DPRINTK("EXIT\n"); } /******************************************************************* @@ -2831,8 +2743,6 @@ static void cirrusfb_RectFill(u8 __iomem *regbase, int bits_per_pixel, u_long ndest; u_char op; - DPRINTK("ENTER\n"); - nwidth = width - 1; nheight = height - 1; @@ -2896,8 +2806,6 @@ static void cirrusfb_RectFill(u8 __iomem *regbase, int bits_per_pixel, /* and finally: GO! */ vga_wgfx(regbase, CL_GR31, 0x02); /* BLT Start/status */ - - DPRINTK("EXIT\n"); } /************************************************************************** @@ -2917,8 +2825,6 @@ static void bestclock(long freq, int *nom, int *den, int *div) *den = 0; *div = 0; - DPRINTK("ENTER\n"); - if (freq < 8000) freq = 8000; @@ -2960,12 +2866,6 @@ static void bestclock(long freq, int *nom, int *den, int *div) } } } - - DPRINTK("Best possible values for given frequency:\n"); - DPRINTK(" freq: %ld kHz nom: %d den: %d div: %d\n", - freq, *nom, *den, *div); - - DPRINTK("EXIT\n"); } /* ------------------------------------------------------------------------- @@ -2977,32 +2877,6 @@ static void bestclock(long freq, int *nom, int *den, int *div) #ifdef CIRRUSFB_DEBUG -/** - * cirrusfb_dbg_print_byte - * @name: name associated with byte value to be displayed - * @val: byte value to be displayed - * - * DESCRIPTION: - * Display an indented string, along with a hexidecimal byte value, and - * its decoded bits. Bits 7 through 0 are listed in left-to-right - * order. - */ - -static -void cirrusfb_dbg_print_byte(const char *name, unsigned char val) -{ - DPRINTK("%8s = 0x%02X (bits 7-0: %c%c%c%c%c%c%c%c)\n", - name, val, - val & 0x80 ? '1' : '0', - val & 0x40 ? '1' : '0', - val & 0x20 ? '1' : '0', - val & 0x10 ? '1' : '0', - val & 0x08 ? '1' : '0', - val & 0x04 ? '1' : '0', - val & 0x02 ? '1' : '0', - val & 0x01 ? '1' : '0'); -} - /** * cirrusfb_dbg_print_regs * @base: If using newmmio, the newmmio base address, otherwise %NULL @@ -3014,9 +2888,9 @@ void cirrusfb_dbg_print_byte(const char *name, unsigned char val) * used at the given @base address to query the information. */ -static -void cirrusfb_dbg_print_regs(caddr_t regbase, - enum cirrusfb_dbg_reg_class reg_class, ...) +static void cirrusfb_dbg_print_regs(struct fb_info *info, + caddr_t regbase, + enum cirrusfb_dbg_reg_class reg_class, ...) { va_list list; unsigned char val = 0; @@ -3042,7 +2916,7 @@ void cirrusfb_dbg_print_regs(caddr_t regbase, break; } - cirrusfb_dbg_print_byte(name, val); + dev_dbg(info->device, "%8s = 0x%02X\n", name, val); name = va_arg(list, char *); } @@ -3050,18 +2924,6 @@ void cirrusfb_dbg_print_regs(caddr_t regbase, va_end(list); } -/** - * cirrusfb_dump - * @cirrusfbinfo: - * - * DESCRIPTION: - */ - -static void cirrusfb_dump(void) -{ - cirrusfb_dbg_reg_dump(NULL); -} - /** * cirrusfb_dbg_reg_dump * @base: If using newmmio, the newmmio base address, otherwise %NULL @@ -3072,12 +2934,11 @@ static void cirrusfb_dump(void) * used at the given @base address to query the information. */ -static -void cirrusfb_dbg_reg_dump(caddr_t regbase) +static void cirrusfb_dbg_reg_dump(struct fb_info *info, caddr_t regbase) { - DPRINTK("CIRRUSFB VGA CRTC register dump:\n"); + dev_dbg(info->device, "VGA CRTC register dump:\n"); - cirrusfb_dbg_print_regs(regbase, CRT, + cirrusfb_dbg_print_regs(info, regbase, CRT, "CR00", 0x00, "CR01", 0x01, "CR02", 0x02, @@ -3127,11 +2988,11 @@ void cirrusfb_dbg_reg_dump(caddr_t regbase) "CR3F", 0x3F, NULL); - DPRINTK("\n"); + dev_dbg(info->device, "\n"); - DPRINTK("CIRRUSFB VGA SEQ register dump:\n"); + dev_dbg(info->device, "VGA SEQ register dump:\n"); - cirrusfb_dbg_print_regs(regbase, SEQ, + cirrusfb_dbg_print_regs(info, regbase, SEQ, "SR00", 0x00, "SR01", 0x01, "SR02", 0x02, @@ -3160,7 +3021,7 @@ void cirrusfb_dbg_reg_dump(caddr_t regbase) "SR1F", 0x1F, NULL); - DPRINTK("\n"); + dev_dbg(info->device, "\n"); } #endif /* CIRRUSFB_DEBUG */ From 55a4ea6ab0fff0c02f101a60d2ba4f1794990499 Mon Sep 17 00:00:00 2001 From: Krzysztof Helt Date: Tue, 31 Mar 2009 15:25:04 -0700 Subject: [PATCH 418/478] cirrusfb: fix Laguna chipset memory detection and clock setting Fix memory detection and clock setting for Cirrus Laguna chipsets (GD5464/GD5465). The changes are done after the Xorg code. The driver still does not display anything on the GD5465 but it switches resolutions correctly at least. Signed-off-by: Krzysztof Helt Cc: Geert Uytterhoeven Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/video/cirrusfb.c | 65 +++++++++++++++++++++++----------------- 1 file changed, 38 insertions(+), 27 deletions(-) diff --git a/drivers/video/cirrusfb.c b/drivers/video/cirrusfb.c index 3b4b0f1e0615..dd09bae910f0 100644 --- a/drivers/video/cirrusfb.c +++ b/drivers/video/cirrusfb.c @@ -867,19 +867,24 @@ static int cirrusfb_set_par_foo(struct fb_info *info) } } if (nom) { - vga_wseq(regbase, CL_SEQRB, nom); tmp = den << 1; if (div != 0) tmp |= 1; - /* 6 bit denom; ONLY 5434!!! (bugged me 10 days) */ if ((cinfo->btype == BT_SD64) || (cinfo->btype == BT_ALPINE) || (cinfo->btype == BT_GD5480)) tmp |= 0x80; - dev_dbg(info->device, "CL_SEQR1B: %ld\n", (long) tmp); - vga_wseq(regbase, CL_SEQR1B, tmp); + dev_dbg(info->device, "CL_SEQR1B: %d\n", (int) tmp); + /* Laguna chipset has reversed clock registers */ + if (cinfo->btype == BT_LAGUNA) { + vga_wseq(regbase, CL_SEQRE, tmp); + vga_wseq(regbase, CL_SEQR1E, nom); + } else { + vga_wseq(regbase, CL_SEQRB, nom); + vga_wseq(regbase, CL_SEQR1B, tmp); + } } if (yres >= 1024) @@ -1917,31 +1922,37 @@ static unsigned int __devinit cirrusfb_get_memsize(struct fb_info *info, u8 __iomem *regbase) { unsigned long mem; - unsigned char SRF; + struct cirrusfb_info *cinfo = info->par; - SRF = vga_rseq(regbase, CL_SEQRF); - switch ((SRF & 0x18)) { - case 0x08: - mem = 512 * 1024; - break; - case 0x10: - mem = 1024 * 1024; - break; - /* 64-bit DRAM data bus width; assume 2MB. Also indicates 2MB memory - * on the 5430. - */ - case 0x18: - mem = 2048 * 1024; - break; - default: - dev_warn(info->device, "CLgenfb: Unknown memory size!\n"); - mem = 1024 * 1024; + if (cinfo->btype == BT_LAGUNA) { + unsigned char SR14 = vga_rseq(regbase, CL_SEQR14); + + mem = ((SR14 & 7) + 1) << 20; + } else { + unsigned char SRF = vga_rseq(regbase, CL_SEQRF); + switch ((SRF & 0x18)) { + case 0x08: + mem = 512 * 1024; + break; + case 0x10: + mem = 1024 * 1024; + break; + /* 64-bit DRAM data bus width; assume 2MB. + * Also indicates 2MB memory on the 5430. + */ + case 0x18: + mem = 2048 * 1024; + break; + default: + dev_warn(info->device, "Unknown memory size!\n"); + mem = 1024 * 1024; + } + /* If DRAM bank switching is enabled, there must be + * twice as much memory installed. (4MB on the 5434) + */ + if (SRF & 0x80) + mem *= 2; } - if (SRF & 0x80) - /* If DRAM bank switching is enabled, there must be twice as much - * memory installed. (4MB on the 5434) - */ - mem *= 2; /* TODO: Handling of GD5446/5480 (see XF86 sources ...) */ return mem; From 213d4bdd8cd405d9ba59ee78165b8c870f83a018 Mon Sep 17 00:00:00 2001 From: Krzysztof Helt Date: Tue, 31 Mar 2009 15:25:04 -0700 Subject: [PATCH 419/478] cirrusfb: add Laguna additional overflow register Add additional overflow register setting for Laguna chips. Also, simplify some code in the cirrusfb_pan_display() and cirrusfb_blank(). Signed-off-by: Krzysztof Helt Cc: Geert Uytterhoeven Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/video/cirrusfb.c | 68 +++++++++++++++++++++++++--------------- include/video/cirrus.h | 1 + 2 files changed, 43 insertions(+), 26 deletions(-) diff --git a/drivers/video/cirrusfb.c b/drivers/video/cirrusfb.c index dd09bae910f0..119e49ed6218 100644 --- a/drivers/video/cirrusfb.c +++ b/drivers/video/cirrusfb.c @@ -1259,13 +1259,32 @@ static int cirrusfb_set_par_foo(struct fb_info *info) /* screen start addr #16-18, fastpagemode cycles */ vga_wcrt(regbase, CL_CRT1B, tmp); - if (cinfo->btype == BT_SD64 || - cinfo->btype == BT_PICASSO4 || - cinfo->btype == BT_ALPINE || - cinfo->btype == BT_GD5480) - /* screen start address bit 19 */ + /* screen start address bit 19 */ + if (cirrusfb_board_info[cinfo->btype].scrn_start_bit19) vga_wcrt(regbase, CL_CRT1D, 0x00); + if (cinfo->btype == BT_LAGUNA || + cinfo->btype == BT_GD5480) { + + tmp = 0; + if ((htotal + 5) & 256) + tmp |= 128; + if (hdispend & 256) + tmp |= 64; + if (hsyncstart & 256) + tmp |= 48; + if (vtotal & 1024) + tmp |= 8; + if (vdispend & 1024) + tmp |= 4; + if (vsyncstart & 1024) + tmp |= 3; + + vga_wcrt(regbase, CL_CRT1E, tmp); + dev_dbg(info->device, "CRT1e: %d\n", tmp); + } + + /* text cursor location high */ vga_wcrt(regbase, VGA_CRTC_CURSOR_HI, 0); /* text cursor location low */ @@ -1383,7 +1402,7 @@ static int cirrusfb_pan_display(struct fb_var_screeninfo *var, int xoffset = 0; int yoffset = 0; unsigned long base; - unsigned char tmp = 0, tmp2 = 0, xpix; + unsigned char tmp, xpix; struct cirrusfb_info *cinfo = info->par; dev_dbg(info->device, @@ -1418,6 +1437,8 @@ static int cirrusfb_pan_display(struct fb_var_screeninfo *var, vga_wcrt(cinfo->regbase, VGA_CRTC_START_HI, (unsigned char) (base >> 8)); + /* 0xf2 is %11110010, exclude tmp bits */ + tmp = vga_rcrt(cinfo->regbase, CL_CRT1B) & 0xf2; /* construct bits 16, 17 and 18 of screen start address */ if (base & 0x10000) tmp |= 0x01; @@ -1426,9 +1447,7 @@ static int cirrusfb_pan_display(struct fb_var_screeninfo *var, if (base & 0x40000) tmp |= 0x08; - /* 0xf2 is %11110010, exclude tmp bits */ - tmp2 = (vga_rcrt(cinfo->regbase, CL_CRT1B) & 0xf2) | tmp; - vga_wcrt(cinfo->regbase, CL_CRT1B, tmp2); + vga_wcrt(cinfo->regbase, CL_CRT1B, tmp); /* construct bit 19 of screen start address */ if (cirrusfb_board_info[cinfo->btype].scrn_start_bit19) @@ -1473,47 +1492,44 @@ static int cirrusfb_blank(int blank_mode, struct fb_info *info) /* Undo current */ if (current_mode == FB_BLANK_NORMAL || - current_mode == FB_BLANK_UNBLANK) { - /* unblank the screen */ - val = vga_rseq(cinfo->regbase, VGA_SEQ_CLOCK_MODE); + current_mode == FB_BLANK_UNBLANK) /* clear "FullBandwidth" bit */ - vga_wseq(cinfo->regbase, VGA_SEQ_CLOCK_MODE, val & 0xdf); - /* and undo VESA suspend trickery */ - vga_wgfx(cinfo->regbase, CL_GRE, 0x00); - } - - /* set new */ - if (blank_mode > FB_BLANK_NORMAL) { - /* blank the screen */ - val = vga_rseq(cinfo->regbase, VGA_SEQ_CLOCK_MODE); + val = 0; + else /* set "FullBandwidth" bit */ - vga_wseq(cinfo->regbase, VGA_SEQ_CLOCK_MODE, val | 0x20); - } + val = 0x20; + + val |= vga_rseq(cinfo->regbase, VGA_SEQ_CLOCK_MODE) & 0xdf; + vga_wseq(cinfo->regbase, VGA_SEQ_CLOCK_MODE, val); switch (blank_mode) { case FB_BLANK_UNBLANK: case FB_BLANK_NORMAL: + val = 0x00; break; case FB_BLANK_VSYNC_SUSPEND: - vga_wgfx(cinfo->regbase, CL_GRE, 0x04); + val = 0x04; break; case FB_BLANK_HSYNC_SUSPEND: - vga_wgfx(cinfo->regbase, CL_GRE, 0x02); + val = 0x02; break; case FB_BLANK_POWERDOWN: - vga_wgfx(cinfo->regbase, CL_GRE, 0x06); + val = 0x06; break; default: dev_dbg(info->device, "EXIT, returning 1\n"); return 1; } + vga_wgfx(cinfo->regbase, CL_GRE, val); + cinfo->blank_mode = blank_mode; dev_dbg(info->device, "EXIT, returning 0\n"); /* Let fbcon do a soft blank for us */ return (blank_mode == FB_BLANK_NORMAL) ? 1 : 0; } + /**** END Hardware specific Routines **************************************/ /****************************************************************************/ /**** BEGIN Internal Routines ***********************************************/ diff --git a/include/video/cirrus.h b/include/video/cirrus.h index b2776b6c8679..a1b7985b6b27 100644 --- a/include/video/cirrus.h +++ b/include/video/cirrus.h @@ -71,6 +71,7 @@ #define CL_CRT1B 0x1b /* Extended Display Controls */ #define CL_CRT1C 0x1c /* Sync adjust and genlock register */ #define CL_CRT1D 0x1d /* Overlay Extended Control register */ +#define CL_CRT1E 0x1e /* Another overflow register */ #define CL_CRT25 0x25 /* Part Status Register */ #define CL_CRT27 0x27 /* ID Register */ #define CL_CRT51 0x51 /* P4 disable "flicker fixer" */ From 6e30fc086d000d15abfe5550cc8b286335f7e132 Mon Sep 17 00:00:00 2001 From: Krzysztof Helt Date: Tue, 31 Mar 2009 15:25:05 -0700 Subject: [PATCH 420/478] cirrusfb: add mmio registers for Laguna chipsets The Laguna chipsets use special registers which are available through the mmio area. The cirrusfb driver does not use memory mapped registers for the PCI cards. Add the memory mapped area for Laguna chipsets and add basic usage of the special Laguna registers after SVGALIB code. This gives readable console at 16bpp on the GD-5465 (Laguna AGP). The 8bpp and 32bpp depths are still broken. Signed-off-by: Krzysztof Helt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/video/cirrusfb.c | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/drivers/video/cirrusfb.c b/drivers/video/cirrusfb.c index 119e49ed6218..378d60e01902 100644 --- a/drivers/video/cirrusfb.c +++ b/drivers/video/cirrusfb.c @@ -327,6 +327,7 @@ enum cirrusfb_dbg_reg_class { /* info about board */ struct cirrusfb_info { u8 __iomem *regbase; + u8 __iomem *laguna_mmio; enum cirrus_board btype; unsigned char SFR; /* Shadow of special function register */ @@ -699,6 +700,7 @@ static int cirrusfb_set_par_foo(struct fb_info *info) int yres, vdispend, vsyncstart, vsyncend, vtotal; long freq; int nom, den, div; + unsigned int control, format, threshold; dev_dbg(info->device, "Requested mode: %dx%dx%d\n", var->xres, var->yres, var->bits_per_pixel); @@ -866,6 +868,23 @@ static int cirrusfb_set_par_foo(struct fb_info *info) cirrusfb_set_mclk_as_source(info, divMCLK); } } + if (cinfo->btype == BT_LAGUNA) { + long pcifc = fb_readl(cinfo->laguna_mmio + 0x3fc); + unsigned char tile = fb_readb(cinfo->laguna_mmio + 0x407); + unsigned short tile_control; + + tile_control = fb_readw(cinfo->laguna_mmio + 0x2c4); + fb_writew(tile_control & ~0x80, cinfo->laguna_mmio + 0x2c4); + + fb_writel(pcifc | 0x10000000l, cinfo->laguna_mmio + 0x3fc); + fb_writeb(tile & 0x3f, cinfo->laguna_mmio + 0x407); + control = fb_readw(cinfo->laguna_mmio + 0x402); + threshold = fb_readw(cinfo->laguna_mmio + 0xea); + control &= ~0x6800; + format = 0; + threshold &= 0xffe0; + threshold &= 0x3fbf; + } if (nom) { tmp = den << 1; if (div != 0) @@ -1035,6 +1054,7 @@ static int cirrusfb_set_par_foo(struct fb_info *info) case BT_LAGUNA: vga_wseq(regbase, CL_SEQR7, vga_rseq(regbase, CL_SEQR7) | 0x01); + threshold |= 0x10; break; default: @@ -1146,6 +1166,9 @@ static int cirrusfb_set_par_foo(struct fb_info *info) case BT_LAGUNA: vga_wseq(regbase, CL_SEQR7, vga_rseq(regbase, CL_SEQR7) & ~0x01); + control |= 0x2000; + format |= 0x1400; + threshold |= 0x10; break; default: @@ -1220,6 +1243,9 @@ static int cirrusfb_set_par_foo(struct fb_info *info) case BT_LAGUNA: vga_wseq(regbase, CL_SEQR7, vga_rseq(regbase, CL_SEQR7) & ~0x01); + control |= 0x6000; + format |= 0x3400; + threshold |= 0x20; break; default: @@ -1327,6 +1353,12 @@ static int cirrusfb_set_par_foo(struct fb_info *info) /* graphics cursor attributes: nothing special */ vga_wseq(regbase, CL_SEQR12, 0x0); + if (cinfo->btype == BT_LAGUNA) { + /* no tiles */ + fb_writew(control | 0x1000, cinfo->laguna_mmio + 0x402); + fb_writew(format, cinfo->laguna_mmio + 0xc0); + fb_writew(threshold, cinfo->laguna_mmio + 0xea); + } /* finally, turn on everything - turn off "FullBandwidth" bit */ /* also, set "DotClock%2" bit where requested */ tmp = 0x01; @@ -2000,7 +2032,10 @@ static void get_pci_addrs(const struct pci_dev *pdev, static void cirrusfb_pci_unmap(struct fb_info *info) { struct pci_dev *pdev = to_pci_dev(info->device); + struct cirrusfb_info *cinfo = info->par; + if (cinfo->laguna_mmio == NULL) + iounmap(cinfo->laguna_mmio); iounmap(info->screen_base); #if 0 /* if system didn't claim this region, we would... */ release_mem_region(0xA0000, 65535); @@ -2180,6 +2215,7 @@ static int __devinit cirrusfb_pci_register(struct pci_dev *pdev, get_pci_addrs(pdev, &board_addr, &info->fix.mmio_start); /* FIXME: this forces VGA. alternatives? */ cinfo->regbase = NULL; + cinfo->laguna_mmio = ioremap(info->fix.mmio_start, 0x1000); } dev_dbg(info->device, "Board address: 0x%lx, register address: 0x%lx\n", @@ -2234,6 +2270,8 @@ err_release_regions: #endif pci_release_regions(pdev); err_release_fb: + if (cinfo->laguna_mmio == NULL) + iounmap(cinfo->laguna_mmio); framebuffer_release(info); err_disable: err_out: From 6683e01e2c950f635a6c0e2bbc80db1b1838311f Mon Sep 17 00:00:00 2001 From: Krzysztof Helt Date: Tue, 31 Mar 2009 15:25:06 -0700 Subject: [PATCH 421/478] cirrusfb: do not calculate line length twice A line length is calculated twice: first in the cirrusfb_decode_var() then in the cirrusfb_set_par_foo(). Use the first calculated value. A nice side effect is that 32bpp mode works now. Signed-off-by: Krzysztof Helt Cc: Geert Uytterhoeven Cc: Arthur Marsh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/video/cirrusfb.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/drivers/video/cirrusfb.c b/drivers/video/cirrusfb.c index 378d60e01902..53572c0f3474 100644 --- a/drivers/video/cirrusfb.c +++ b/drivers/video/cirrusfb.c @@ -694,7 +694,8 @@ static int cirrusfb_set_par_foo(struct fb_info *info) struct cirrusfb_regs regs; u8 __iomem *regbase = cinfo->regbase; unsigned char tmp; - int offset = 0, err; + int err; + int pitch; const struct cirrusfb_board_info_rec *bi; int hdispend, hsyncstart, hsyncend, htotal; int yres, vdispend, vsyncstart, vsyncend, vtotal; @@ -1027,7 +1028,6 @@ static int cirrusfb_set_par_foo(struct fb_info *info) vga_wseq(regbase, VGA_SEQ_MEMORY_MODE, 0x06); /* plane mask: only write to first plane */ vga_wseq(regbase, VGA_SEQ_PLANE_WRITE, 0x01); - offset = var->xres_virtual / 16; } /****************************************************** @@ -1113,7 +1113,6 @@ static int cirrusfb_set_par_foo(struct fb_info *info) vga_wseq(regbase, VGA_SEQ_MEMORY_MODE, 0x0a); /* plane mask: enable writing to all 4 planes */ vga_wseq(regbase, VGA_SEQ_PLANE_WRITE, 0xff); - offset = var->xres_virtual / 8; } /****************************************************** @@ -1190,7 +1189,6 @@ static int cirrusfb_set_par_foo(struct fb_info *info) vga_wseq(regbase, VGA_SEQ_MEMORY_MODE, 0x0a); /* plane mask: enable writing to all 4 planes */ vga_wseq(regbase, VGA_SEQ_PLANE_WRITE, 0xff); - offset = var->xres_virtual / 4; } /****************************************************** @@ -1263,7 +1261,6 @@ static int cirrusfb_set_par_foo(struct fb_info *info) vga_wseq(regbase, VGA_SEQ_MEMORY_MODE, 0x0a); /* plane mask: enable writing to all 4 planes */ vga_wseq(regbase, VGA_SEQ_PLANE_WRITE, 0xff); - offset = var->xres_virtual / 4; } /****************************************************** @@ -1277,9 +1274,10 @@ static int cirrusfb_set_par_foo(struct fb_info *info) "What's this? requested color depth == %d.\n", var->bits_per_pixel); - vga_wcrt(regbase, VGA_CRTC_OFFSET, offset & 0xff); + pitch = info->fix.line_length >> 3; + vga_wcrt(regbase, VGA_CRTC_OFFSET, pitch & 0xff); tmp = 0x22; - if (offset & 0x100) + if (pitch & 0x100) tmp |= 0x10; /* offset overflow bit */ /* screen start addr #16-18, fastpagemode cycles */ @@ -1287,7 +1285,7 @@ static int cirrusfb_set_par_foo(struct fb_info *info) /* screen start address bit 19 */ if (cirrusfb_board_info[cinfo->btype].scrn_start_bit19) - vga_wcrt(regbase, CL_CRT1D, 0x00); + vga_wcrt(regbase, CL_CRT1D, (pitch >> 9) & 1); if (cinfo->btype == BT_LAGUNA || cinfo->btype == BT_GD5480) { From c4dec3962d6bff26010fcfc61500c1241469a6e0 Mon Sep 17 00:00:00 2001 From: Krzysztof Helt Date: Tue, 31 Mar 2009 15:25:07 -0700 Subject: [PATCH 422/478] cirrusfb: use 5-6-5 RGB for 16bpp mode Use the 5-6-5 RGB mode instead of the 5-5-5 mode at 16bpp depth. It fixes colors in the 16bpp modes on Cirrus Laguna chips. Signed-off-by: Krzysztof Helt Cc: Geert Uytterhoeven Cc: Arthur Marsh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/video/cirrusfb.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/video/cirrusfb.c b/drivers/video/cirrusfb.c index 53572c0f3474..d844c41e2010 100644 --- a/drivers/video/cirrusfb.c +++ b/drivers/video/cirrusfb.c @@ -499,12 +499,12 @@ static int cirrusfb_check_var(struct fb_var_screeninfo *var, var->green.offset = -3; var->blue.offset = 8; } else { - var->red.offset = 10; + var->red.offset = 11; var->green.offset = 5; var->blue.offset = 0; } var->red.length = 5; - var->green.length = 5; + var->green.length = 6; var->blue.length = 5; break; @@ -1180,7 +1180,7 @@ static int cirrusfb_set_par_foo(struct fb_info *info) /* pixel mask: pass-through all planes */ WGen(cinfo, VGA_PEL_MSK, 0xff); #ifdef CONFIG_PCI - WHDR(cinfo, 0xc0); /* Copy Xbh */ + WHDR(cinfo, 0xc1); /* Copy Xbh */ #elif defined(CONFIG_ZORRO) /* FIXME: CONFIG_PCI and CONFIG_ZORRO may be defined both */ WHDR(cinfo, 0xa0); /* hidden dac reg: nothing special */ From 48c329e906f834711906ab4b0986ea0e857aff16 Mon Sep 17 00:00:00 2001 From: Krzysztof Helt Date: Tue, 31 Mar 2009 15:25:08 -0700 Subject: [PATCH 423/478] cirrusfb: various improvements Various improvements to the code: - kill a structure with only one field: multiplexing and use the field directly - move the cirrusfb_ops structure down the file to kill forward declarations - move cirrusfb_init() to kill forward declaration - kill register loads done already in the init_vgachip() - kill assigments done by higher layer in the cirrusfb_pan_display() - do not overwrite line pitch bit in the CL_CRT1D register - kill btype variables if they were used only once or twice - add cpu_relax() in the busy waiting loop The fix to the CL_CRT1D register handling makess the 1024x768 32bpp mode work. Previously, only lower resolution modes have worked with 32bpp. Signed-off-by: Krzysztof Helt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/video/cirrusfb.c | 202 +++++++++++---------------------------- 1 file changed, 58 insertions(+), 144 deletions(-) diff --git a/drivers/video/cirrusfb.c b/drivers/video/cirrusfb.c index d844c41e2010..9e52db7d3331 100644 --- a/drivers/video/cirrusfb.c +++ b/drivers/video/cirrusfb.c @@ -313,10 +313,6 @@ static const struct { }; #endif /* CONFIG_ZORRO */ -struct cirrusfb_regs { - int multiplexing; -}; - #ifdef CIRRUSFB_DEBUG enum cirrusfb_dbg_reg_class { CRT, @@ -331,7 +327,7 @@ struct cirrusfb_info { enum cirrus_board btype; unsigned char SFR; /* Shadow of special function register */ - struct cirrusfb_regs currentmode; + int multiplexing; int blank_mode; u32 pseudo_palette[16]; @@ -345,43 +341,8 @@ static char *mode_option __devinitdata = "640x480@60"; /**** BEGIN PROTOTYPES ******************************************************/ /*--- Interface used by the world ------------------------------------------*/ -static int cirrusfb_init(void); -#ifndef MODULE -static int cirrusfb_setup(char *options); -#endif - -static int cirrusfb_open(struct fb_info *info, int user); -static int cirrusfb_release(struct fb_info *info, int user); -static int cirrusfb_setcolreg(unsigned regno, unsigned red, unsigned green, - unsigned blue, unsigned transp, - struct fb_info *info); -static int cirrusfb_check_var(struct fb_var_screeninfo *var, - struct fb_info *info); -static int cirrusfb_set_par(struct fb_info *info); static int cirrusfb_pan_display(struct fb_var_screeninfo *var, struct fb_info *info); -static int cirrusfb_blank(int blank_mode, struct fb_info *info); -static void cirrusfb_fillrect(struct fb_info *info, - const struct fb_fillrect *region); -static void cirrusfb_copyarea(struct fb_info *info, - const struct fb_copyarea *area); -static void cirrusfb_imageblit(struct fb_info *info, - const struct fb_image *image); - -/* function table of the above functions */ -static struct fb_ops cirrusfb_ops = { - .owner = THIS_MODULE, - .fb_open = cirrusfb_open, - .fb_release = cirrusfb_release, - .fb_setcolreg = cirrusfb_setcolreg, - .fb_check_var = cirrusfb_check_var, - .fb_set_par = cirrusfb_set_par, - .fb_pan_display = cirrusfb_pan_display, - .fb_blank = cirrusfb_blank, - .fb_fillrect = cirrusfb_fillrect, - .fb_copyarea = cirrusfb_copyarea, - .fb_imageblit = cirrusfb_imageblit, -}; /*--- Internal routines ----------------------------------------------------*/ static void init_vgachip(struct fb_info *info); @@ -587,7 +548,6 @@ static int cirrusfb_check_var(struct fb_var_screeninfo *var, } static int cirrusfb_decode_var(const struct fb_var_screeninfo *var, - struct cirrusfb_regs *regs, struct fb_info *info) { long freq; @@ -628,7 +588,7 @@ static int cirrusfb_decode_var(const struct fb_var_screeninfo *var, dev_dbg(info->device, "desired pixclock: %ld kHz\n", freq); maxclock = cirrusfb_board_info[cinfo->btype].maxclock[maxclockidx]; - regs->multiplexing = 0; + cinfo->multiplexing = 0; /* If the frequency is greater than we can support, we might be able * to use multiplexing for the video mode */ @@ -636,7 +596,7 @@ static int cirrusfb_decode_var(const struct fb_var_screeninfo *var, switch (cinfo->btype) { case BT_ALPINE: case BT_GD5480: - regs->multiplexing = 1; + cinfo->multiplexing = 1; break; default: @@ -691,7 +651,6 @@ static int cirrusfb_set_par_foo(struct fb_info *info) { struct cirrusfb_info *cinfo = info->par; struct fb_var_screeninfo *var = &info->var; - struct cirrusfb_regs regs; u8 __iomem *regbase = cinfo->regbase; unsigned char tmp; int err; @@ -709,7 +668,7 @@ static int cirrusfb_set_par_foo(struct fb_info *info) init_vgachip(info); - err = cirrusfb_decode_var(var, ®s, info); + err = cirrusfb_decode_var(var, info); if (err) { /* should never happen */ dev_dbg(info->device, "mode change aborted. invalid var.\n"); @@ -753,7 +712,7 @@ static int cirrusfb_set_par_foo(struct fb_info *info) vsyncend /= 2; vdispend /= 2; } - if (regs.multiplexing) { + if (cinfo->multiplexing) { htotal /= 2; hsyncstart /= 2; hsyncend /= 2; @@ -964,7 +923,7 @@ static int cirrusfb_set_par_foo(struct fb_info *info) case BT_ALPINE: case BT_GD5480: vga_wseq(regbase, CL_SEQR7, - regs.multiplexing ? + cinfo->multiplexing ? bi->sr07_1bpp_mux : bi->sr07_1bpp); break; @@ -1018,7 +977,7 @@ static int cirrusfb_set_par_foo(struct fb_info *info) /* pixel mask: pass-through for first plane */ WGen(cinfo, VGA_PEL_MSK, 0x01); - if (regs.multiplexing) + if (cinfo->multiplexing) /* hidden dac reg: 1280x1024 */ WHDR(cinfo, 0x4a); else @@ -1047,7 +1006,7 @@ static int cirrusfb_set_par_foo(struct fb_info *info) case BT_ALPINE: case BT_GD5480: vga_wseq(regbase, CL_SEQR7, - regs.multiplexing ? + cinfo->multiplexing ? bi->sr07_8bpp_mux : bi->sr07_8bpp); break; @@ -1101,18 +1060,12 @@ static int cirrusfb_set_par_foo(struct fb_info *info) /* mode register: 256 color mode */ vga_wgfx(regbase, VGA_GFX_MODE, 64); - /* pixel mask: pass-through all planes */ - WGen(cinfo, VGA_PEL_MSK, 0xff); - if (regs.multiplexing) + if (cinfo->multiplexing) /* hidden dac reg: 1280x1024 */ WHDR(cinfo, 0x4a); else /* hidden dac: nothing */ WHDR(cinfo, 0); - /* memory mode: chain4, ext. memory */ - vga_wseq(regbase, VGA_SEQ_MEMORY_MODE, 0x0a); - /* plane mask: enable writing to all 4 planes */ - vga_wseq(regbase, VGA_SEQ_PLANE_WRITE, 0xff); } /****************************************************** @@ -1177,18 +1130,12 @@ static int cirrusfb_set_par_foo(struct fb_info *info) /* mode register: 256 color mode */ vga_wgfx(regbase, VGA_GFX_MODE, 64); - /* pixel mask: pass-through all planes */ - WGen(cinfo, VGA_PEL_MSK, 0xff); #ifdef CONFIG_PCI WHDR(cinfo, 0xc1); /* Copy Xbh */ #elif defined(CONFIG_ZORRO) /* FIXME: CONFIG_PCI and CONFIG_ZORRO may be defined both */ WHDR(cinfo, 0xa0); /* hidden dac reg: nothing special */ #endif - /* memory mode: chain4, ext. memory */ - vga_wseq(regbase, VGA_SEQ_MEMORY_MODE, 0x0a); - /* plane mask: enable writing to all 4 planes */ - vga_wseq(regbase, VGA_SEQ_PLANE_WRITE, 0xff); } /****************************************************** @@ -1253,14 +1200,8 @@ static int cirrusfb_set_par_foo(struct fb_info *info) /* mode register: 256 color mode */ vga_wgfx(regbase, VGA_GFX_MODE, 64); - /* pixel mask: pass-through all planes */ - WGen(cinfo, VGA_PEL_MSK, 0xff); /* hidden dac reg: 8-8-8 mode (24 or 32) */ WHDR(cinfo, 0xc5); - /* memory mode: chain4, ext. memory */ - vga_wseq(regbase, VGA_SEQ_MEMORY_MODE, 0x0a); - /* plane mask: enable writing to all 4 planes */ - vga_wseq(regbase, VGA_SEQ_PLANE_WRITE, 0xff); } /****************************************************** @@ -1309,48 +1250,13 @@ static int cirrusfb_set_par_foo(struct fb_info *info) } - /* text cursor location high */ - vga_wcrt(regbase, VGA_CRTC_CURSOR_HI, 0); - /* text cursor location low */ - vga_wcrt(regbase, VGA_CRTC_CURSOR_LO, 0); - /* underline row scanline = at very bottom */ - vga_wcrt(regbase, VGA_CRTC_UNDERLINE, 0); - - /* controller mode */ - vga_wattr(regbase, VGA_ATC_MODE, 1); - /* overscan (border) color */ - vga_wattr(regbase, VGA_ATC_OVERSCAN, 0); - /* color plane enable */ - vga_wattr(regbase, VGA_ATC_PLANE_ENABLE, 15); /* pixel panning */ vga_wattr(regbase, CL_AR33, 0); - /* color select */ - vga_wattr(regbase, VGA_ATC_COLOR_PAGE, 0); /* [ EGS: SetOffset(); ] */ /* From SetOffset(): Turn on VideoEnable bit in Attribute controller */ AttrOn(cinfo); - /* set/reset register */ - vga_wgfx(regbase, VGA_GFX_SR_VALUE, 0); - /* set/reset enable */ - vga_wgfx(regbase, VGA_GFX_SR_ENABLE, 0); - /* color compare */ - vga_wgfx(regbase, VGA_GFX_COMPARE_VALUE, 0); - /* data rotate */ - vga_wgfx(regbase, VGA_GFX_DATA_ROTATE, 0); - /* read map select */ - vga_wgfx(regbase, VGA_GFX_PLANE_READ, 0); - /* miscellaneous register */ - vga_wgfx(regbase, VGA_GFX_MISC, 1); - /* color don't care */ - vga_wgfx(regbase, VGA_GFX_COMPARE_MASK, 15); - /* bit mask */ - vga_wgfx(regbase, VGA_GFX_BIT_MASK, 255); - - /* graphics cursor attributes: nothing special */ - vga_wseq(regbase, CL_SEQR12, 0x0); - if (cinfo->btype == BT_LAGUNA) { /* no tiles */ fb_writew(control | 0x1000, cinfo->laguna_mmio + 0x402); @@ -1369,8 +1275,6 @@ static int cirrusfb_set_par_foo(struct fb_info *info) vga_wseq(regbase, VGA_SEQ_CLOCK_MODE, tmp); dev_dbg(info->device, "CL_SEQR1: %d\n", tmp); - cinfo->currentmode = regs; - /* pan to requested offset */ cirrusfb_pan_display(var, info); @@ -1443,9 +1347,6 @@ static int cirrusfb_pan_display(struct fb_var_screeninfo *var, if (var->vmode & FB_VMODE_YWRAP) return -EINVAL; - info->var.xoffset = var->xoffset; - info->var.yoffset = var->yoffset; - xoffset = var->xoffset * info->var.bits_per_pixel / 8; yoffset = var->yoffset; @@ -1480,8 +1381,11 @@ static int cirrusfb_pan_display(struct fb_var_screeninfo *var, vga_wcrt(cinfo->regbase, CL_CRT1B, tmp); /* construct bit 19 of screen start address */ - if (cirrusfb_board_info[cinfo->btype].scrn_start_bit19) - vga_wcrt(cinfo->regbase, CL_CRT1D, (base >> 12) & 0x80); + if (cirrusfb_board_info[cinfo->btype].scrn_start_bit19) { + tmp = vga_rcrt(cinfo->regbase, CL_CRT1D) & ~0x80; + tmp |= (base >> 12) & 0x80; + vga_wcrt(cinfo->regbase, CL_CRT1D, tmp); + } /* write pixel panning value to AR33; this does not quite work in 8bpp * @@ -1670,8 +1574,8 @@ static void init_vgachip(struct fb_info *info) vga_wseq(cinfo->regbase, VGA_SEQ_PLANE_WRITE, 0xff); /* character map select: doesn't even matter in gx mode */ vga_wseq(cinfo->regbase, VGA_SEQ_CHARACTER_MAP, 0x00); - /* memory mode: chain-4, no odd/even, ext. memory */ - vga_wseq(cinfo->regbase, VGA_SEQ_MEMORY_MODE, 0x0e); + /* memory mode: chain4, ext. memory */ + vga_wseq(cinfo->regbase, VGA_SEQ_MEMORY_MODE, 0x0a); /* controller-internal base address of video memory */ if (bi->init_sr07) @@ -1784,7 +1688,6 @@ static void init_vgachip(struct fb_info *info) vga_wattr(cinfo->regbase, VGA_ATC_OVERSCAN, 0x00); /* Color Plane enable: Enable all 4 planes */ vga_wattr(cinfo->regbase, VGA_ATC_PLANE_ENABLE, 0x0f); -/* ### vga_wattr(cinfo->regbase, CL_AR33, 0x00); * Pixel Panning: - */ /* Color Select: - */ vga_wattr(cinfo->regbase, VGA_ATC_COLOR_PAGE, 0x00); @@ -2063,6 +1966,21 @@ static void cirrusfb_zorro_unmap(struct fb_info *info) } #endif /* CONFIG_ZORRO */ +/* function table of the above functions */ +static struct fb_ops cirrusfb_ops = { + .owner = THIS_MODULE, + .fb_open = cirrusfb_open, + .fb_release = cirrusfb_release, + .fb_setcolreg = cirrusfb_setcolreg, + .fb_check_var = cirrusfb_check_var, + .fb_set_par = cirrusfb_set_par, + .fb_pan_display = cirrusfb_pan_display, + .fb_blank = cirrusfb_blank, + .fb_fillrect = cirrusfb_fillrect, + .fb_copyarea = cirrusfb_copyarea, + .fb_imageblit = cirrusfb_imageblit, +}; + static int __devinit cirrusfb_set_fbinfo(struct fb_info *info) { struct cirrusfb_info *cinfo = info->par; @@ -2111,12 +2029,9 @@ static int __devinit cirrusfb_register(struct fb_info *info) { struct cirrusfb_info *cinfo = info->par; int err; - enum cirrus_board btype; - - btype = cinfo->btype; /* sanity checks */ - assert(btype != BT_NONE); + assert(cinfo->btype != BT_NONE); /* set all the vital stuff */ cirrusfb_set_fbinfo(info); @@ -2132,7 +2047,7 @@ static int __devinit cirrusfb_register(struct fb_info *info) info->var.activate = FB_ACTIVATE_NOW; - err = cirrusfb_decode_var(&info->var, &cinfo->currentmode, info); + err = cirrusfb_decode_var(&info->var, info); if (err < 0) { /* should never happen */ dev_dbg(info->device, @@ -2174,7 +2089,6 @@ static int __devinit cirrusfb_pci_register(struct pci_dev *pdev, { struct cirrusfb_info *cinfo; struct fb_info *info; - enum cirrus_board btype; unsigned long board_addr, board_size; int ret; @@ -2192,11 +2106,11 @@ static int __devinit cirrusfb_pci_register(struct pci_dev *pdev, } cinfo = info->par; - cinfo->btype = btype = (enum cirrus_board) ent->driver_data; + cinfo->btype = (enum cirrus_board) ent->driver_data; dev_dbg(info->device, " Found PCI device, base address 0 is 0x%Lx, btype set to %d\n", - (unsigned long long)pdev->resource[0].start, btype); + (unsigned long long)pdev->resource[0].start, cinfo->btype); dev_dbg(info->device, " base address 1 is 0x%Lx\n", (unsigned long long)pdev->resource[1].start); @@ -2219,7 +2133,7 @@ static int __devinit cirrusfb_pci_register(struct pci_dev *pdev, dev_dbg(info->device, "Board address: 0x%lx, register address: 0x%lx\n", board_addr, info->fix.mmio_start); - board_size = (btype == BT_GD5480) ? + board_size = (cinfo->btype == BT_GD5480) ? 32 * MB_ : cirrusfb_get_memsize(info, cinfo->regbase); ret = pci_request_regions(pdev, "cirrusfb"); @@ -2425,27 +2339,6 @@ static struct zorro_driver cirrusfb_zorro_driver = { }; #endif /* CONFIG_ZORRO */ -static int __init cirrusfb_init(void) -{ - int error = 0; - -#ifndef MODULE - char *option = NULL; - - if (fb_get_options("cirrusfb", &option)) - return -ENODEV; - cirrusfb_setup(option); -#endif - -#ifdef CONFIG_ZORRO - error |= zorro_register_driver(&cirrusfb_zorro_driver); -#endif -#ifdef CONFIG_PCI - error |= pci_register_driver(&cirrusfb_pci_driver); -#endif - return error; -} - #ifndef MODULE static int __init cirrusfb_setup(char *options) { @@ -2477,6 +2370,27 @@ MODULE_AUTHOR("Copyright 1999,2000 Jeff Garzik "); MODULE_DESCRIPTION("Accelerated FBDev driver for Cirrus Logic chips"); MODULE_LICENSE("GPL"); +static int __init cirrusfb_init(void) +{ + int error = 0; + +#ifndef MODULE + char *option = NULL; + + if (fb_get_options("cirrusfb", &option)) + return -ENODEV; + cirrusfb_setup(option); +#endif + +#ifdef CONFIG_ZORRO + error |= zorro_register_driver(&cirrusfb_zorro_driver); +#endif +#ifdef CONFIG_PCI + error |= pci_register_driver(&cirrusfb_pci_driver); +#endif + return error; +} + static void __exit cirrusfb_exit(void) { #ifdef CONFIG_PCI @@ -2683,7 +2597,7 @@ static void cirrusfb_WaitBLT(u8 __iomem *regbase) { /* now busy-wait until we're done */ while (vga_rgfx(regbase, CL_GR31) & 0x08) - /* do nothing */ ; + cpu_relax(); } /******************************************************************* From 1b48cb563d59e03dbf530174f30c0ed3b6fba513 Mon Sep 17 00:00:00 2001 From: Krzysztof Helt Date: Tue, 31 Mar 2009 15:25:08 -0700 Subject: [PATCH 424/478] cirrusfb: Laguna chipset 8bpp fix Fix 8bpp mode by adding handling of the Laguna chipsets to various places and stop trashing a HDR register which probably does not exist on the Laguna. Fix compilation warnings about uninitialized variables also. Finally, all 8bpp, 16bpp and 32bpp modes work on the Laguna chipset. Signed-off-by: Krzysztof Helt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/video/cirrusfb.c | 34 +++++++++++++++++++++------------- include/video/cirrus.h | 1 - 2 files changed, 21 insertions(+), 14 deletions(-) diff --git a/drivers/video/cirrusfb.c b/drivers/video/cirrusfb.c index 9e52db7d3331..12f45520e5d3 100644 --- a/drivers/video/cirrusfb.c +++ b/drivers/video/cirrusfb.c @@ -660,7 +660,7 @@ static int cirrusfb_set_par_foo(struct fb_info *info) int yres, vdispend, vsyncstart, vsyncend, vtotal; long freq; int nom, den, div; - unsigned int control, format, threshold; + unsigned int control = 0, format = 0, threshold = 0; dev_dbg(info->device, "Requested mode: %dx%dx%d\n", var->xres, var->yres, var->bits_per_pixel); @@ -842,8 +842,7 @@ static int cirrusfb_set_par_foo(struct fb_info *info) threshold = fb_readw(cinfo->laguna_mmio + 0xea); control &= ~0x6800; format = 0; - threshold &= 0xffe0; - threshold &= 0x3fbf; + threshold &= 0xffe0 & 0x3fbf; } if (nom) { tmp = den << 1; @@ -893,6 +892,8 @@ static int cirrusfb_set_par_foo(struct fb_info *info) tmp |= 0x40; if (var->sync & FB_SYNC_VERT_HIGH_ACT) tmp |= 0x80; + if (cinfo->btype == BT_LAGUNA) + tmp |= 0xc; WGen(cinfo, VGA_MIS_W, tmp); /* Screen A Preset Row-Scan register */ @@ -1228,9 +1229,7 @@ static int cirrusfb_set_par_foo(struct fb_info *info) if (cirrusfb_board_info[cinfo->btype].scrn_start_bit19) vga_wcrt(regbase, CL_CRT1D, (pitch >> 9) & 1); - if (cinfo->btype == BT_LAGUNA || - cinfo->btype == BT_GD5480) { - + if (cinfo->btype == BT_LAGUNA) { tmp = 0; if ((htotal + 5) & 256) tmp |= 128; @@ -1360,7 +1359,8 @@ static int cirrusfb_pan_display(struct fb_var_screeninfo *var, xpix = (unsigned char) ((xoffset % 4) * 2); } - cirrusfb_WaitBLT(cinfo->regbase); /* make sure all the BLT's are done */ + if (cinfo->btype != BT_LAGUNA) + cirrusfb_WaitBLT(cinfo->regbase); /* lower 8 + 8 bits of screen start address */ vga_wcrt(cinfo->regbase, VGA_CRTC_START_LO, @@ -1394,7 +1394,8 @@ static int cirrusfb_pan_display(struct fb_var_screeninfo *var, if (info->var.bits_per_pixel == 1) vga_wattr(cinfo->regbase, CL_AR33, xpix); - cirrusfb_WaitBLT(cinfo->regbase); + if (cinfo->btype != BT_LAGUNA) + cirrusfb_WaitBLT(cinfo->regbase); return 0; } @@ -1513,6 +1514,7 @@ static void init_vgachip(struct fb_info *info) vga_wgfx(cinfo->regbase, CL_GR2F, 0x00); break; + case BT_LAGUNA: case BT_ALPINE: /* Nothing to do to reset the board. */ break; @@ -1538,7 +1540,7 @@ static void init_vgachip(struct fb_info *info) WGen(cinfo, CL_VSSM2, 0x01); /* reset sequencer logic */ - vga_wseq(cinfo->regbase, CL_SEQR0, 0x03); + vga_wseq(cinfo->regbase, VGA_SEQ_RESET, 0x03); /* FullBandwidth (video off) and 8/9 dot clock */ vga_wseq(cinfo->regbase, VGA_SEQ_CLOCK_MODE, 0x21); @@ -1560,6 +1562,7 @@ static void init_vgachip(struct fb_info *info) vga_wseq(cinfo->regbase, CL_SEQRF, 0x98); break; case BT_ALPINE: + case BT_LAGUNA: break; case BT_SD64: vga_wseq(cinfo->regbase, CL_SEQRF, 0xb8); @@ -1648,7 +1651,8 @@ static void init_vgachip(struct fb_info *info) vga_wgfx(cinfo->regbase, VGA_GFX_COMPARE_MASK, 0x0f); /* Bit Mask: no mask at all */ vga_wgfx(cinfo->regbase, VGA_GFX_BIT_MASK, 0xff); - if (cinfo->btype == BT_ALPINE) + + if (cinfo->btype == BT_ALPINE || cinfo->btype == BT_LAGUNA) /* (5434 can't have bit 3 set for bitblt) */ vga_wgfx(cinfo->regbase, CL_GRB, 0x20); else @@ -1845,7 +1849,8 @@ static void cirrusfb_imageblit(struct fb_info *info, { struct cirrusfb_info *cinfo = info->par; - cirrusfb_WaitBLT(cinfo->regbase); + if (cinfo->btype != BT_LAGUNA) + cirrusfb_WaitBLT(cinfo->regbase); cfb_imageblit(info, image); } @@ -1992,7 +1997,7 @@ static int __devinit cirrusfb_set_fbinfo(struct fb_info *info) | FBINFO_HWACCEL_YPAN | FBINFO_HWACCEL_FILLRECT | FBINFO_HWACCEL_COPYAREA; - if (noaccel) + if (noaccel || cinfo->btype == BT_LAGUNA) info->flags |= FBINFO_HWACCEL_DISABLED; info->fbops = &cirrusfb_ops; if (cinfo->btype == BT_GD5480) { @@ -2481,6 +2486,8 @@ static void WHDR(const struct cirrusfb_info *cinfo, unsigned char val) { unsigned char dummy; + if (cinfo->btype == BT_LAGUNA) + return; if (cinfo->btype == BT_PICASSO) { /* Klaus' hint for correct access to HDR on some boards */ /* first write 0 to pixel mask (3c6) */ @@ -2548,7 +2555,8 @@ static void WClut(struct cirrusfb_info *cinfo, unsigned char regnum, unsigned ch vga_w(cinfo->regbase, VGA_PEL_IW, regnum); if (cinfo->btype == BT_PICASSO || cinfo->btype == BT_PICASSO4 || - cinfo->btype == BT_ALPINE || cinfo->btype == BT_GD5480) { + cinfo->btype == BT_ALPINE || cinfo->btype == BT_GD5480 || + cinfo->btype == BT_LAGUNA) { /* but DAC data register IS, at least for Picasso II */ if (cinfo->btype == BT_PICASSO) data += 0xfff; diff --git a/include/video/cirrus.h b/include/video/cirrus.h index a1b7985b6b27..9a5e9ee30782 100644 --- a/include/video/cirrus.h +++ b/include/video/cirrus.h @@ -32,7 +32,6 @@ #define CL_VSSM2 0x3c3 /* Motherboard Sleep */ /*** VGA Sequencer Registers ***/ -#define CL_SEQR0 0x0 /* Reset */ /* the following are from the "extension registers" group */ #define CL_SEQR6 0x6 /* Unlock ALL Extensions */ #define CL_SEQR7 0x7 /* Extended Sequencer Mode */ From 99a4584752bb41330342a427d014482525de7433 Mon Sep 17 00:00:00 2001 From: Krzysztof Helt Date: Tue, 31 Mar 2009 15:25:09 -0700 Subject: [PATCH 425/478] cirrusfb: check_var improvements Break cirrusfb_decode_var() function into two parts: cirrusfb_check_pixclock() which can be called from the cirrusfb_check_var() aand merge rest into the cirrusfb_set_par_foo(). This allows rejecting modes with too high pixclock before before any change to hardware state (and a console is messed up). Also, fix RGB field's lengths for 8bpp modes to correct ones so X11 works with fbdev driver with cirrusfb. Kill some redundant function calls or register loads. Signed-off-by: Krzysztof Helt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/video/cirrusfb.c | 197 +++++++++++++++------------------------ 1 file changed, 75 insertions(+), 122 deletions(-) diff --git a/drivers/video/cirrusfb.c b/drivers/video/cirrusfb.c index 12f45520e5d3..bab713b63a0c 100644 --- a/drivers/video/cirrusfb.c +++ b/drivers/video/cirrusfb.c @@ -432,6 +432,53 @@ static int cirrusfb_check_mclk(struct fb_info *info, long freq) return 0; } +static int cirrusfb_check_pixclock(const struct fb_var_screeninfo *var, + struct fb_info *info) +{ + long freq; + long maxclock; + struct cirrusfb_info *cinfo = info->par; + unsigned maxclockidx = var->bits_per_pixel >> 3; + + /* convert from ps to kHz */ + freq = PICOS2KHZ(var->pixclock); + + dev_dbg(info->device, "desired pixclock: %ld kHz\n", freq); + + maxclock = cirrusfb_board_info[cinfo->btype].maxclock[maxclockidx]; + cinfo->multiplexing = 0; + + /* If the frequency is greater than we can support, we might be able + * to use multiplexing for the video mode */ + if (freq > maxclock) { + switch (cinfo->btype) { + case BT_ALPINE: + case BT_GD5480: + cinfo->multiplexing = 1; + break; + + default: + dev_err(info->device, + "Frequency greater than maxclock (%ld kHz)\n", + maxclock); + return -EINVAL; + } + } +#if 0 + /* TODO: If we have a 1MB 5434, we need to put ourselves in a mode where + * the VCLK is double the pixel clock. */ + switch (var->bits_per_pixel) { + case 16: + case 32: + if (var->xres <= 800) + /* Xbh has this type of clock for 32-bit */ + freq /= 2; + break; + } +#endif + return 0; +} + static int cirrusfb_check_var(struct fb_var_screeninfo *var, struct fb_info *info) { @@ -449,7 +496,7 @@ static int cirrusfb_check_var(struct fb_var_screeninfo *var, case 8: var->red.offset = 0; - var->red.length = 6; + var->red.length = 8; var->green = var->red; var->blue = var->red; break; @@ -513,7 +560,6 @@ static int cirrusfb_check_var(struct fb_var_screeninfo *var, return -EINVAL; } - if (var->xoffset < 0) var->xoffset = 0; if (var->yoffset < 0) @@ -544,80 +590,9 @@ static int cirrusfb_check_var(struct fb_var_screeninfo *var, return -EINVAL; } - return 0; -} + if (cirrusfb_check_pixclock(var, info)) + return -EINVAL; -static int cirrusfb_decode_var(const struct fb_var_screeninfo *var, - struct fb_info *info) -{ - long freq; - long maxclock; - int maxclockidx = var->bits_per_pixel >> 3; - struct cirrusfb_info *cinfo = info->par; - - switch (var->bits_per_pixel) { - case 1: - info->fix.line_length = var->xres_virtual / 8; - info->fix.visual = FB_VISUAL_MONO10; - break; - - case 8: - info->fix.line_length = var->xres_virtual; - info->fix.visual = FB_VISUAL_PSEUDOCOLOR; - break; - - case 16: - case 32: - info->fix.line_length = var->xres_virtual * maxclockidx; - info->fix.visual = FB_VISUAL_TRUECOLOR; - break; - - default: - dev_dbg(info->device, - "Unsupported bpp size: %d\n", var->bits_per_pixel); - assert(false); - /* should never occur */ - break; - } - - info->fix.type = FB_TYPE_PACKED_PIXELS; - - /* convert from ps to kHz */ - freq = PICOS2KHZ(var->pixclock); - - dev_dbg(info->device, "desired pixclock: %ld kHz\n", freq); - - maxclock = cirrusfb_board_info[cinfo->btype].maxclock[maxclockidx]; - cinfo->multiplexing = 0; - - /* If the frequency is greater than we can support, we might be able - * to use multiplexing for the video mode */ - if (freq > maxclock) { - switch (cinfo->btype) { - case BT_ALPINE: - case BT_GD5480: - cinfo->multiplexing = 1; - break; - - default: - dev_err(info->device, - "Frequency greater than maxclock (%ld kHz)\n", - maxclock); - return -EINVAL; - } - } -#if 0 - /* TODO: If we have a 1MB 5434, we need to put ourselves in a mode where - * the VCLK is double the pixel clock. */ - switch (var->bits_per_pixel) { - case 16: - case 32: - if (var->xres <= 800) - /* Xbh has this type of clock for 32-bit */ - freq /= 2; - break; - } -#endif return 0; } @@ -653,7 +628,6 @@ static int cirrusfb_set_par_foo(struct fb_info *info) struct fb_var_screeninfo *var = &info->var; u8 __iomem *regbase = cinfo->regbase; unsigned char tmp; - int err; int pitch; const struct cirrusfb_board_info_rec *bi; int hdispend, hsyncstart, hsyncend, htotal; @@ -664,17 +638,29 @@ static int cirrusfb_set_par_foo(struct fb_info *info) dev_dbg(info->device, "Requested mode: %dx%dx%d\n", var->xres, var->yres, var->bits_per_pixel); - dev_dbg(info->device, "pixclock: %d\n", var->pixclock); + + switch (var->bits_per_pixel) { + case 1: + info->fix.line_length = var->xres_virtual / 8; + info->fix.visual = FB_VISUAL_MONO10; + break; + + case 8: + info->fix.line_length = var->xres_virtual; + info->fix.visual = FB_VISUAL_PSEUDOCOLOR; + break; + + case 16: + case 32: + info->fix.line_length = var->xres_virtual * + var->bits_per_pixel >> 3; + info->fix.visual = FB_VISUAL_TRUECOLOR; + break; + } + info->fix.type = FB_TYPE_PACKED_PIXELS; init_vgachip(info); - err = cirrusfb_decode_var(var, info); - if (err) { - /* should never happen */ - dev_dbg(info->device, "mode change aborted. invalid var.\n"); - return -EINVAL; - } - bi = &cirrusfb_board_info[cinfo->btype]; hsyncstart = var->xres + var->right_margin; @@ -873,9 +859,6 @@ static int cirrusfb_set_par_foo(struct fb_info *info) * address wrap, no compat. */ vga_wcrt(regbase, VGA_CRTC_MODE, 0xc3); -/* HAEH? vga_wcrt(regbase, VGA_CRTC_V_SYNC_END, 0x20); - * previously: 0x00 unlock VGA_CRTC_H_TOTAL..CRT7 */ - /* don't know if it would hurt to also program this if no interlaced */ /* mode is used, but I feel better this way.. :-) */ if (var->vmode & FB_VMODE_INTERLACED) @@ -883,8 +866,6 @@ static int cirrusfb_set_par_foo(struct fb_info *info) else vga_wcrt(regbase, VGA_CRTC_REGS, 0x00); /* interlace control */ - vga_wseq(regbase, VGA_SEQ_CHARACTER_MAP, 0); - /* adjust horizontal/vertical sync type (low/high) */ /* enable display memory & CRTC I/O address for color mode */ tmp = 0x03; @@ -896,8 +877,6 @@ static int cirrusfb_set_par_foo(struct fb_info *info) tmp |= 0xc; WGen(cinfo, VGA_MIS_W, tmp); - /* Screen A Preset Row-Scan register */ - vga_wcrt(regbase, VGA_CRTC_PRESET_ROW, 0); /* text cursor on and start line */ vga_wcrt(regbase, VGA_CRTC_CURSOR_START, 0); /* text cursor end line */ @@ -1248,7 +1227,6 @@ static int cirrusfb_set_par_foo(struct fb_info *info) dev_dbg(info->device, "CRT1e: %d\n", tmp); } - /* pixel panning */ vga_wattr(regbase, CL_AR33, 0); @@ -1274,9 +1252,6 @@ static int cirrusfb_set_par_foo(struct fb_info *info) vga_wseq(regbase, VGA_SEQ_CLOCK_MODE, tmp); dev_dbg(info->device, "CL_SEQR1: %d\n", tmp); - /* pan to requested offset */ - cirrusfb_pan_display(var, info); - #ifdef CIRRUSFB_DEBUG cirrusfb_dbg_reg_dump(info, NULL); #endif @@ -1332,8 +1307,7 @@ static int cirrusfb_setcolreg(unsigned regno, unsigned red, unsigned green, static int cirrusfb_pan_display(struct fb_var_screeninfo *var, struct fb_info *info) { - int xoffset = 0; - int yoffset = 0; + int xoffset; unsigned long base; unsigned char tmp, xpix; struct cirrusfb_info *cinfo = info->par; @@ -1347,9 +1321,8 @@ static int cirrusfb_pan_display(struct fb_var_screeninfo *var, return -EINVAL; xoffset = var->xoffset * info->var.bits_per_pixel / 8; - yoffset = var->yoffset; - base = yoffset * info->fix.line_length + xoffset; + base = var->yoffset * info->fix.line_length + xoffset; if (info->var.bits_per_pixel == 1) { /* base is already correct */ @@ -1363,10 +1336,8 @@ static int cirrusfb_pan_display(struct fb_var_screeninfo *var, cirrusfb_WaitBLT(cinfo->regbase); /* lower 8 + 8 bits of screen start address */ - vga_wcrt(cinfo->regbase, VGA_CRTC_START_LO, - (unsigned char) (base & 0xff)); - vga_wcrt(cinfo->regbase, VGA_CRTC_START_HI, - (unsigned char) (base >> 8)); + vga_wcrt(cinfo->regbase, VGA_CRTC_START_LO, base & 0xff); + vga_wcrt(cinfo->regbase, VGA_CRTC_START_HI, (base >> 8) & 0xff); /* 0xf2 is %11110010, exclude tmp bits */ tmp = vga_rcrt(cinfo->regbase, CL_CRT1B) & 0xf2; @@ -1544,10 +1515,6 @@ static void init_vgachip(struct fb_info *info) /* FullBandwidth (video off) and 8/9 dot clock */ vga_wseq(cinfo->regbase, VGA_SEQ_CLOCK_MODE, 0x21); - /* polarity (-/-), disable access to display memory, - * VGA_CRTC_START_HI base address: color - */ - WGen(cinfo, VGA_MIS_W, 0xc1); /* "magic cookie" - doesn't make any sense to me.. */ /* vga_wgfx(cinfo->regbase, CL_GRA, 0xce); */ @@ -1614,10 +1581,6 @@ static void init_vgachip(struct fb_info *info) vga_wcrt(cinfo->regbase, VGA_CRTC_CURSOR_START, 0x20); /* Text cursor end: - */ vga_wcrt(cinfo->regbase, VGA_CRTC_CURSOR_END, 0x00); - /* Screen start address high: 0 */ - vga_wcrt(cinfo->regbase, VGA_CRTC_START_HI, 0x00); - /* Screen start address low: 0 */ - vga_wcrt(cinfo->regbase, VGA_CRTC_START_LO, 0x00); /* text cursor location high: 0 */ vga_wcrt(cinfo->regbase, VGA_CRTC_CURSOR_HI, 0x00); /* text cursor location low: 0 */ @@ -1625,10 +1588,6 @@ static void init_vgachip(struct fb_info *info) /* Underline Row scanline: - */ vga_wcrt(cinfo->regbase, VGA_CRTC_UNDERLINE, 0x00); - /* mode control: timing enable, byte mode, no compat modes */ - vga_wcrt(cinfo->regbase, VGA_CRTC_MODE, 0xc3); - /* Line Compare: not needed */ - vga_wcrt(cinfo->regbase, VGA_CRTC_LINE_COMPARE, 0x00); /* ### add 0x40 for text modes with > 30 MHz pixclock */ /* ext. display controls: ext.adr. wrap */ vga_wcrt(cinfo->regbase, CL_CRT1B, 0x02); @@ -1697,12 +1656,6 @@ static void init_vgachip(struct fb_info *info) WGen(cinfo, VGA_PEL_MSK, 0xff); /* Pixel mask: no mask */ - if (cinfo->btype != BT_ALPINE && cinfo->btype != BT_GD5480) - /* polarity (-/-), enable display mem, - * VGA_CRTC_START_HI i/o base = color - */ - WGen(cinfo, VGA_MIS_W, 0xc3); - /* BLT Start/status: Blitter reset */ vga_wgfx(cinfo->regbase, CL_GR31, 0x04); /* - " - : "end-of-reset" */ @@ -2052,7 +2005,7 @@ static int __devinit cirrusfb_register(struct fb_info *info) info->var.activate = FB_ACTIVATE_NOW; - err = cirrusfb_decode_var(&info->var, info); + err = cirrusfb_check_var(&info->var, info); if (err < 0) { /* should never happen */ dev_dbg(info->device, From 78d780e07247d52d3943b019bf9459bc9e95de1e Mon Sep 17 00:00:00 2001 From: Krzysztof Helt Date: Tue, 31 Mar 2009 15:25:10 -0700 Subject: [PATCH 426/478] cirrusfb: various Laguna fixes - The Laguna GD5465 (AGP) has one register more than non-AGP chips. Recognize the AGP version and write a tile control register only on the AGP version. Tested only on an AGP card. - Bump up RAMDAC frequencies after X11 code. This allow to drive a flat panel resolution 1680x1050 at 16bpp from the 4MB card. - Fix screen start address overflow bits on Laguna cards (CRT1D register). - Fix exit path in the cirrusfb_pci_register() in case of error. Signed-off-by: Krzysztof Helt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/video/cirrusfb.c | 84 +++++++++++++++++++++++++++------------- 1 file changed, 57 insertions(+), 27 deletions(-) diff --git a/drivers/video/cirrusfb.c b/drivers/video/cirrusfb.c index bab713b63a0c..cac4e2b0cbae 100644 --- a/drivers/video/cirrusfb.c +++ b/drivers/video/cirrusfb.c @@ -102,7 +102,8 @@ enum cirrus_board { BT_PICASSO4, /* GD5446 */ BT_ALPINE, /* GD543x/4x */ BT_GD5480, - BT_LAGUNA, /* GD546x */ + BT_LAGUNA, /* GD5462/64 */ + BT_LAGUNAB, /* GD5465 */ }; /* @@ -234,8 +235,18 @@ static const struct cirrusfb_board_info_rec { [BT_LAGUNA] = { .name = "CL Laguna", .maxclock = { - /* guess */ - 135100, 135100, 135100, 135100, 135100, + /* taken from X11 code */ + 170000, 170000, 170000, 170000, 135100, + }, + .init_sr07 = false, + .init_sr1f = false, + .scrn_start_bit19 = true, + }, + [BT_LAGUNAB] = { + .name = "CL Laguna AGP", + .maxclock = { + /* taken from X11 code */ + 170000, 250000, 170000, 170000, 135100, }, .init_sr07 = false, .init_sr1f = false, @@ -258,7 +269,7 @@ static struct pci_device_id cirrusfb_pci_table[] = { CHIP(PCI_DEVICE_ID_CIRRUS_5446, BT_PICASSO4), /* Picasso 4 is 5446 */ CHIP(PCI_DEVICE_ID_CIRRUS_5462, BT_LAGUNA), /* CL Laguna */ CHIP(PCI_DEVICE_ID_CIRRUS_5464, BT_LAGUNA), /* CL Laguna 3D */ - CHIP(PCI_DEVICE_ID_CIRRUS_5465, BT_LAGUNA), /* CL Laguna 3DA*/ + CHIP(PCI_DEVICE_ID_CIRRUS_5465, BT_LAGUNAB), /* CL Laguna 3DA*/ { 0, } }; MODULE_DEVICE_TABLE(pci, cirrusfb_pci_table); @@ -385,6 +396,11 @@ static void cirrusfb_dbg_print_regs(struct fb_info *info, /*****************************************************************************/ /*** BEGIN Interface Used by the World ***************************************/ +static inline int is_laguna(const struct cirrusfb_info *cinfo) +{ + return cinfo->btype == BT_LAGUNA || cinfo->btype == BT_LAGUNAB; +} + static int opencount; /*--- Open /dev/fbx ---------------------------------------------------------*/ @@ -814,13 +830,16 @@ static int cirrusfb_set_par_foo(struct fb_info *info) cirrusfb_set_mclk_as_source(info, divMCLK); } } - if (cinfo->btype == BT_LAGUNA) { + if (is_laguna(cinfo)) { long pcifc = fb_readl(cinfo->laguna_mmio + 0x3fc); unsigned char tile = fb_readb(cinfo->laguna_mmio + 0x407); unsigned short tile_control; - tile_control = fb_readw(cinfo->laguna_mmio + 0x2c4); - fb_writew(tile_control & ~0x80, cinfo->laguna_mmio + 0x2c4); + if (cinfo->btype == BT_LAGUNAB) { + tile_control = fb_readw(cinfo->laguna_mmio + 0x2c4); + tile_control &= ~0x80; + fb_writew(tile_control, cinfo->laguna_mmio + 0x2c4); + } fb_writel(pcifc | 0x10000000l, cinfo->laguna_mmio + 0x3fc); fb_writeb(tile & 0x3f, cinfo->laguna_mmio + 0x407); @@ -842,7 +861,7 @@ static int cirrusfb_set_par_foo(struct fb_info *info) dev_dbg(info->device, "CL_SEQR1B: %d\n", (int) tmp); /* Laguna chipset has reversed clock registers */ - if (cinfo->btype == BT_LAGUNA) { + if (is_laguna(cinfo)) { vga_wseq(regbase, CL_SEQRE, tmp); vga_wseq(regbase, CL_SEQR1E, nom); } else { @@ -873,7 +892,7 @@ static int cirrusfb_set_par_foo(struct fb_info *info) tmp |= 0x40; if (var->sync & FB_SYNC_VERT_HIGH_ACT) tmp |= 0x80; - if (cinfo->btype == BT_LAGUNA) + if (is_laguna(cinfo)) tmp |= 0xc; WGen(cinfo, VGA_MIS_W, tmp); @@ -908,6 +927,7 @@ static int cirrusfb_set_par_foo(struct fb_info *info) break; case BT_LAGUNA: + case BT_LAGUNAB: vga_wseq(regbase, CL_SEQR7, vga_rseq(regbase, CL_SEQR7) & ~0x01); break; @@ -947,6 +967,7 @@ static int cirrusfb_set_par_foo(struct fb_info *info) case BT_ALPINE: case BT_GD5480: case BT_LAGUNA: + case BT_LAGUNAB: /* do nothing */ break; @@ -991,6 +1012,7 @@ static int cirrusfb_set_par_foo(struct fb_info *info) break; case BT_LAGUNA: + case BT_LAGUNAB: vga_wseq(regbase, CL_SEQR7, vga_rseq(regbase, CL_SEQR7) | 0x01); threshold |= 0x10; @@ -1030,6 +1052,7 @@ static int cirrusfb_set_par_foo(struct fb_info *info) case BT_GD5480: case BT_LAGUNA: + case BT_LAGUNAB: /* do nothing */ break; @@ -1096,6 +1119,7 @@ static int cirrusfb_set_par_foo(struct fb_info *info) break; case BT_LAGUNA: + case BT_LAGUNAB: vga_wseq(regbase, CL_SEQR7, vga_rseq(regbase, CL_SEQR7) & ~0x01); control |= 0x2000; @@ -1166,6 +1190,7 @@ static int cirrusfb_set_par_foo(struct fb_info *info) break; case BT_LAGUNA: + case BT_LAGUNAB: vga_wseq(regbase, CL_SEQR7, vga_rseq(regbase, CL_SEQR7) & ~0x01); control |= 0x6000; @@ -1208,7 +1233,7 @@ static int cirrusfb_set_par_foo(struct fb_info *info) if (cirrusfb_board_info[cinfo->btype].scrn_start_bit19) vga_wcrt(regbase, CL_CRT1D, (pitch >> 9) & 1); - if (cinfo->btype == BT_LAGUNA) { + if (is_laguna(cinfo)) { tmp = 0; if ((htotal + 5) & 256) tmp |= 128; @@ -1234,7 +1259,7 @@ static int cirrusfb_set_par_foo(struct fb_info *info) /* From SetOffset(): Turn on VideoEnable bit in Attribute controller */ AttrOn(cinfo); - if (cinfo->btype == BT_LAGUNA) { + if (is_laguna(cinfo)) { /* no tiles */ fb_writew(control | 0x1000, cinfo->laguna_mmio + 0x402); fb_writew(format, cinfo->laguna_mmio + 0xc0); @@ -1332,7 +1357,7 @@ static int cirrusfb_pan_display(struct fb_var_screeninfo *var, xpix = (unsigned char) ((xoffset % 4) * 2); } - if (cinfo->btype != BT_LAGUNA) + if (!is_laguna(cinfo)) cirrusfb_WaitBLT(cinfo->regbase); /* lower 8 + 8 bits of screen start address */ @@ -1353,8 +1378,11 @@ static int cirrusfb_pan_display(struct fb_var_screeninfo *var, /* construct bit 19 of screen start address */ if (cirrusfb_board_info[cinfo->btype].scrn_start_bit19) { - tmp = vga_rcrt(cinfo->regbase, CL_CRT1D) & ~0x80; - tmp |= (base >> 12) & 0x80; + tmp = vga_rcrt(cinfo->regbase, CL_CRT1D); + if (is_laguna(cinfo)) + tmp = (tmp & ~0x18) | ((base >> 16) & 0x18); + else + tmp = (tmp & ~0x80) | ((base >> 12) & 0x80); vga_wcrt(cinfo->regbase, CL_CRT1D, tmp); } @@ -1365,7 +1393,7 @@ static int cirrusfb_pan_display(struct fb_var_screeninfo *var, if (info->var.bits_per_pixel == 1) vga_wattr(cinfo->regbase, CL_AR33, xpix); - if (cinfo->btype != BT_LAGUNA) + if (!is_laguna(cinfo)) cirrusfb_WaitBLT(cinfo->regbase); return 0; @@ -1486,6 +1514,7 @@ static void init_vgachip(struct fb_info *info) break; case BT_LAGUNA: + case BT_LAGUNAB: case BT_ALPINE: /* Nothing to do to reset the board. */ break; @@ -1530,6 +1559,7 @@ static void init_vgachip(struct fb_info *info) break; case BT_ALPINE: case BT_LAGUNA: + case BT_LAGUNAB: break; case BT_SD64: vga_wseq(cinfo->regbase, CL_SEQRF, 0xb8); @@ -1611,7 +1641,7 @@ static void init_vgachip(struct fb_info *info) /* Bit Mask: no mask at all */ vga_wgfx(cinfo->regbase, VGA_GFX_BIT_MASK, 0xff); - if (cinfo->btype == BT_ALPINE || cinfo->btype == BT_LAGUNA) + if (cinfo->btype == BT_ALPINE || is_laguna(cinfo)) /* (5434 can't have bit 3 set for bitblt) */ vga_wgfx(cinfo->regbase, CL_GRB, 0x20); else @@ -1802,7 +1832,7 @@ static void cirrusfb_imageblit(struct fb_info *info, { struct cirrusfb_info *cinfo = info->par; - if (cinfo->btype != BT_LAGUNA) + if (!is_laguna(cinfo)) cirrusfb_WaitBLT(cinfo->regbase); cfb_imageblit(info, image); } @@ -1831,7 +1861,7 @@ static unsigned int __devinit cirrusfb_get_memsize(struct fb_info *info, unsigned long mem; struct cirrusfb_info *cinfo = info->par; - if (cinfo->btype == BT_LAGUNA) { + if (is_laguna(cinfo)) { unsigned char SR14 = vga_rseq(regbase, CL_SEQR14); mem = ((SR14 & 7) + 1) << 20; @@ -1950,7 +1980,7 @@ static int __devinit cirrusfb_set_fbinfo(struct fb_info *info) | FBINFO_HWACCEL_YPAN | FBINFO_HWACCEL_FILLRECT | FBINFO_HWACCEL_COPYAREA; - if (noaccel || cinfo->btype == BT_LAGUNA) + if (noaccel || is_laguna(cinfo)) info->flags |= FBINFO_HWACCEL_DISABLED; info->fbops = &cirrusfb_ops; if (cinfo->btype == BT_GD5480) { @@ -2060,7 +2090,7 @@ static int __devinit cirrusfb_pci_register(struct pci_dev *pdev, if (!info) { printk(KERN_ERR "cirrusfb: could not allocate memory\n"); ret = -ENOMEM; - goto err_disable; + goto err_out; } cinfo = info->par; @@ -2127,10 +2157,11 @@ static int __devinit cirrusfb_pci_register(struct pci_dev *pdev, pci_set_drvdata(pdev, info); ret = cirrusfb_register(info); - if (ret) - iounmap(info->screen_base); - return ret; + if (!ret) + return 0; + pci_set_drvdata(pdev, NULL); + iounmap(info->screen_base); err_release_legacy: if (release_io_ports) release_region(0x3C0, 32); @@ -2140,10 +2171,9 @@ err_release_regions: #endif pci_release_regions(pdev); err_release_fb: - if (cinfo->laguna_mmio == NULL) + if (cinfo->laguna_mmio != NULL) iounmap(cinfo->laguna_mmio); framebuffer_release(info); -err_disable: err_out: return ret; } @@ -2439,7 +2469,7 @@ static void WHDR(const struct cirrusfb_info *cinfo, unsigned char val) { unsigned char dummy; - if (cinfo->btype == BT_LAGUNA) + if (is_laguna(cinfo)) return; if (cinfo->btype == BT_PICASSO) { /* Klaus' hint for correct access to HDR on some boards */ @@ -2509,7 +2539,7 @@ static void WClut(struct cirrusfb_info *cinfo, unsigned char regnum, unsigned ch if (cinfo->btype == BT_PICASSO || cinfo->btype == BT_PICASSO4 || cinfo->btype == BT_ALPINE || cinfo->btype == BT_GD5480 || - cinfo->btype == BT_LAGUNA) { + is_laguna(cinfo)) { /* but DAC data register IS, at least for Picasso II */ if (cinfo->btype == BT_PICASSO) data += 0xfff; From 8343c89c4f1aac4fced7bb6919b0bdd0c13edcdc Mon Sep 17 00:00:00 2001 From: Krzysztof Helt Date: Tue, 31 Mar 2009 15:25:11 -0700 Subject: [PATCH 427/478] cirrusfb: acceleration improvements - Fix color expansion for 16bpp and 32bpp modes in the cirrusfb_RectFill(). - Make a function with a common blitter code (cirrusfb_set_blitter). - Add fb_sync function to allow a higher layer synchronize with the blitter. - Kill one redundant blitter reset. Signed-off-by: Krzysztof Helt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/video/cirrusfb.c | 188 ++++++++++++++++----------------------- 1 file changed, 76 insertions(+), 112 deletions(-) diff --git a/drivers/video/cirrusfb.c b/drivers/video/cirrusfb.c index cac4e2b0cbae..e4ac2f6d4d97 100644 --- a/drivers/video/cirrusfb.c +++ b/drivers/video/cirrusfb.c @@ -381,7 +381,7 @@ static void cirrusfb_BitBLT(u8 __iomem *regbase, int bits_per_pixel, static void cirrusfb_RectFill(u8 __iomem *regbase, int bits_per_pixel, u_short x, u_short y, u_short width, u_short height, - u_char color, u_short line_length); + u32 color, u_short line_length); static void bestclock(long freq, int *nom, int *den, int *div); @@ -1550,9 +1550,6 @@ static void init_vgachip(struct fb_info *info) /* unlock all extension registers */ vga_wseq(cinfo->regbase, CL_SEQR6, 0x12); - /* reset blitter */ - vga_wgfx(cinfo->regbase, CL_GR31, 0x04); - switch (cinfo->btype) { case BT_GD5480: vga_wseq(cinfo->regbase, CL_SEQRF, 0x98); @@ -1747,6 +1744,17 @@ static void switch_monitor(struct cirrusfb_info *cinfo, int on) /* Linux 2.6-style accelerated functions */ /******************************************/ +static int cirrusfb_sync(struct fb_info *info) +{ + struct cirrusfb_info *cinfo = info->par; + + if (!is_laguna(cinfo)) { + while (vga_rgfx(cinfo->regbase, CL_GR31) & 0x03) + cpu_relax(); + } + return 0; +} + static void cirrusfb_fillrect(struct fb_info *info, const struct fb_fillrect *region) { @@ -1966,6 +1974,7 @@ static struct fb_ops cirrusfb_ops = { .fb_blank = cirrusfb_blank, .fb_fillrect = cirrusfb_fillrect, .fb_copyarea = cirrusfb_copyarea, + .fb_sync = cirrusfb_sync, .fb_imageblit = cirrusfb_imageblit, }; @@ -2586,7 +2595,6 @@ static void RClut(struct cirrusfb_info *cinfo, unsigned char regnum, unsigned ch /* FIXME: use interrupts instead */ static void cirrusfb_WaitBLT(u8 __iomem *regbase) { - /* now busy-wait until we're done */ while (vga_rgfx(regbase, CL_GR31) & 0x08) cpu_relax(); } @@ -2597,58 +2605,12 @@ static void cirrusfb_WaitBLT(u8 __iomem *regbase) perform accelerated "scrolling" ********************************************************************/ -static void cirrusfb_BitBLT(u8 __iomem *regbase, int bits_per_pixel, - u_short curx, u_short cury, - u_short destx, u_short desty, - u_short width, u_short height, - u_short line_length) +static void cirrusfb_set_blitter(u8 __iomem *regbase, + u_short nwidth, u_short nheight, + u_long nsrc, u_long ndest, + u_short bltmode, u_short line_length) + { - u_short nwidth, nheight; - u_long nsrc, ndest; - u_char bltmode; - - nwidth = width - 1; - nheight = height - 1; - - bltmode = 0x00; - /* if source adr < dest addr, do the Blt backwards */ - if (cury <= desty) { - if (cury == desty) { - /* if src and dest are on the same line, check x */ - if (curx < destx) - bltmode |= 0x01; - } else - bltmode |= 0x01; - } - if (!bltmode) { - /* standard case: forward blitting */ - nsrc = (cury * line_length) + curx; - ndest = (desty * line_length) + destx; - } else { - /* this means start addresses are at the end, - * counting backwards - */ - nsrc = cury * line_length + curx + - nheight * line_length + nwidth; - ndest = desty * line_length + destx + - nheight * line_length + nwidth; - } - - /* - run-down of registers to be programmed: - destination pitch - source pitch - BLT width/height - source start - destination start - BLT mode - BLT ROP - VGA_GFX_SR_VALUE / VGA_GFX_SR_ENABLE: "fill color" - start/stop - */ - - cirrusfb_WaitBLT(regbase); - /* pitch: set to line_length */ /* dest pitch low */ vga_wgfx(regbase, CL_GR24, line_length & 0xff); @@ -2694,7 +2656,51 @@ static void cirrusfb_BitBLT(u8 __iomem *regbase, int bits_per_pixel, vga_wgfx(regbase, CL_GR32, 0x0d); /* BLT ROP */ /* and finally: GO! */ - vga_wgfx(regbase, CL_GR31, 0x02); /* BLT Start/status */ + vga_wgfx(regbase, CL_GR31, 0x82); /* BLT Start/status */ +} + +/******************************************************************* + cirrusfb_BitBLT() + + perform accelerated "scrolling" +********************************************************************/ + +static void cirrusfb_BitBLT(u8 __iomem *regbase, int bits_per_pixel, + u_short curx, u_short cury, + u_short destx, u_short desty, + u_short width, u_short height, + u_short line_length) +{ + u_short nwidth = width - 1; + u_short nheight = height - 1; + u_long nsrc, ndest; + u_char bltmode; + + bltmode = 0x00; + /* if source adr < dest addr, do the Blt backwards */ + if (cury <= desty) { + if (cury == desty) { + /* if src and dest are on the same line, check x */ + if (curx < destx) + bltmode |= 0x01; + } else + bltmode |= 0x01; + } + /* standard case: forward blitting */ + nsrc = (cury * line_length) + curx; + ndest = (desty * line_length) + destx; + if (bltmode) { + /* this means start addresses are at the end, + * counting backwards + */ + nsrc += nheight * line_length + nwidth; + ndest += nheight * line_length + nwidth; + } + + cirrusfb_WaitBLT(regbase); + + cirrusfb_set_blitter(regbase, nwidth, nheight, + nsrc, ndest, bltmode, line_length); } /******************************************************************* @@ -2705,75 +2711,33 @@ static void cirrusfb_BitBLT(u8 __iomem *regbase, int bits_per_pixel, static void cirrusfb_RectFill(u8 __iomem *regbase, int bits_per_pixel, u_short x, u_short y, u_short width, u_short height, - u_char color, u_short line_length) + u32 color, u_short line_length) { - u_short nwidth, nheight; - u_long ndest; + u_long ndest = (y * line_length) + x; u_char op; - nwidth = width - 1; - nheight = height - 1; - - ndest = (y * line_length) + x; - cirrusfb_WaitBLT(regbase); - /* pitch: set to line_length */ - vga_wgfx(regbase, CL_GR24, line_length & 0xff); /* dest pitch low */ - vga_wgfx(regbase, CL_GR25, line_length >> 8); /* dest pitch hi */ - vga_wgfx(regbase, CL_GR26, line_length & 0xff); /* source pitch low */ - vga_wgfx(regbase, CL_GR27, line_length >> 8); /* source pitch hi */ - - /* BLT width: actual number of pixels - 1 */ - vga_wgfx(regbase, CL_GR20, nwidth & 0xff); /* BLT width low */ - vga_wgfx(regbase, CL_GR21, nwidth >> 8); /* BLT width hi */ - - /* BLT height: actual number of lines -1 */ - vga_wgfx(regbase, CL_GR22, nheight & 0xff); /* BLT height low */ - vga_wgfx(regbase, CL_GR23, nheight >> 8); /* BLT width hi */ - - /* BLT destination */ - /* BLT dest low */ - vga_wgfx(regbase, CL_GR28, (u_char) (ndest & 0xff)); - /* BLT dest mid */ - vga_wgfx(regbase, CL_GR29, (u_char) (ndest >> 8)); - /* BLT dest hi */ - vga_wgfx(regbase, CL_GR2A, (u_char) (ndest >> 16)); - - /* BLT source: set to 0 (is a dummy here anyway) */ - vga_wgfx(regbase, CL_GR2C, 0x00); /* BLT src low */ - vga_wgfx(regbase, CL_GR2D, 0x00); /* BLT src mid */ - vga_wgfx(regbase, CL_GR2E, 0x00); /* BLT src hi */ - /* This is a ColorExpand Blt, using the */ /* same color for foreground and background */ vga_wgfx(regbase, VGA_GFX_SR_VALUE, color); /* foreground color */ vga_wgfx(regbase, VGA_GFX_SR_ENABLE, color); /* background color */ op = 0xc0; - if (bits_per_pixel == 16) { - vga_wgfx(regbase, CL_GR10, color); /* foreground color */ - vga_wgfx(regbase, CL_GR11, color); /* background color */ - op = 0x50; + if (bits_per_pixel >= 16) { + vga_wgfx(regbase, CL_GR10, color >> 8); /* foreground color */ + vga_wgfx(regbase, CL_GR11, color >> 8); /* background color */ op = 0xd0; - } else if (bits_per_pixel == 32) { - vga_wgfx(regbase, CL_GR10, color); /* foreground color */ - vga_wgfx(regbase, CL_GR11, color); /* background color */ - vga_wgfx(regbase, CL_GR12, color); /* foreground color */ - vga_wgfx(regbase, CL_GR13, color); /* background color */ - vga_wgfx(regbase, CL_GR14, 0); /* foreground color */ - vga_wgfx(regbase, CL_GR15, 0); /* background color */ - op = 0x50; + } + if (bits_per_pixel == 32) { + vga_wgfx(regbase, CL_GR12, color >> 16);/* foreground color */ + vga_wgfx(regbase, CL_GR13, color >> 16);/* background color */ + vga_wgfx(regbase, CL_GR14, color >> 24);/* foreground color */ + vga_wgfx(regbase, CL_GR15, color >> 24);/* background color */ op = 0xf0; } - /* BLT mode: color expand, Enable 8x8 copy (faster?) */ - vga_wgfx(regbase, CL_GR30, op); /* BLT mode */ - - /* BLT ROP: SrcCopy */ - vga_wgfx(regbase, CL_GR32, 0x0d); /* BLT ROP */ - - /* and finally: GO! */ - vga_wgfx(regbase, CL_GR31, 0x02); /* BLT Start/status */ + cirrusfb_set_blitter(regbase, width - 1, height - 1, + 0, ndest, op, line_length); } /************************************************************************** From 9e848062533207130667f6eaa748549367ccbedf Mon Sep 17 00:00:00 2001 From: Krzysztof Helt Date: Tue, 31 Mar 2009 15:25:11 -0700 Subject: [PATCH 428/478] cirrusfb: add imageblit function Add hardware color expansion (imageblit) function. It roughly doubles scrolling speed of my Alpine card (GD5430). Signed-off-by: Krzysztof Helt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/video/cirrusfb.c | 64 +++++++++++++++++++++++++++++----------- 1 file changed, 46 insertions(+), 18 deletions(-) diff --git a/drivers/video/cirrusfb.c b/drivers/video/cirrusfb.c index e4ac2f6d4d97..a2a09b39c59a 100644 --- a/drivers/video/cirrusfb.c +++ b/drivers/video/cirrusfb.c @@ -381,7 +381,8 @@ static void cirrusfb_BitBLT(u8 __iomem *regbase, int bits_per_pixel, static void cirrusfb_RectFill(u8 __iomem *regbase, int bits_per_pixel, u_short x, u_short y, u_short width, u_short height, - u32 color, u_short line_length); + u32 fg_color, u32 bg_color, + u_short line_length, u_char blitmode); static void bestclock(long freq, int *nom, int *den, int *div); @@ -1790,8 +1791,8 @@ static void cirrusfb_fillrect(struct fb_info *info, info->var.bits_per_pixel, (region->dx * m) / 8, region->dy, (region->width * m) / 8, region->height, - color, - info->fix.line_length); + color, color, + info->fix.line_length, 0x40); } static void cirrusfb_copyarea(struct fb_info *info, @@ -1840,9 +1841,33 @@ static void cirrusfb_imageblit(struct fb_info *info, { struct cirrusfb_info *cinfo = info->par; - if (!is_laguna(cinfo)) + if (info->state != FBINFO_STATE_RUNNING) + return; + if (info->flags & FBINFO_HWACCEL_DISABLED) + cfb_imageblit(info, image); + else { + unsigned size = ((image->width + 7) >> 3) * image->height; + int m = info->var.bits_per_pixel; + u32 fg, bg; + + if (info->var.bits_per_pixel == 8) { + fg = image->fg_color; + bg = image->bg_color; + } else { + fg = ((u32 *)(info->pseudo_palette))[image->fg_color]; + bg = ((u32 *)(info->pseudo_palette))[image->bg_color]; + } cirrusfb_WaitBLT(cinfo->regbase); - cfb_imageblit(info, image); + /* byte rounded scanlines */ + vga_wgfx(cinfo->regbase, CL_GR33, 0x00); + cirrusfb_RectFill(cinfo->regbase, + info->var.bits_per_pixel, + (image->dx * m) / 8, image->dy, + (image->width * m) / 8, image->height, + fg, bg, + info->fix.line_length, 0x04); + memcpy(info->screen_base, image->data, size); + } } #ifdef CONFIG_PPC_PREP @@ -1988,10 +2013,12 @@ static int __devinit cirrusfb_set_fbinfo(struct fb_info *info) | FBINFO_HWACCEL_XPAN | FBINFO_HWACCEL_YPAN | FBINFO_HWACCEL_FILLRECT + | FBINFO_HWACCEL_IMAGEBLIT | FBINFO_HWACCEL_COPYAREA; if (noaccel || is_laguna(cinfo)) info->flags |= FBINFO_HWACCEL_DISABLED; info->fbops = &cirrusfb_ops; + if (cinfo->btype == BT_GD5480) { if (var->bits_per_pixel == 16) info->screen_base += 1 * MB_; @@ -2711,7 +2738,8 @@ static void cirrusfb_BitBLT(u8 __iomem *regbase, int bits_per_pixel, static void cirrusfb_RectFill(u8 __iomem *regbase, int bits_per_pixel, u_short x, u_short y, u_short width, u_short height, - u32 color, u_short line_length) + u32 fg_color, u32 bg_color, u_short line_length, + u_char blitmode) { u_long ndest = (y * line_length) + x; u_char op; @@ -2720,24 +2748,24 @@ static void cirrusfb_RectFill(u8 __iomem *regbase, int bits_per_pixel, /* This is a ColorExpand Blt, using the */ /* same color for foreground and background */ - vga_wgfx(regbase, VGA_GFX_SR_VALUE, color); /* foreground color */ - vga_wgfx(regbase, VGA_GFX_SR_ENABLE, color); /* background color */ + vga_wgfx(regbase, VGA_GFX_SR_VALUE, bg_color); + vga_wgfx(regbase, VGA_GFX_SR_ENABLE, fg_color); - op = 0xc0; + op = 0x80; if (bits_per_pixel >= 16) { - vga_wgfx(regbase, CL_GR10, color >> 8); /* foreground color */ - vga_wgfx(regbase, CL_GR11, color >> 8); /* background color */ - op = 0xd0; + vga_wgfx(regbase, CL_GR10, bg_color >> 8); + vga_wgfx(regbase, CL_GR11, fg_color >> 8); + op = 0x90; } if (bits_per_pixel == 32) { - vga_wgfx(regbase, CL_GR12, color >> 16);/* foreground color */ - vga_wgfx(regbase, CL_GR13, color >> 16);/* background color */ - vga_wgfx(regbase, CL_GR14, color >> 24);/* foreground color */ - vga_wgfx(regbase, CL_GR15, color >> 24);/* background color */ - op = 0xf0; + vga_wgfx(regbase, CL_GR12, bg_color >> 16); + vga_wgfx(regbase, CL_GR13, fg_color >> 16); + vga_wgfx(regbase, CL_GR14, bg_color >> 24); + vga_wgfx(regbase, CL_GR15, fg_color >> 24); + op = 0xb0; } cirrusfb_set_blitter(regbase, width - 1, height - 1, - 0, ndest, op, line_length); + 0, ndest, op | blitmode, line_length); } /************************************************************************** From bc5d8ac02f24d68efe8e267c96dd75c0531009ab Mon Sep 17 00:00:00 2001 From: Krzysztof Helt Date: Tue, 31 Mar 2009 15:25:12 -0700 Subject: [PATCH 429/478] cirrusfb: fix error paths in cirrusfb_xxx_register() Balance iomap and iounmap and alloc and free calls in case of error druing device register (probing). Signed-off-by: Krzysztof Helt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/video/cirrusfb.c | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/drivers/video/cirrusfb.c b/drivers/video/cirrusfb.c index a2a09b39c59a..ffc514df2452 100644 --- a/drivers/video/cirrusfb.c +++ b/drivers/video/cirrusfb.c @@ -2090,8 +2090,6 @@ static int __devinit cirrusfb_register(struct fb_info *info) err_dealloc_cmap: fb_dealloc_cmap(&info->cmap); - cinfo->unmap(info); - framebuffer_release(info); return err; } @@ -2328,18 +2326,15 @@ static int __devinit cirrusfb_zorro_register(struct zorro_dev *z, zorro_set_drvdata(z, info); ret = cirrusfb_register(info); - if (ret) { - if (btype == BT_PICASSO4) { - iounmap(info->screen_base); - iounmap(cinfo->regbase - 0x600000); - } else if (board_addr > 0x01000000) - iounmap(info->screen_base); - } - return ret; + if (!ret) + return 0; + + if (btype == BT_PICASSO4 || board_addr > 0x01000000) + iounmap(info->screen_base); err_unmap_regbase: - /* Parental advisory: explicit hack */ - iounmap(cinfo->regbase - 0x600000); + if (btype == BT_PICASSO4) + iounmap(cinfo->regbase - 0x600000); err_release_region: release_region(board_addr, board_size); err_release_fb: From 527410ff7fc5d45fe41523c0ba061113dea22017 Mon Sep 17 00:00:00 2001 From: Krzysztof Helt Date: Tue, 31 Mar 2009 15:25:13 -0700 Subject: [PATCH 430/478] cirrusfb: GD5446 fixes Various fixes to make Cirrus GD5446 chip work. Another Cirrus chip works with the cirrusfb. The gd5446 seems very similar to Alpine chips. Signed-off-by: Krzysztof Helt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/video/cirrusfb.c | 27 +++++++++------------------ 1 file changed, 9 insertions(+), 18 deletions(-) diff --git a/drivers/video/cirrusfb.c b/drivers/video/cirrusfb.c index ffc514df2452..6603273f4ce5 100644 --- a/drivers/video/cirrusfb.c +++ b/drivers/video/cirrusfb.c @@ -198,9 +198,11 @@ static const struct cirrusfb_board_info_rec { .init_sr07 = true, .init_sr1f = false, .scrn_start_bit19 = true, - .sr07 = 0x20, - .sr07_1bpp = 0x20, - .sr07_8bpp = 0x21, + .sr07 = 0xA0, + .sr07_1bpp = 0xA0, + .sr07_1bpp_mux = 0xA6, + .sr07_8bpp = 0xA1, + .sr07_8bpp_mux = 0xA7, .sr1f = 0 }, [BT_ALPINE] = { @@ -213,8 +215,8 @@ static const struct cirrusfb_board_info_rec { .init_sr1f = true, .scrn_start_bit19 = true, .sr07 = 0xA0, - .sr07_1bpp = 0xA1, - .sr07_1bpp_mux = 0xA7, + .sr07_1bpp = 0xA0, + .sr07_1bpp_mux = 0xA6, .sr07_8bpp = 0xA1, .sr07_8bpp_mux = 0xA7, .sr1f = 0x1C @@ -821,7 +823,7 @@ static int cirrusfb_set_par_foo(struct fb_info *info) /* formula: VClk = (OSC * N) / (D * (1+P)) */ /* Example: VClk = (14.31818 * 91) / (23 * (1+1)) = 28.325 MHz */ - if (cinfo->btype == BT_ALPINE) { + if (cinfo->btype == BT_ALPINE || cinfo->btype == BT_PICASSO4) { /* if freq is close to mclk or mclk/2 select mclk * as clock source */ @@ -1044,9 +1046,6 @@ static int cirrusfb_set_par_foo(struct fb_info *info) /* ### INCOMPLETE!! */ vga_wseq(regbase, CL_SEQRF, 0xb8); #endif -/* vga_wseq(regbase, CL_SEQR1F, 0x1c); */ - break; - case BT_ALPINE: /* We already set SRF and SR1F */ break; @@ -1106,10 +1105,6 @@ static int cirrusfb_set_par_foo(struct fb_info *info) break; case BT_PICASSO4: - vga_wseq(regbase, CL_SEQR7, 0x27); -/* vga_wseq(regbase, CL_SEQR1F, 0x1c); */ - break; - case BT_ALPINE: vga_wseq(regbase, CL_SEQR7, 0xa7); break; @@ -1177,10 +1172,6 @@ static int cirrusfb_set_par_foo(struct fb_info *info) break; case BT_PICASSO4: - vga_wseq(regbase, CL_SEQR7, 0x25); -/* vga_wseq(regbase, CL_SEQR1F, 0x1c); */ - break; - case BT_ALPINE: vga_wseq(regbase, CL_SEQR7, 0xa9); break; @@ -2678,7 +2669,7 @@ static void cirrusfb_set_blitter(u8 __iomem *regbase, vga_wgfx(regbase, CL_GR32, 0x0d); /* BLT ROP */ /* and finally: GO! */ - vga_wgfx(regbase, CL_GR31, 0x82); /* BLT Start/status */ + vga_wgfx(regbase, CL_GR31, 0x02); /* BLT Start/status */ } /******************************************************************* From 7cade31cabec33c396b1dfd9c2842e793c2648ef Mon Sep 17 00:00:00 2001 From: Krzysztof Helt Date: Tue, 31 Mar 2009 15:25:13 -0700 Subject: [PATCH 431/478] cirrusfb: use 24bpp instead of 32bpp The 32bpp is supported only on the latest Cirrus Logic chips. Use the 24bpp which is supported at least since Alpine chips (GD543x). Change 32bpp mode setting to 24bpp mode. Change acceleration as well. Signed-off-by: Krzysztof Helt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/video/cirrusfb.c | 79 ++++++++++++++++++++++------------------ 1 file changed, 44 insertions(+), 35 deletions(-) diff --git a/drivers/video/cirrusfb.c b/drivers/video/cirrusfb.c index 6603273f4ce5..65a1831ddd0d 100644 --- a/drivers/video/cirrusfb.c +++ b/drivers/video/cirrusfb.c @@ -95,10 +95,10 @@ /* board types */ enum cirrus_board { BT_NONE = 0, - BT_SD64, - BT_PICCOLO, - BT_PICASSO, - BT_SPECTRUM, + BT_SD64, /* GD5434 */ + BT_PICCOLO, /* GD5426 */ + BT_PICASSO, /* GD5426 or GD5428 */ + BT_SPECTRUM, /* GD5426 or GD5428 */ BT_PICASSO4, /* GD5446 */ BT_ALPINE, /* GD543x/4x */ BT_GD5480, @@ -488,7 +488,7 @@ static int cirrusfb_check_pixclock(const struct fb_var_screeninfo *var, * the VCLK is double the pixel clock. */ switch (var->bits_per_pixel) { case 16: - case 32: + case 24: if (var->xres <= 800) /* Xbh has this type of clock for 32-bit */ freq /= 2; @@ -535,11 +535,11 @@ static int cirrusfb_check_var(struct fb_var_screeninfo *var, var->blue.length = 5; break; - case 32: + case 24: if (isPReP) { - var->red.offset = 8; - var->green.offset = 16; - var->blue.offset = 24; + var->red.offset = 0; + var->green.offset = 8; + var->blue.offset = 16; } else { var->red.offset = 16; var->green.offset = 8; @@ -670,7 +670,7 @@ static int cirrusfb_set_par_foo(struct fb_info *info) break; case 16: - case 32: + case 24: info->fix.line_length = var->xres_virtual * var->bits_per_pixel >> 3; info->fix.visual = FB_VISUAL_TRUECOLOR; @@ -813,6 +813,9 @@ static int cirrusfb_set_par_foo(struct fb_info *info) vga_wcrt(regbase, CL_CRT1A, tmp); freq = PICOS2KHZ(var->pixclock); + if (cinfo->btype == BT_ALPINE && var->bits_per_pixel == 24) + freq *= 3; + bestclock(freq, &nom, &den, &div); dev_dbg(info->device, "VCLK freq: %ld kHz nom: %d den: %d div: %d\n", @@ -1140,16 +1143,16 @@ static int cirrusfb_set_par_foo(struct fb_info *info) /****************************************************** * - * 32 bpp + * 24 bpp * */ - else if (var->bits_per_pixel == 32) { - dev_dbg(info->device, "preparing for 32 bit deep display\n"); + else if (var->bits_per_pixel == 24) { + dev_dbg(info->device, "preparing for 24 bit deep display\n"); switch (cinfo->btype) { case BT_SD64: /* Extended Sequencer Mode: 256c col. mode */ - vga_wseq(regbase, CL_SEQR7, 0xf9); + vga_wseq(regbase, CL_SEQR7, 0xf5); /* MCLK select */ vga_wseq(regbase, CL_SEQR1F, 0x1e); break; @@ -1173,11 +1176,11 @@ static int cirrusfb_set_par_foo(struct fb_info *info) case BT_PICASSO4: case BT_ALPINE: - vga_wseq(regbase, CL_SEQR7, 0xa9); + vga_wseq(regbase, CL_SEQR7, 0xa5); break; case BT_GD5480: - vga_wseq(regbase, CL_SEQR7, 0x19); + vga_wseq(regbase, CL_SEQR7, 0x15); /* We already set SRF and SR1F */ break; @@ -1185,8 +1188,8 @@ static int cirrusfb_set_par_foo(struct fb_info *info) case BT_LAGUNAB: vga_wseq(regbase, CL_SEQR7, vga_rseq(regbase, CL_SEQR7) & ~0x01); - control |= 0x6000; - format |= 0x3400; + control |= 0x4000; + format |= 0x2400; threshold |= 0x20; break; @@ -1385,9 +1388,6 @@ static int cirrusfb_pan_display(struct fb_var_screeninfo *var, if (info->var.bits_per_pixel == 1) vga_wattr(cinfo->regbase, CL_AR33, xpix); - if (!is_laguna(cinfo)) - cirrusfb_WaitBLT(cinfo->regbase); - return 0; } @@ -1492,22 +1492,18 @@ static void init_vgachip(struct fb_info *info) /* disable flickerfixer */ vga_wcrt(cinfo->regbase, CL_CRT51, 0x00); mdelay(100); - /* from Klaus' NetBSD driver: */ - vga_wgfx(cinfo->regbase, CL_GR2F, 0x00); - /* put blitter into 542x compat */ - vga_wgfx(cinfo->regbase, CL_GR33, 0x00); /* mode */ vga_wgfx(cinfo->regbase, CL_GR31, 0x00); - break; - - case BT_GD5480: + case BT_GD5480: /* fall through */ /* from Klaus' NetBSD driver: */ vga_wgfx(cinfo->regbase, CL_GR2F, 0x00); + case BT_ALPINE: /* fall through */ + /* put blitter into 542x compat */ + vga_wgfx(cinfo->regbase, CL_GR33, 0x00); break; case BT_LAGUNA: case BT_LAGUNAB: - case BT_ALPINE: /* Nothing to do to reset the board. */ break; @@ -1831,10 +1827,13 @@ static void cirrusfb_imageblit(struct fb_info *info, const struct fb_image *image) { struct cirrusfb_info *cinfo = info->par; + unsigned char op = (info->var.bits_per_pixel == 24) ? 0xc : 0x4; if (info->state != FBINFO_STATE_RUNNING) return; - if (info->flags & FBINFO_HWACCEL_DISABLED) + /* Alpine acceleration does not work at 24bpp ?!? */ + if (info->flags & FBINFO_HWACCEL_DISABLED || image->depth != 1 || + (cinfo->btype == BT_ALPINE && op == 0xc)) cfb_imageblit(info, image); else { unsigned size = ((image->width + 7) >> 3) * image->height; @@ -1848,15 +1847,22 @@ static void cirrusfb_imageblit(struct fb_info *info, fg = ((u32 *)(info->pseudo_palette))[image->fg_color]; bg = ((u32 *)(info->pseudo_palette))[image->bg_color]; } - cirrusfb_WaitBLT(cinfo->regbase); - /* byte rounded scanlines */ - vga_wgfx(cinfo->regbase, CL_GR33, 0x00); + if (info->var.bits_per_pixel == 24) { + /* clear background first */ + cirrusfb_RectFill(cinfo->regbase, + info->var.bits_per_pixel, + (image->dx * m) / 8, image->dy, + (image->width * m) / 8, + image->height, + bg, bg, + info->fix.line_length, 0x40); + } cirrusfb_RectFill(cinfo->regbase, info->var.bits_per_pixel, (image->dx * m) / 8, image->dy, (image->width * m) / 8, image->height, fg, bg, - info->fix.line_length, 0x04); + info->fix.line_length, op); memcpy(info->screen_base, image->data, size); } } @@ -2743,9 +2749,12 @@ static void cirrusfb_RectFill(u8 __iomem *regbase, int bits_per_pixel, vga_wgfx(regbase, CL_GR11, fg_color >> 8); op = 0x90; } - if (bits_per_pixel == 32) { + if (bits_per_pixel >= 24) { vga_wgfx(regbase, CL_GR12, bg_color >> 16); vga_wgfx(regbase, CL_GR13, fg_color >> 16); + op = 0xa0; + } + if (bits_per_pixel == 32) { vga_wgfx(regbase, CL_GR14, bg_color >> 24); vga_wgfx(regbase, CL_GR15, fg_color >> 24); op = 0xb0; From dd14f71cc62dd07b588cc6de935155e6fd3911c9 Mon Sep 17 00:00:00 2001 From: Krzysztof Helt Date: Tue, 31 Mar 2009 15:25:14 -0700 Subject: [PATCH 432/478] cirrusfb: fix clock doubling Cirrus' Alpine and Picasso4 chips uses DAC clock doubling to achieve full range of pixclock frequencies. [akpm@linux-foundation.org: fix spelling, use usual comment layout] Signed-off-by: Krzysztof Helt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/video/cirrusfb.c | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/drivers/video/cirrusfb.c b/drivers/video/cirrusfb.c index 65a1831ddd0d..15e2e6bfcbff 100644 --- a/drivers/video/cirrusfb.c +++ b/drivers/video/cirrusfb.c @@ -470,10 +470,25 @@ static int cirrusfb_check_pixclock(const struct fb_var_screeninfo *var, /* If the frequency is greater than we can support, we might be able * to use multiplexing for the video mode */ if (freq > maxclock) { + dev_err(info->device, + "Frequency greater than maxclock (%ld kHz)\n", + maxclock); + return -EINVAL; + } + /* + * Additional constraint: 8bpp uses DAC clock doubling to allow maximum + * pixel clock + */ + if (var->bits_per_pixel == 8) { switch (cinfo->btype) { case BT_ALPINE: + case BT_PICASSO4: + if (freq > 85500) + cinfo->multiplexing = 1; + break; case BT_GD5480: - cinfo->multiplexing = 1; + if (freq > 135100) + cinfo->multiplexing = 1; break; default: @@ -815,6 +830,8 @@ static int cirrusfb_set_par_foo(struct fb_info *info) freq = PICOS2KHZ(var->pixclock); if (cinfo->btype == BT_ALPINE && var->bits_per_pixel == 24) freq *= 3; + if (cinfo->multiplexing) + freq /= 2; bestclock(freq, &nom, &den, &div); From 614c0dc93284404be2a4d5750c79bb95f2b6c980 Mon Sep 17 00:00:00 2001 From: Krzysztof Helt Date: Tue, 31 Mar 2009 15:25:15 -0700 Subject: [PATCH 433/478] cirrusfb: add accelerator constant Add an accelerator constant so almost all Cirrus are recognized as accelerators by the fbset command. Signed-off-by: Krzysztof Helt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/video/cirrusfb.c | 11 +++++++++-- include/linux/fb.h | 1 + 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/drivers/video/cirrusfb.c b/drivers/video/cirrusfb.c index 15e2e6bfcbff..e9a2661669eb 100644 --- a/drivers/video/cirrusfb.c +++ b/drivers/video/cirrusfb.c @@ -519,6 +519,7 @@ static int cirrusfb_check_var(struct fb_var_screeninfo *var, int yres; /* memory size in pixels */ unsigned pixels = info->screen_size * 8 / var->bits_per_pixel; + struct cirrusfb_info *cinfo = info->par; switch (var->bits_per_pixel) { case 1: @@ -627,6 +628,9 @@ static int cirrusfb_check_var(struct fb_var_screeninfo *var, if (cirrusfb_check_pixclock(var, info)) return -EINVAL; + if (!is_laguna(cinfo)) + var->accel_flags = FB_ACCELF_TEXT; + return 0; } @@ -2029,8 +2033,12 @@ static int __devinit cirrusfb_set_fbinfo(struct fb_info *info) | FBINFO_HWACCEL_FILLRECT | FBINFO_HWACCEL_IMAGEBLIT | FBINFO_HWACCEL_COPYAREA; - if (noaccel || is_laguna(cinfo)) + if (noaccel || is_laguna(cinfo)) { info->flags |= FBINFO_HWACCEL_DISABLED; + info->fix.accel = FB_ACCEL_NONE; + } else + info->fix.accel = FB_ACCEL_CIRRUS_ALPINE; + info->fbops = &cirrusfb_ops; if (cinfo->btype == BT_GD5480) { @@ -2056,7 +2064,6 @@ static int __devinit cirrusfb_set_fbinfo(struct fb_info *info) /* FIXME: map region at 0xB8000 if available, fill in here */ info->fix.mmio_len = 0; - info->fix.accel = FB_ACCEL_NONE; fb_alloc_cmap(&info->cmap, 256, 0); diff --git a/include/linux/fb.h b/include/linux/fb.h index 31527e17076b..fe7d0d7907ab 100644 --- a/include/linux/fb.h +++ b/include/linux/fb.h @@ -123,6 +123,7 @@ struct dentry; #define FB_ACCEL_TRIDENT_3DIMAGE 51 /* Trident 3DImage */ #define FB_ACCEL_TRIDENT_BLADE3D 52 /* Trident Blade3D */ #define FB_ACCEL_TRIDENT_BLADEXP 53 /* Trident BladeXP */ +#define FB_ACCEL_CIRRUS_ALPINE 53 /* Cirrus Logic 543x/544x/5480 */ #define FB_ACCEL_NEOMAGIC_NM2070 90 /* NeoMagic NM2070 */ #define FB_ACCEL_NEOMAGIC_NM2090 91 /* NeoMagic NM2090 */ #define FB_ACCEL_NEOMAGIC_NM2093 92 /* NeoMagic NM2093 */ From 8f19e15b8ad23e28add5760ed049be2359f39fe8 Mon Sep 17 00:00:00 2001 From: Krzysztof Helt Date: Tue, 31 Mar 2009 15:25:15 -0700 Subject: [PATCH 434/478] cirrusfb: set MCLK in one place A memory clock (MCLK) is set at various places. Move the setting into one place. Set the MCLK only for Zorro cards as the x86 cards should be initialized by BIOS. Improve handling of the GD5434 (SD64). Kill one annoying debug output "virtual offset: ...". Signed-off-by: Krzysztof Helt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/video/cirrusfb.c | 70 +++++++++------------------------------- 1 file changed, 16 insertions(+), 54 deletions(-) diff --git a/drivers/video/cirrusfb.c b/drivers/video/cirrusfb.c index e9a2661669eb..620327436082 100644 --- a/drivers/video/cirrusfb.c +++ b/drivers/video/cirrusfb.c @@ -146,7 +146,7 @@ static const struct cirrusfb_board_info_rec { .sr07 = 0xF0, .sr07_1bpp = 0xF0, .sr07_8bpp = 0xF1, - .sr1f = 0x20 + .sr1f = 0x1E }, [BT_PICCOLO] = { .name = "CL Piccolo", @@ -482,6 +482,7 @@ static int cirrusfb_check_pixclock(const struct fb_var_screeninfo *var, if (var->bits_per_pixel == 8) { switch (cinfo->btype) { case BT_ALPINE: + case BT_SD64: case BT_PICASSO4: if (freq > 85500) cinfo->multiplexing = 1; @@ -492,10 +493,7 @@ static int cirrusfb_check_pixclock(const struct fb_var_screeninfo *var, break; default: - dev_err(info->device, - "Frequency greater than maxclock (%ld kHz)\n", - maxclock); - return -EINVAL; + break; } } #if 0 @@ -847,7 +845,8 @@ static int cirrusfb_set_par_foo(struct fb_info *info) /* formula: VClk = (OSC * N) / (D * (1+P)) */ /* Example: VClk = (14.31818 * 91) / (23 * (1+1)) = 28.325 MHz */ - if (cinfo->btype == BT_ALPINE || cinfo->btype == BT_PICASSO4) { + if (cinfo->btype == BT_ALPINE || cinfo->btype == BT_PICASSO4 || + cinfo->btype == BT_SD64) { /* if freq is close to mclk or mclk/2 select mclk * as clock source */ @@ -966,30 +965,19 @@ static int cirrusfb_set_par_foo(struct fb_info *info) /* Extended Sequencer Mode */ switch (cinfo->btype) { - case BT_SD64: - /* setting the SEQRF on SD64 is not necessary - * (only during init) - */ - /* MCLK select */ - vga_wseq(regbase, CL_SEQR1F, 0x1a); - break; case BT_PICCOLO: case BT_SPECTRUM: - /* ### ueberall 0x22? */ - /* ##vorher 1c MCLK select */ - vga_wseq(regbase, CL_SEQR1F, 0x22); /* evtl d0 bei 1 bit? avoid FIFO underruns..? */ vga_wseq(regbase, CL_SEQRF, 0xb0); break; case BT_PICASSO: - /* ##vorher 22 MCLK select */ - vga_wseq(regbase, CL_SEQR1F, 0x22); /* ## vorher d0 avoid FIFO underruns..? */ vga_wseq(regbase, CL_SEQRF, 0xd0); break; + case BT_SD64: case BT_PICASSO4: case BT_ALPINE: case BT_GD5480: @@ -1051,16 +1039,9 @@ static int cirrusfb_set_par_foo(struct fb_info *info) } switch (cinfo->btype) { - case BT_SD64: - /* MCLK select */ - vga_wseq(regbase, CL_SEQR1F, 0x1d); - break; - case BT_PICCOLO: case BT_PICASSO: case BT_SPECTRUM: - /* ### vorher 1c MCLK select */ - vga_wseq(regbase, CL_SEQR1F, 0x22); /* Fast Page-Mode writes */ vga_wseq(regbase, CL_SEQRF, 0xb0); break; @@ -1074,6 +1055,7 @@ static int cirrusfb_set_par_foo(struct fb_info *info) /* We already set SRF and SR1F */ break; + case BT_SD64: case BT_GD5480: case BT_LAGUNA: case BT_LAGUNAB: @@ -1104,32 +1086,23 @@ static int cirrusfb_set_par_foo(struct fb_info *info) else if (var->bits_per_pixel == 16) { dev_dbg(info->device, "preparing for 16 bit deep display\n"); switch (cinfo->btype) { - case BT_SD64: - /* Extended Sequencer Mode: 256c col. mode */ - vga_wseq(regbase, CL_SEQR7, 0xf7); - /* MCLK select */ - vga_wseq(regbase, CL_SEQR1F, 0x1e); - break; - case BT_PICCOLO: case BT_SPECTRUM: vga_wseq(regbase, CL_SEQR7, 0x87); /* Fast Page-Mode writes */ vga_wseq(regbase, CL_SEQRF, 0xb0); - /* MCLK select */ - vga_wseq(regbase, CL_SEQR1F, 0x22); break; case BT_PICASSO: vga_wseq(regbase, CL_SEQR7, 0x27); /* Fast Page-Mode writes */ vga_wseq(regbase, CL_SEQRF, 0xb0); - /* MCLK select */ - vga_wseq(regbase, CL_SEQR1F, 0x22); break; + case BT_SD64: case BT_PICASSO4: case BT_ALPINE: + /* Extended Sequencer Mode: 256c col. mode */ vga_wseq(regbase, CL_SEQR7, 0xa7); break; @@ -1171,32 +1144,23 @@ static int cirrusfb_set_par_foo(struct fb_info *info) else if (var->bits_per_pixel == 24) { dev_dbg(info->device, "preparing for 24 bit deep display\n"); switch (cinfo->btype) { - case BT_SD64: - /* Extended Sequencer Mode: 256c col. mode */ - vga_wseq(regbase, CL_SEQR7, 0xf5); - /* MCLK select */ - vga_wseq(regbase, CL_SEQR1F, 0x1e); - break; - case BT_PICCOLO: case BT_SPECTRUM: vga_wseq(regbase, CL_SEQR7, 0x85); /* Fast Page-Mode writes */ vga_wseq(regbase, CL_SEQRF, 0xb0); - /* MCLK select */ - vga_wseq(regbase, CL_SEQR1F, 0x22); break; case BT_PICASSO: vga_wseq(regbase, CL_SEQR7, 0x25); /* Fast Page-Mode writes */ vga_wseq(regbase, CL_SEQRF, 0xb0); - /* MCLK select */ - vga_wseq(regbase, CL_SEQR1F, 0x22); break; + case BT_SD64: case BT_PICASSO4: case BT_ALPINE: + /* Extended Sequencer Mode: 256c col. mode */ vga_wseq(regbase, CL_SEQR7, 0xa5); break; @@ -1353,9 +1317,6 @@ static int cirrusfb_pan_display(struct fb_var_screeninfo *var, unsigned char tmp, xpix; struct cirrusfb_info *cinfo = info->par; - dev_dbg(info->device, - "virtual offset: (%d,%d)\n", var->xoffset, var->yoffset); - /* no range checks for xoffset and yoffset, */ /* as fb_pan_display has already done this */ if (var->vmode & FB_VMODE_YWRAP) @@ -1607,10 +1568,6 @@ static void init_vgachip(struct fb_info *info) vga_wseq(cinfo->regbase, CL_SEQR18, 0x02); } - /* MCLK select etc. */ - if (bi->init_sr1f) - vga_wseq(cinfo->regbase, CL_SEQR1F, bi->sr1f); - /* Screen A preset row scan: none */ vga_wcrt(cinfo->regbase, VGA_CRTC_PRESET_ROW, 0x00); /* Text cursor start: disable text cursor */ @@ -2346,6 +2303,11 @@ static int __devinit cirrusfb_zorro_register(struct zorro_dev *z, zorro_set_drvdata(z, info); + /* MCLK select etc. */ + if (cirrusfb_board_info[btype].init_sr1f) + vga_wseq(cinfo->regbase, CL_SEQR1F, + cirrusfb_board_info[btype].sr1f); + ret = cirrusfb_register(info); if (!ret) return 0; From df3aafd57d590d6f3d95310fc3430f3a536d1e59 Mon Sep 17 00:00:00 2001 From: Krzysztof Helt Date: Tue, 31 Mar 2009 15:25:16 -0700 Subject: [PATCH 435/478] cirrusfb: GD5434 (aka SD64) support fixed Fix handling of the Cirrus Logic GD5434 chip. Distinguish this chip from the GD5430. It allows detecting memory size for both models correctly. Signed-off-by: Krzysztof Helt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/video/cirrusfb.c | 72 +++++++++++++++++++++------------------- 1 file changed, 37 insertions(+), 35 deletions(-) diff --git a/drivers/video/cirrusfb.c b/drivers/video/cirrusfb.c index 620327436082..a364e1b0dcb7 100644 --- a/drivers/video/cirrusfb.c +++ b/drivers/video/cirrusfb.c @@ -145,7 +145,9 @@ static const struct cirrusfb_board_info_rec { .scrn_start_bit19 = true, .sr07 = 0xF0, .sr07_1bpp = 0xF0, + .sr07_1bpp_mux = 0xF6, .sr07_8bpp = 0xF1, + .sr07_8bpp_mux = 0xF7, .sr1f = 0x1E }, [BT_PICCOLO] = { @@ -262,8 +264,8 @@ static const struct cirrusfb_board_info_rec { static struct pci_device_id cirrusfb_pci_table[] = { CHIP(PCI_DEVICE_ID_CIRRUS_5436, BT_ALPINE), - CHIP(PCI_DEVICE_ID_CIRRUS_5434_8, BT_ALPINE), - CHIP(PCI_DEVICE_ID_CIRRUS_5434_4, BT_ALPINE), + CHIP(PCI_DEVICE_ID_CIRRUS_5434_8, BT_SD64), + CHIP(PCI_DEVICE_ID_CIRRUS_5434_4, BT_SD64), CHIP(PCI_DEVICE_ID_CIRRUS_5430, BT_ALPINE), /* GD-5440 is same id */ CHIP(PCI_DEVICE_ID_CIRRUS_7543, BT_ALPINE), CHIP(PCI_DEVICE_ID_CIRRUS_7548, BT_ALPINE), @@ -341,6 +343,7 @@ struct cirrusfb_info { unsigned char SFR; /* Shadow of special function register */ int multiplexing; + int doubleVCLK; int blank_mode; u32 pseudo_palette[16]; @@ -496,18 +499,15 @@ static int cirrusfb_check_pixclock(const struct fb_var_screeninfo *var, break; } } -#if 0 - /* TODO: If we have a 1MB 5434, we need to put ourselves in a mode where + + /* If we have a 1MB 5434, we need to put ourselves in a mode where * the VCLK is double the pixel clock. */ - switch (var->bits_per_pixel) { - case 16: - case 24: - if (var->xres <= 800) - /* Xbh has this type of clock for 32-bit */ - freq /= 2; - break; + cinfo->doubleVCLK = 0; + if (cinfo->btype == BT_SD64 && info->fix.smem_len <= MB_ && + var->bits_per_pixel == 16) { + cinfo->doubleVCLK = 1; } -#endif + return 0; } @@ -830,10 +830,13 @@ static int cirrusfb_set_par_foo(struct fb_info *info) vga_wcrt(regbase, CL_CRT1A, tmp); freq = PICOS2KHZ(var->pixclock); - if (cinfo->btype == BT_ALPINE && var->bits_per_pixel == 24) - freq *= 3; + if (var->bits_per_pixel == 24) + if (cinfo->btype == BT_ALPINE || cinfo->btype == BT_SD64) + freq *= 3; if (cinfo->multiplexing) freq /= 2; + if (cinfo->doubleVCLK) + freq *= 2; bestclock(freq, &nom, &den, &div); @@ -851,10 +854,9 @@ static int cirrusfb_set_par_foo(struct fb_info *info) * as clock source */ int divMCLK = cirrusfb_check_mclk(info, freq); - if (divMCLK) { + if (divMCLK) nom = 0; - cirrusfb_set_mclk_as_source(info, divMCLK); - } + cirrusfb_set_mclk_as_source(info, divMCLK); } if (is_laguna(cinfo)) { long pcifc = fb_readl(cinfo->laguna_mmio + 0x3fc); @@ -885,14 +887,13 @@ static int cirrusfb_set_par_foo(struct fb_info *info) (cinfo->btype == BT_GD5480)) tmp |= 0x80; - dev_dbg(info->device, "CL_SEQR1B: %d\n", (int) tmp); /* Laguna chipset has reversed clock registers */ if (is_laguna(cinfo)) { vga_wseq(regbase, CL_SEQRE, tmp); vga_wseq(regbase, CL_SEQR1E, nom); } else { - vga_wseq(regbase, CL_SEQRB, nom); - vga_wseq(regbase, CL_SEQR1B, tmp); + vga_wseq(regbase, CL_SEQRE, nom); + vga_wseq(regbase, CL_SEQR1E, tmp); } } @@ -911,15 +912,13 @@ static int cirrusfb_set_par_foo(struct fb_info *info) else vga_wcrt(regbase, VGA_CRTC_REGS, 0x00); /* interlace control */ - /* adjust horizontal/vertical sync type (low/high) */ + /* adjust horizontal/vertical sync type (low/high), use VCLK3 */ /* enable display memory & CRTC I/O address for color mode */ - tmp = 0x03; + tmp = 0x03 | 0xc; if (var->sync & FB_SYNC_HOR_HIGH_ACT) tmp |= 0x40; if (var->sync & FB_SYNC_VERT_HIGH_ACT) tmp |= 0x80; - if (is_laguna(cinfo)) - tmp |= 0xc; WGen(cinfo, VGA_MIS_W, tmp); /* text cursor on and start line */ @@ -1052,9 +1051,6 @@ static int cirrusfb_set_par_foo(struct fb_info *info) vga_wseq(regbase, CL_SEQRF, 0xb8); #endif case BT_ALPINE: - /* We already set SRF and SR1F */ - break; - case BT_SD64: case BT_GD5480: case BT_LAGUNA: @@ -1103,7 +1099,8 @@ static int cirrusfb_set_par_foo(struct fb_info *info) case BT_PICASSO4: case BT_ALPINE: /* Extended Sequencer Mode: 256c col. mode */ - vga_wseq(regbase, CL_SEQR7, 0xa7); + vga_wseq(regbase, CL_SEQR7, + cinfo->doubleVCLK ? 0xa3 : 0xa7); break; case BT_GD5480: @@ -1128,7 +1125,7 @@ static int cirrusfb_set_par_foo(struct fb_info *info) /* mode register: 256 color mode */ vga_wgfx(regbase, VGA_GFX_MODE, 64); #ifdef CONFIG_PCI - WHDR(cinfo, 0xc1); /* Copy Xbh */ + WHDR(cinfo, cinfo->doubleVCLK ? 0xe1 : 0xc1); #elif defined(CONFIG_ZORRO) /* FIXME: CONFIG_PCI and CONFIG_ZORRO may be defined both */ WHDR(cinfo, 0xa0); /* hidden dac reg: nothing special */ @@ -1529,7 +1526,9 @@ static void init_vgachip(struct fb_info *info) case BT_LAGUNAB: break; case BT_SD64: +#ifdef CONFIG_ZORRO vga_wseq(cinfo->regbase, CL_SEQRF, 0xb8); +#endif break; default: vga_wseq(cinfo->regbase, CL_SEQR16, 0x0f); @@ -1604,7 +1603,8 @@ static void init_vgachip(struct fb_info *info) /* Bit Mask: no mask at all */ vga_wgfx(cinfo->regbase, VGA_GFX_BIT_MASK, 0xff); - if (cinfo->btype == BT_ALPINE || is_laguna(cinfo)) + if (cinfo->btype == BT_ALPINE || cinfo->btype == BT_SD64 || + is_laguna(cinfo)) /* (5434 can't have bit 3 set for bitblt) */ vga_wgfx(cinfo->regbase, CL_GRB, 0x20); else @@ -1809,9 +1809,11 @@ static void cirrusfb_imageblit(struct fb_info *info, if (info->state != FBINFO_STATE_RUNNING) return; - /* Alpine acceleration does not work at 24bpp ?!? */ - if (info->flags & FBINFO_HWACCEL_DISABLED || image->depth != 1 || - (cinfo->btype == BT_ALPINE && op == 0xc)) + /* Alpine/SD64 does not work at 24bpp ??? */ + if (info->flags & FBINFO_HWACCEL_DISABLED || image->depth != 1) + cfb_imageblit(info, image); + else if ((cinfo->btype == BT_ALPINE || cinfo->btype == BT_SD64) && + op == 0xc) cfb_imageblit(info, image); else { unsigned size = ((image->width + 7) >> 3) * image->height; @@ -1895,7 +1897,7 @@ static unsigned int __devinit cirrusfb_get_memsize(struct fb_info *info, /* If DRAM bank switching is enabled, there must be * twice as much memory installed. (4MB on the 5434) */ - if (SRF & 0x80) + if (cinfo->btype != BT_ALPINE && (SRF & 0x80) != 0) mem *= 2; } @@ -2553,7 +2555,7 @@ static void WClut(struct cirrusfb_info *cinfo, unsigned char regnum, unsigned ch if (cinfo->btype == BT_PICASSO || cinfo->btype == BT_PICASSO4 || cinfo->btype == BT_ALPINE || cinfo->btype == BT_GD5480 || - is_laguna(cinfo)) { + cinfo->btype == BT_SD64 || is_laguna(cinfo)) { /* but DAC data register IS, at least for Picasso II */ if (cinfo->btype == BT_PICASSO) data += 0xfff; From 4242a23c9e6b8e2462bb49bf78b76bfdf32158b5 Mon Sep 17 00:00:00 2001 From: Krzysztof Helt Date: Tue, 31 Mar 2009 15:25:17 -0700 Subject: [PATCH 436/478] cirrusfb: fix threshold register mask for Laguna chips Fix threshold register mask for Laguna chips otherwise some 8bpp modes are garbled after selecting a 24bpp mode. Signed-off-by: Krzysztof Helt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/video/cirrusfb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/video/cirrusfb.c b/drivers/video/cirrusfb.c index a364e1b0dcb7..9bb811d56721 100644 --- a/drivers/video/cirrusfb.c +++ b/drivers/video/cirrusfb.c @@ -875,7 +875,7 @@ static int cirrusfb_set_par_foo(struct fb_info *info) threshold = fb_readw(cinfo->laguna_mmio + 0xea); control &= ~0x6800; format = 0; - threshold &= 0xffe0 & 0x3fbf; + threshold &= 0xffc0 & 0x3fbf; } if (nom) { tmp = den << 1; From 8636a9240cc93efa6b36f4cfe6253e0574f832c6 Mon Sep 17 00:00:00 2001 From: Krzysztof Helt Date: Tue, 31 Mar 2009 15:25:17 -0700 Subject: [PATCH 437/478] cirrusfb: fix interlaced modes Fix calculations of timings for interlaced modes. Signed-off-by: Krzysztof Helt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/video/cirrusfb.c | 35 +++++++++++++++++++++-------------- 1 file changed, 21 insertions(+), 14 deletions(-) diff --git a/drivers/video/cirrusfb.c b/drivers/video/cirrusfb.c index 9bb811d56721..d42e385f091c 100644 --- a/drivers/video/cirrusfb.c +++ b/drivers/video/cirrusfb.c @@ -701,45 +701,52 @@ static int cirrusfb_set_par_foo(struct fb_info *info) hsyncstart = var->xres + var->right_margin; hsyncend = hsyncstart + var->hsync_len; - htotal = (hsyncend + var->left_margin) / 8 - 5; - hdispend = var->xres / 8 - 1; - hsyncstart = hsyncstart / 8 + 1; - hsyncend = hsyncend / 8 + 1; + htotal = (hsyncend + var->left_margin) / 8; + hdispend = var->xres / 8; + hsyncstart = hsyncstart / 8; + hsyncend = hsyncend / 8; - yres = var->yres; - vsyncstart = yres + var->lower_margin; + vdispend = var->yres; + vsyncstart = vdispend + var->lower_margin; vsyncend = vsyncstart + var->vsync_len; vtotal = vsyncend + var->upper_margin; - vdispend = yres - 1; if (var->vmode & FB_VMODE_DOUBLE) { - yres *= 2; + vdispend *= 2; vsyncstart *= 2; vsyncend *= 2; vtotal *= 2; } else if (var->vmode & FB_VMODE_INTERLACED) { - yres = (yres + 1) / 2; + vdispend = (vdispend + 1) / 2; vsyncstart = (vsyncstart + 1) / 2; vsyncend = (vsyncend + 1) / 2; vtotal = (vtotal + 1) / 2; } - - vtotal -= 2; - vsyncstart -= 1; - vsyncend -= 1; - + yres = vdispend; if (yres >= 1024) { vtotal /= 2; vsyncstart /= 2; vsyncend /= 2; vdispend /= 2; } + + vdispend -= 1; + vsyncstart -= 1; + vsyncend -= 1; + vtotal -= 2; + if (cinfo->multiplexing) { htotal /= 2; hsyncstart /= 2; hsyncend /= 2; hdispend /= 2; } + + htotal -= 5; + hdispend -= 1; + hsyncstart += 1; + hsyncend += 1; + /* unlock register VGA_CRTC_H_TOTAL..CRT7 */ vga_wcrt(regbase, VGA_CRTC_V_SYNC_END, 0x20); /* previously: 0x00) */ From 66c1ca019078220dc1bf968f2bb18421100ef147 Mon Sep 17 00:00:00 2001 From: Andrea Righi Date: Tue, 31 Mar 2009 15:25:18 -0700 Subject: [PATCH 438/478] fbmem: fix fb_info->lock and mm->mmap_sem circular locking dependency Fix a circular locking dependency in the frame buffer console driver pushing down the mutex fb_info->lock. Circular locking dependecies occur calling the blocking fb_notifier_call_chain() with fb_info->lock held. Notifier callbacks can try to acquire mm->mmap_sem, while fb_mmap() acquires the locks in the reverse order mm->mmap_sem => fb_info->lock. Tested-by: Andrey Borzenkov Signed-off-by: Andrea Righi Cc: Geert Uytterhoeven Cc: Krzysztof Helt Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/video/backlight/backlight.c | 3 ++ drivers/video/backlight/lcd.c | 3 ++ drivers/video/console/fbcon.c | 73 +++++++++++++++++++++++++---- drivers/video/fbmem.c | 11 +---- 4 files changed, 70 insertions(+), 20 deletions(-) diff --git a/drivers/video/backlight/backlight.c b/drivers/video/backlight/backlight.c index 157057c79ca3..dd37cbcaf8ce 100644 --- a/drivers/video/backlight/backlight.c +++ b/drivers/video/backlight/backlight.c @@ -35,6 +35,8 @@ static int fb_notifier_callback(struct notifier_block *self, return 0; bd = container_of(self, struct backlight_device, fb_notif); + if (!lock_fb_info(evdata->info)) + return -ENODEV; mutex_lock(&bd->ops_lock); if (bd->ops) if (!bd->ops->check_fb || @@ -47,6 +49,7 @@ static int fb_notifier_callback(struct notifier_block *self, backlight_update_status(bd); } mutex_unlock(&bd->ops_lock); + unlock_fb_info(evdata->info); return 0; } diff --git a/drivers/video/backlight/lcd.c b/drivers/video/backlight/lcd.c index b6449470106c..0bb13df0fa89 100644 --- a/drivers/video/backlight/lcd.c +++ b/drivers/video/backlight/lcd.c @@ -40,6 +40,8 @@ static int fb_notifier_callback(struct notifier_block *self, if (!ld->ops) return 0; + if (!lock_fb_info(evdata->info)) + return -ENODEV; mutex_lock(&ld->ops_lock); if (!ld->ops->check_fb || ld->ops->check_fb(ld, evdata->info)) { if (event == FB_EVENT_BLANK) { @@ -51,6 +53,7 @@ static int fb_notifier_callback(struct notifier_block *self, } } mutex_unlock(&ld->ops_lock); + unlock_fb_info(evdata->info); return 0; } diff --git a/drivers/video/console/fbcon.c b/drivers/video/console/fbcon.c index 1657b9608b04..2cd500a304f2 100644 --- a/drivers/video/console/fbcon.c +++ b/drivers/video/console/fbcon.c @@ -2954,8 +2954,11 @@ static int fbcon_fb_unbind(int idx) static int fbcon_fb_unregistered(struct fb_info *info) { - int i, idx = info->node; + int i, idx; + if (!lock_fb_info(info)) + return -ENODEV; + idx = info->node; for (i = first_fb_vc; i <= last_fb_vc; i++) { if (con2fb_map[i] == idx) con2fb_map[i] = -1; @@ -2979,13 +2982,14 @@ static int fbcon_fb_unregistered(struct fb_info *info) } } - if (!num_registered_fb) - unregister_con_driver(&fb_con); - - if (primary_device == idx) primary_device = -1; + unlock_fb_info(info); + + if (!num_registered_fb) + unregister_con_driver(&fb_con); + return 0; } @@ -3021,9 +3025,13 @@ static inline void fbcon_select_primary(struct fb_info *info) static int fbcon_fb_registered(struct fb_info *info) { - int ret = 0, i, idx = info->node; + int ret = 0, i, idx; + if (!lock_fb_info(info)) + return -ENODEV; + idx = info->node; fbcon_select_primary(info); + unlock_fb_info(info); if (info_idx == -1) { for (i = first_fb_vc; i <= last_fb_vc; i++) { @@ -3124,7 +3132,7 @@ static void fbcon_get_requirement(struct fb_info *info, } } -static int fbcon_event_notify(struct notifier_block *self, +static int fbcon_event_notify(struct notifier_block *self, unsigned long action, void *data) { struct fb_event *event = data; @@ -3132,7 +3140,7 @@ static int fbcon_event_notify(struct notifier_block *self, struct fb_videomode *mode; struct fb_con2fbmap *con2fb; struct fb_blit_caps *caps; - int ret = 0; + int idx, ret = 0; /* * ignore all events except driver registration and deregistration @@ -3144,23 +3152,54 @@ static int fbcon_event_notify(struct notifier_block *self, switch(action) { case FB_EVENT_SUSPEND: + if (!lock_fb_info(info)) { + ret = -ENODEV; + goto done; + } fbcon_suspended(info); + unlock_fb_info(info); break; case FB_EVENT_RESUME: + if (!lock_fb_info(info)) { + ret = -ENODEV; + goto done; + } fbcon_resumed(info); + unlock_fb_info(info); break; case FB_EVENT_MODE_CHANGE: + if (!lock_fb_info(info)) { + ret = -ENODEV; + goto done; + } fbcon_modechanged(info); + unlock_fb_info(info); break; case FB_EVENT_MODE_CHANGE_ALL: + if (!lock_fb_info(info)) { + ret = -ENODEV; + goto done; + } fbcon_set_all_vcs(info); + unlock_fb_info(info); break; case FB_EVENT_MODE_DELETE: mode = event->data; + if (!lock_fb_info(info)) { + ret = -ENODEV; + goto done; + } ret = fbcon_mode_deleted(info, mode); + unlock_fb_info(info); break; case FB_EVENT_FB_UNBIND: - ret = fbcon_fb_unbind(info->node); + if (!lock_fb_info(info)) { + ret = -ENODEV; + goto done; + } + idx = info->node; + unlock_fb_info(info); + ret = fbcon_fb_unbind(idx); break; case FB_EVENT_FB_REGISTERED: ret = fbcon_fb_registered(info); @@ -3178,17 +3217,31 @@ static int fbcon_event_notify(struct notifier_block *self, con2fb->framebuffer = con2fb_map[con2fb->console - 1]; break; case FB_EVENT_BLANK: + if (!lock_fb_info(info)) { + ret = -ENODEV; + goto done; + } fbcon_fb_blanked(info, *(int *)event->data); + unlock_fb_info(info); break; case FB_EVENT_NEW_MODELIST: + if (!lock_fb_info(info)) { + ret = -ENODEV; + goto done; + } fbcon_new_modelist(info); + unlock_fb_info(info); break; case FB_EVENT_GET_REQ: caps = event->data; + if (!lock_fb_info(info)) { + ret = -ENODEV; + goto done; + } fbcon_get_requirement(info, caps); + unlock_fb_info(info); break; } - done: return ret; } diff --git a/drivers/video/fbmem.c b/drivers/video/fbmem.c index cfd9dce1ce0b..b64f061dd447 100644 --- a/drivers/video/fbmem.c +++ b/drivers/video/fbmem.c @@ -1086,13 +1086,8 @@ static long do_fb_ioctl(struct fb_info *info, unsigned int cmd, return -EINVAL; con2fb.framebuffer = -1; event.data = &con2fb; - - if (!lock_fb_info(info)) - return -ENODEV; event.info = info; fb_notifier_call_chain(FB_EVENT_GET_CONSOLE_MAP, &event); - unlock_fb_info(info); - ret = copy_to_user(argp, &con2fb, sizeof(con2fb)) ? -EFAULT : 0; break; case FBIOPUT_CON2FBMAP: @@ -1109,12 +1104,8 @@ static long do_fb_ioctl(struct fb_info *info, unsigned int cmd, break; } event.data = &con2fb; - if (!lock_fb_info(info)) - return -ENODEV; event.info = info; - ret = fb_notifier_call_chain(FB_EVENT_SET_CONSOLE_MAP, - &event); - unlock_fb_info(info); + ret = fb_notifier_call_chain(FB_EVENT_SET_CONSOLE_MAP, &event); break; case FBIOBLANK: if (!lock_fb_info(info)) From 6a7f2829b5f8be124e168265f176dbbbea8861a0 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 31 Mar 2009 15:25:19 -0700 Subject: [PATCH 439/478] fbdev: uninline lock_fb_info() Before: text data bss dec hex filename 3648 2910 32 6590 19be drivers/video/backlight/backlight.o 3226 2812 32 6070 17b6 drivers/video/backlight/lcd.o 30990 16688 8480 56158 db5e drivers/video/console/fbcon.o 15488 8400 24 23912 5d68 drivers/video/fbmem.o After: text data bss dec hex filename 3537 2870 32 6439 1927 drivers/video/backlight/backlight.o 3131 2772 32 5935 172f drivers/video/backlight/lcd.o 30876 16648 8480 56004 dac4 drivers/video/console/fbcon.o 15506 8400 24 23930 5d7a drivers/video/fbmem.o Cc: Andrea Righi Cc: Geert Uytterhoeven Cc: Krzysztof Helt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/video/fbmem.c | 11 +++++++++++ include/linux/fb.h | 10 +--------- 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/drivers/video/fbmem.c b/drivers/video/fbmem.c index b64f061dd447..2ac32e6b5953 100644 --- a/drivers/video/fbmem.c +++ b/drivers/video/fbmem.c @@ -46,6 +46,17 @@ struct fb_info *registered_fb[FB_MAX] __read_mostly; int num_registered_fb __read_mostly; +int lock_fb_info(struct fb_info *info) +{ + mutex_lock(&info->lock); + if (!info->fbops) { + mutex_unlock(&info->lock); + return 0; + } + return 1; +} +EXPORT_SYMBOL(lock_fb_info); + /* * Helpers */ diff --git a/include/linux/fb.h b/include/linux/fb.h index fe7d0d7907ab..f563c5013932 100644 --- a/include/linux/fb.h +++ b/include/linux/fb.h @@ -961,15 +961,7 @@ extern struct fb_info *registered_fb[FB_MAX]; extern int num_registered_fb; extern struct class *fb_class; -static inline int lock_fb_info(struct fb_info *info) -{ - mutex_lock(&info->lock); - if (!info->fbops) { - mutex_unlock(&info->lock); - return 0; - } - return 1; -} +extern int lock_fb_info(struct fb_info *info); static inline void unlock_fb_info(struct fb_info *info) { From d4bc4e8af0a4a34c713f8c1a33a78cedffe8e0b7 Mon Sep 17 00:00:00 2001 From: Andres Salomon Date: Tue, 31 Mar 2009 15:25:20 -0700 Subject: [PATCH 440/478] drivers/video/sgivwfb.c: fix memory leaks in removal path We were leaking both the cmap memory and the info struct memory. Signed-off-by: Andres Salomon Acked-by: Krzysztof Helt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/video/sgivwfb.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/video/sgivwfb.c b/drivers/video/sgivwfb.c index f5252c2552fd..bba53714a7b1 100644 --- a/drivers/video/sgivwfb.c +++ b/drivers/video/sgivwfb.c @@ -837,6 +837,8 @@ static int sgivwfb_remove(struct platform_device *dev) iounmap(par->regs); iounmap(info->screen_base); release_mem_region(DBE_REG_PHYS, DBE_REG_SIZE); + fb_dealloc_cmap(&info->cmap); + framebuffer_release(info); } return 0; } From 895d72279da7f24f266f9583c239e7b22230127c Mon Sep 17 00:00:00 2001 From: Andres Salomon Date: Tue, 31 Mar 2009 15:25:21 -0700 Subject: [PATCH 441/478] tdfxfb: fix memory leaks in removal path We were leaking the cmap memory. Signed-off-by: Andres Salomon Acked-by: Krzysztof Helt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/video/tdfxfb.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/video/tdfxfb.c b/drivers/video/tdfxfb.c index 14bd3f3680b8..ee64771fbe3d 100644 --- a/drivers/video/tdfxfb.c +++ b/drivers/video/tdfxfb.c @@ -1393,6 +1393,7 @@ static void __devexit tdfxfb_remove(struct pci_dev *pdev) release_mem_region(pci_resource_start(pdev, 0), pci_resource_len(pdev, 0)); pci_set_drvdata(pdev, NULL); + fb_dealloc_cmap(&info->cmap); framebuffer_release(info); } From 07b39b49b402355a7172c113102a8b68aafb17dd Mon Sep 17 00:00:00 2001 From: Andres Salomon Date: Tue, 31 Mar 2009 15:25:22 -0700 Subject: [PATCH 442/478] tridentfb: fix memory leaks in removal path We were leaking the cmap memory. Signed-off-by: Andres Salomon Acked-by: Krzysztof Helt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/video/tridentfb.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/video/tridentfb.c b/drivers/video/tridentfb.c index 479b2e79ad68..c5369ba9485a 100644 --- a/drivers/video/tridentfb.c +++ b/drivers/video/tridentfb.c @@ -1563,6 +1563,7 @@ static void __devexit trident_pci_remove(struct pci_dev *dev) release_mem_region(tridentfb_fix.mmio_start, tridentfb_fix.mmio_len); pci_set_drvdata(dev, NULL); kfree(info->pixmap.addr); + fb_dealloc_cmap(&info->cmap); framebuffer_release(info); } From 5e266e2e0e19532c1b8e2e2bff1eb6ccf42e478a Mon Sep 17 00:00:00 2001 From: Andres Salomon Date: Tue, 31 Mar 2009 15:25:22 -0700 Subject: [PATCH 443/478] vfb: fix memory leaks in removal path We were leaking the cmap memory. Signed-off-by: Andres Salomon Acked-by: Krzysztof Helt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/video/vfb.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/video/vfb.c b/drivers/video/vfb.c index 93fe08d6c78f..cc919ae46571 100644 --- a/drivers/video/vfb.c +++ b/drivers/video/vfb.c @@ -543,6 +543,7 @@ static int vfb_remove(struct platform_device *dev) if (info) { unregister_framebuffer(info); rvfree(videomemory, videomemorysize); + fb_dealloc_cmap(&info->cmap); framebuffer_release(info); } return 0; From 0fd853118dd821de59106c5b9a0a2a6f488bc4b5 Mon Sep 17 00:00:00 2001 From: Andres Salomon Date: Tue, 31 Mar 2009 15:25:23 -0700 Subject: [PATCH 444/478] skeletonfb: check fb_alloc_cmap return value and handle failure properly Bad example code, no cookie! Signed-off-by: Andres Salomon Cc: Krzysztof Helt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/video/skeletonfb.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/video/skeletonfb.c b/drivers/video/skeletonfb.c index df5336561d13..a439159204a8 100644 --- a/drivers/video/skeletonfb.c +++ b/drivers/video/skeletonfb.c @@ -795,8 +795,9 @@ static int __devinit xxxfb_probe(struct pci_dev *dev, if (!retval || retval == 4) return -EINVAL; - /* This has to been done !!! */ - fb_alloc_cmap(&info->cmap, cmap_len, 0); + /* This has to be done! */ + if (fb_alloc_cmap(&info->cmap, cmap_len, 0)) + return -ENOMEM; /* * The following is done in the case of having hardware with a static @@ -820,8 +821,10 @@ static int __devinit xxxfb_probe(struct pci_dev *dev, */ /* xxxfb_set_par(info); */ - if (register_framebuffer(info) < 0) + if (register_framebuffer(info) < 0) { + fb_dealloc_cmap(&info->cmap); return -EINVAL; + } printk(KERN_INFO "fb%d: %s frame buffer device\n", info->node, info->fix.id); pci_set_drvdata(dev, info); /* or platform_set_drvdata(pdev, info) */ From 0a5d924e5954e81a905907512f8c7a1cbf81d700 Mon Sep 17 00:00:00 2001 From: Andres Salomon Date: Tue, 31 Mar 2009 15:25:24 -0700 Subject: [PATCH 445/478] sm501fb: check fb_alloc_cmap return value and handle failure properly Signed-off-by: Andres Salomon Acked-by: Krzysztof Helt Cc: Ben Dooks Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/video/sm501fb.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/video/sm501fb.c b/drivers/video/sm501fb.c index dcd98793d568..eb5d73a06702 100644 --- a/drivers/video/sm501fb.c +++ b/drivers/video/sm501fb.c @@ -1525,7 +1525,10 @@ static int sm501fb_init_fb(struct fb_info *fb, } /* initialise and set the palette */ - fb_alloc_cmap(&fb->cmap, NR_PALETTE, 0); + if (fb_alloc_cmap(&fb->cmap, NR_PALETTE, 0)) { + dev_err(info->dev, "failed to allocate cmap memory\n"); + return -ENOMEM; + } fb_set_cmap(&fb->cmap, fb); ret = (fb->fbops->fb_check_var)(&fb->var, fb); From c23124277e58998703278c26c53b159cea0f9643 Mon Sep 17 00:00:00 2001 From: Andres Salomon Date: Tue, 31 Mar 2009 15:25:25 -0700 Subject: [PATCH 446/478] sstfb: check fb_alloc_cmap return value and handle failure properly Signed-off-by: Andres Salomon Acked-by: Krzysztof Helt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/video/sstfb.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/drivers/video/sstfb.c b/drivers/video/sstfb.c index 5b11a00f49bc..609d0a521ca2 100644 --- a/drivers/video/sstfb.c +++ b/drivers/video/sstfb.c @@ -1421,13 +1421,16 @@ static int __devinit sstfb_probe(struct pci_dev *pdev, goto fail; } - fb_alloc_cmap(&info->cmap, 256, 0); + if (fb_alloc_cmap(&info->cmap, 256, 0)) { + printk(KERN_ERR "sstfb: can't alloc cmap memory.\n"); + goto fail; + } /* register fb */ info->device = &pdev->dev; if (register_framebuffer(info) < 0) { printk(KERN_ERR "sstfb: can't register framebuffer.\n"); - goto fail; + goto fail_register; } sstfb_clear_screen(info); @@ -1441,8 +1444,9 @@ static int __devinit sstfb_probe(struct pci_dev *pdev, return 0; -fail: +fail_register: fb_dealloc_cmap(&info->cmap); +fail: iounmap(info->screen_base); fail_fb_remap: iounmap(par->mmio_vbase); From 175b39fb7e145e1aa06f6369c1fbea16873dee9e Mon Sep 17 00:00:00 2001 From: Andres Salomon Date: Tue, 31 Mar 2009 15:25:26 -0700 Subject: [PATCH 447/478] stifb: check fb_alloc_cmap return value and handle failure properly Signed-off-by: Andres Salomon Acked-by: Krzysztof Helt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/video/stifb.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/drivers/video/stifb.c b/drivers/video/stifb.c index 166481402412..eabaad765aeb 100644 --- a/drivers/video/stifb.c +++ b/drivers/video/stifb.c @@ -1262,24 +1262,25 @@ static int __init stifb_init_fb(struct sti_struct *sti, int bpp_pref) info->flags = FBINFO_DEFAULT; info->pseudo_palette = &fb->pseudo_palette; - /* This has to been done !!! */ - fb_alloc_cmap(&info->cmap, NR_PALETTE, 0); + /* This has to be done !!! */ + if (fb_alloc_cmap(&info->cmap, NR_PALETTE, 0)) + goto out_err1; stifb_init_display(fb); if (!request_mem_region(fix->smem_start, fix->smem_len, "stifb fb")) { printk(KERN_ERR "stifb: cannot reserve fb region 0x%04lx-0x%04lx\n", fix->smem_start, fix->smem_start+fix->smem_len); - goto out_err1; + goto out_err2; } if (!request_mem_region(fix->mmio_start, fix->mmio_len, "stifb mmio")) { printk(KERN_ERR "stifb: cannot reserve sti mmio region 0x%04lx-0x%04lx\n", fix->mmio_start, fix->mmio_start+fix->mmio_len); - goto out_err2; + goto out_err3; } if (register_framebuffer(&fb->info) < 0) - goto out_err3; + goto out_err4; sti->info = info; /* save for unregister_framebuffer() */ @@ -1297,13 +1298,14 @@ static int __init stifb_init_fb(struct sti_struct *sti, int bpp_pref) return 0; -out_err3: +out_err4: release_mem_region(fix->mmio_start, fix->mmio_len); -out_err2: +out_err3: release_mem_region(fix->smem_start, fix->smem_len); +out_err2: + fb_dealloc_cmap(&info->cmap); out_err1: iounmap(info->screen_base); - fb_dealloc_cmap(&info->cmap); out_err0: kfree(fb); return -ENXIO; From ccb121e6958eca5f58938e56523fc589fed36fa8 Mon Sep 17 00:00:00 2001 From: Andres Salomon Date: Tue, 31 Mar 2009 15:25:26 -0700 Subject: [PATCH 448/478] valkyriefb: check fb_alloc_cmap return value and handle failure properly Signed-off-by: Andres Salomon Acked-by: Krzysztof Helt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/video/valkyriefb.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/drivers/video/valkyriefb.c b/drivers/video/valkyriefb.c index 7b0cef9ca8f9..4bb9a0b18950 100644 --- a/drivers/video/valkyriefb.c +++ b/drivers/video/valkyriefb.c @@ -119,7 +119,7 @@ static void set_valkyrie_clock(unsigned char *params); static int valkyrie_var_to_par(struct fb_var_screeninfo *var, struct fb_par_valkyrie *par, const struct fb_info *fb_info); -static void valkyrie_init_info(struct fb_info *info, struct fb_info_valkyrie *p); +static int valkyrie_init_info(struct fb_info *info, struct fb_info_valkyrie *p); static void valkyrie_par_to_fix(struct fb_par_valkyrie *par, struct fb_fix_screeninfo *fix); static void valkyrie_init_fix(struct fb_fix_screeninfo *fix, struct fb_info_valkyrie *p); @@ -381,18 +381,22 @@ int __init valkyriefb_init(void) valkyrie_choose_mode(p); mac_vmode_to_var(default_vmode, default_cmode, &p->info.var); - valkyrie_init_info(&p->info, p); + err = valkyrie_init_info(&p->info, p); + if (err < 0) + goto out_free; valkyrie_init_fix(&p->info.fix, p); if (valkyriefb_set_par(&p->info)) /* "can't happen" */ printk(KERN_ERR "valkyriefb: can't set default video mode\n"); if ((err = register_framebuffer(&p->info)) != 0) - goto out_free; + goto out_cmap_free; printk(KERN_INFO "fb%d: valkyrie frame buffer device\n", p->info.node); return 0; + out_cmap_free: + fb_dealloc_cmap(&p->info.cmap); out_free: if (p->frame_buffer) iounmap(p->frame_buffer); @@ -538,14 +542,15 @@ static void valkyrie_par_to_fix(struct fb_par_valkyrie *par, /* ywrapstep, xpanstep, ypanstep */ } -static void __init valkyrie_init_info(struct fb_info *info, struct fb_info_valkyrie *p) +static int __init valkyrie_init_info(struct fb_info *info, + struct fb_info_valkyrie *p) { info->fbops = &valkyriefb_ops; info->screen_base = p->frame_buffer + 0x1000; info->flags = FBINFO_DEFAULT; info->pseudo_palette = p->pseudo_palette; - fb_alloc_cmap(&info->cmap, 256, 0); info->par = &p->par; + return fb_alloc_cmap(&info->cmap, 256, 0); } From cc880a715782fe31116284d90e0b5bfb1411535b Mon Sep 17 00:00:00 2001 From: Andres Salomon Date: Tue, 31 Mar 2009 15:25:27 -0700 Subject: [PATCH 449/478] sunxvr500: fix cmap memory leaks - fix cmap leak in removal path - fix cmap leak when register_framebuffer fails Signed-off-by: Andres Salomon Cc: "David S. Miller" Cc: Krzysztof Helt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/video/sunxvr500.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/video/sunxvr500.c b/drivers/video/sunxvr500.c index c2ba51b7ea18..18b950706cad 100644 --- a/drivers/video/sunxvr500.c +++ b/drivers/video/sunxvr500.c @@ -349,11 +349,14 @@ static int __devinit e3d_pci_register(struct pci_dev *pdev, if (err < 0) { printk(KERN_ERR "e3d: Could not register framebuffer %s\n", pci_name(pdev)); - goto err_unmap_fb; + goto err_free_cmap; } return 0; +err_free_cmap: + fb_dealloc_cmap(&info->cmap); + err_unmap_fb: iounmap(ep->fb_base); @@ -389,6 +392,7 @@ static void __devexit e3d_pci_unregister(struct pci_dev *pdev) pci_release_region(pdev, 0); pci_release_region(pdev, 1); + fb_dealloc_cmap(&info->cmap); framebuffer_release(info); pci_disable_device(pdev); From 327fc8752a3c08fc7dc7d382883e65aad2f03bde Mon Sep 17 00:00:00 2001 From: Andres Salomon Date: Tue, 31 Mar 2009 15:25:28 -0700 Subject: [PATCH 450/478] tgafb: fix cmap memory leak Fix cmap leak when register_framebuffer fails. Signed-off-by: Andres Salomon Acked-by: Krzysztof Helt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/video/tgafb.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/video/tgafb.c b/drivers/video/tgafb.c index 680642c089c9..a86046ff60ad 100644 --- a/drivers/video/tgafb.c +++ b/drivers/video/tgafb.c @@ -1663,7 +1663,7 @@ tgafb_register(struct device *dev) if (register_framebuffer(info) < 0) { printk(KERN_ERR "tgafb: Could not register framebuffer\n"); ret = -EINVAL; - goto err1; + goto err2; } if (tga_bus_pci) { @@ -1682,6 +1682,8 @@ tgafb_register(struct device *dev) return 0; + err2: + fb_dealloc_cmap(&info->cmap); err1: if (mem_base) iounmap(mem_base); From e98d9b407c248ba1419bed0823488d3cc71a2c31 Mon Sep 17 00:00:00 2001 From: Andres Salomon Date: Tue, 31 Mar 2009 15:25:28 -0700 Subject: [PATCH 451/478] 68328fb: fix cmap memory leaks - fix cmap leak in removal path - fix cmap leak when register_framebuffer fails - check return value of fb_alloc_cmap Signed-off-by: Andres Salomon Acked-by: Krzysztof Helt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/video/68328fb.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/video/68328fb.c b/drivers/video/68328fb.c index 7f907fb23b8a..0b17824b0eb5 100644 --- a/drivers/video/68328fb.c +++ b/drivers/video/68328fb.c @@ -471,9 +471,11 @@ int __init mc68x328fb_init(void) fb_info.pseudo_palette = &mc68x328fb_pseudo_palette; fb_info.flags = FBINFO_DEFAULT | FBINFO_HWACCEL_YPAN; - fb_alloc_cmap(&fb_info.cmap, 256, 0); + if (fb_alloc_cmap(&fb_info.cmap, 256, 0)) + return -ENOMEM; if (register_framebuffer(&fb_info) < 0) { + fb_dealloc_cmap(&fb_info.cmap); return -EINVAL; } @@ -494,6 +496,7 @@ module_init(mc68x328fb_init); static void __exit mc68x328fb_cleanup(void) { unregister_framebuffer(&fb_info); + fb_dealloc_cmap(&fb_info.cmap); } module_exit(mc68x328fb_cleanup); From 909baf0092545e5c2082b045303e7a4b1d2a0522 Mon Sep 17 00:00:00 2001 From: Andres Salomon Date: Tue, 31 Mar 2009 15:25:29 -0700 Subject: [PATCH 452/478] amba-clcd: fix cmap memory leaks - fix cmap leak in removal path - fix cmap leak when register_framebuffer fails Signed-off-by: Andres Salomon Acked-by: Krzysztof Helt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/video/amba-clcd.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/video/amba-clcd.c b/drivers/video/amba-clcd.c index 4e046fed1380..61050ab14128 100644 --- a/drivers/video/amba-clcd.c +++ b/drivers/video/amba-clcd.c @@ -408,7 +408,9 @@ static int clcdfb_register(struct clcd_fb *fb) /* * Allocate colourmap. */ - fb_alloc_cmap(&fb->fb.cmap, 256, 0); + ret = fb_alloc_cmap(&fb->fb.cmap, 256, 0); + if (ret) + goto unmap; /* * Ensure interrupts are disabled. @@ -426,6 +428,8 @@ static int clcdfb_register(struct clcd_fb *fb) printk(KERN_ERR "CLCD: cannot register framebuffer (%d)\n", ret); + fb_dealloc_cmap(&fb->fb.cmap); + unmap: iounmap(fb->regs); free_clk: clk_put(fb->clk); @@ -485,6 +489,8 @@ static int clcdfb_remove(struct amba_device *dev) clcdfb_disable(fb); unregister_framebuffer(&fb->fb); + if (fb->fb.cmap.len) + fb_dealloc_cmap(&fb->fb.cmap); iounmap(fb->regs); clk_put(fb->clk); From eb8972b4407f81b07ea6fc71fd91f9fc7a35a81e Mon Sep 17 00:00:00 2001 From: Andres Salomon Date: Tue, 31 Mar 2009 15:25:30 -0700 Subject: [PATCH 453/478] amifb: check fb_alloc_cmap return value and handle failure properly Signed-off-by: Andres Salomon Acked-by: Krzysztof Helt Cc: Geert Uytterhoeven Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/video/amifb.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/video/amifb.c b/drivers/video/amifb.c index 100f23661465..82bedd7f7789 100644 --- a/drivers/video/amifb.c +++ b/drivers/video/amifb.c @@ -2437,7 +2437,9 @@ default_chipset: goto amifb_error; } - fb_alloc_cmap(&fb_info.cmap, 1< Date: Tue, 31 Mar 2009 15:25:30 -0700 Subject: [PATCH 454/478] fbdev: newport: newport_*wait() return 0 on timeout With a postfix decrement t reaches -1 on timeout which results in a return of 0. Signed-off-by: Roel Kluin Cc: Krzysztof Helt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/video/newport.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/video/newport.h b/include/video/newport.h index 1f5ebeaa818f..001b935e71c4 100644 --- a/include/video/newport.h +++ b/include/video/newport.h @@ -453,7 +453,7 @@ static __inline__ int newport_wait(struct newport_regs *regs) { int t = BUSY_TIMEOUT; - while (t--) + while (--t) if (!(regs->cset.status & NPORT_STAT_GBUSY)) break; return !t; @@ -463,7 +463,7 @@ static __inline__ int newport_bfwait(struct newport_regs *regs) { int t = BUSY_TIMEOUT; - while (t--) + while (--t) if(!(regs->cset.status & NPORT_STAT_BBUSY)) break; return !t; From afbb9d8d5266b4121cb503b4e097f8e65286a077 Mon Sep 17 00:00:00 2001 From: Kristoffer Ericson Date: Tue, 31 Mar 2009 15:25:31 -0700 Subject: [PATCH 455/478] fbdev: update s1d13xxxfb to differ between revisions and production ids The s1d13xxx chip provides two values of identification value: the Production id (e.g 13506/13505/13806..) and a revision number 0,1,2,3). Together these can help us to differentiate between similiar setups. This patch adds the proper way of grabbing both those values and save them for future reference (in order to decide what functions a card supports, e.g acceleration). We also move away from the concept of all s1d13xxx = s1d13806 when we really support alot more. [akpm@linux-foundation.org: coding-style fixes] [akpm@linux-foundation.org: simplify s1d13xxxfb_probe()] Signed-off-by: Kristoffer Ericson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/video/s1d13xxxfb.c | 48 ++++++++++++++++++++++++++++---------- include/video/s1d13xxxfb.h | 16 ++++++++----- 2 files changed, 46 insertions(+), 18 deletions(-) diff --git a/drivers/video/s1d13xxxfb.c b/drivers/video/s1d13xxxfb.c index a7b01d2724b5..0726aecf3b7e 100644 --- a/drivers/video/s1d13xxxfb.c +++ b/drivers/video/s1d13xxxfb.c @@ -50,9 +50,22 @@ #define dbg(fmt, args...) do { } while (0) #endif -static const int __devinitconst s1d13xxxfb_revisions[] = { - S1D13506_CHIP_REV, /* Rev.4 on HP Jornada 7xx S1D13506 */ - S1D13806_CHIP_REV, /* Rev.7 on .. */ +/* + * List of card production ids + */ +static const int s1d13xxxfb_prod_ids[] = { + S1D13505_PROD_ID, + S1D13506_PROD_ID, + S1D13806_PROD_ID, +}; + +/* + * List of card strings + */ +static const char *s1d13xxxfb_prod_names[] = { + "S1D13505", + "S1D13506", + "S1D13806", }; /* @@ -377,7 +390,6 @@ s1d13xxxfb_pan_display(struct fb_var_screeninfo *var, struct fb_info *info) return 0; } - /* framebuffer information structures */ static struct fb_ops s1d13xxxfb_fbops = { @@ -544,7 +556,7 @@ s1d13xxxfb_probe(struct platform_device *pdev) struct s1d13xxxfb_pdata *pdata = NULL; int ret = 0; int i; - u8 revision; + u8 revision, prod_id; dbg("probe called: device is %p\n", pdev); @@ -613,19 +625,31 @@ s1d13xxxfb_probe(struct platform_device *pdev) goto bail; } - revision = s1d13xxxfb_readreg(default_par, S1DREG_REV_CODE) >> 2; - + /* production id is top 6 bits */ + prod_id = s1d13xxxfb_readreg(default_par, S1DREG_REV_CODE) >> 2; + /* revision id is lower 2 bits */ + revision = s1d13xxxfb_readreg(default_par, S1DREG_REV_CODE) & 0x3; ret = -ENODEV; - for (i = 0; i < ARRAY_SIZE(s1d13xxxfb_revisions); i++) { - if (revision == s1d13xxxfb_revisions[i]) + for (i = 0; i < ARRAY_SIZE(s1d13xxxfb_prod_ids); i++) { + if (prod_id == s1d13xxxfb_prod_ids[i]) { + /* looks like we got it in our list */ + default_par->prod_id = prod_id; + default_par->revision = revision; ret = 0; + break; + } } - if (!ret) + if (!ret) { + printk(KERN_INFO PFX "chip production id %i = %s\n", + prod_id, s1d13xxxfb_prod_names[i]); printk(KERN_INFO PFX "chip revision %i\n", revision); - else { - printk(KERN_INFO PFX "unknown chip revision %i\n", revision); + } else { + printk(KERN_INFO PFX + "unknown chip production id %i, revision %i\n", + prod_id, revision); + printk(KERN_INFO PFX "please contant maintainer\n"); goto bail; } diff --git a/include/video/s1d13xxxfb.h b/include/video/s1d13xxxfb.h index fe41b8407946..c3b2a2aa7140 100644 --- a/include/video/s1d13xxxfb.h +++ b/include/video/s1d13xxxfb.h @@ -14,13 +14,16 @@ #define S1D13XXXFB_H #define S1D_PALETTE_SIZE 256 -#define S1D13506_CHIP_REV 4 /* expected chip revision number for s1d13506 */ -#define S1D13806_CHIP_REV 7 /* expected chip revision number for s1d13806 */ -#define S1D_FBID "S1D13806" -#define S1D_DEVICENAME "s1d13806fb" +#define S1D_FBID "S1D13xxx" +#define S1D_DEVICENAME "s1d13xxxfb" + +/* S1DREG_REV_CODE register = prod_id (6 bits) + revision (2 bits) */ +#define S1D13505_PROD_ID 0x3 /* 000011 */ +#define S1D13506_PROD_ID 0x4 /* 000100 */ +#define S1D13806_PROD_ID 0x7 /* 000111 */ /* register definitions (tested on s1d13896) */ -#define S1DREG_REV_CODE 0x0000 /* Revision Code Register */ +#define S1DREG_REV_CODE 0x0000 /* Prod + Rev Code Register */ #define S1DREG_MISC 0x0001 /* Miscellaneous Register */ #define S1DREG_GPIO_CNF0 0x0004 /* General IO Pins Configuration Register 0 */ #define S1DREG_GPIO_CNF1 0x0005 /* General IO Pins Configuration Register 1 */ @@ -141,10 +144,11 @@ struct s1d13xxxfb_regval { u8 value; }; - struct s1d13xxxfb_par { void __iomem *regs; unsigned char display; + unsigned char prod_id; + unsigned char revision; unsigned int pseudo_palette[16]; #ifdef CONFIG_PM From ba78289343226773b27dc25e7d1e739d0162b9e8 Mon Sep 17 00:00:00 2001 From: Roel Kluin Date: Tue, 31 Mar 2009 15:25:32 -0700 Subject: [PATCH 456/478] drivers/video/omap/hwa742.c: div reaches max_clk_div With for(div = 0; div < max_clk_div; div++) { ... } div reaches max_clk_div. Signed-off-by: Roel Kluin Cc: Joe Perches Acked-by: Trilok Soni Cc: Russell King Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/video/omap/hwa742.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/video/omap/hwa742.c b/drivers/video/omap/hwa742.c index f24df0b54e1c..8aa6e47202b9 100644 --- a/drivers/video/omap/hwa742.c +++ b/drivers/video/omap/hwa742.c @@ -742,7 +742,7 @@ static int calc_extif_timings(unsigned long sysclk, int *extif_mem_div) if (calc_reg_timing(sysclk, div) == 0) break; } - if (div > max_clk_div) + if (div >= max_clk_div) goto err; *extif_mem_div = div; @@ -752,7 +752,7 @@ static int calc_extif_timings(unsigned long sysclk, int *extif_mem_div) break; } - if (div > max_clk_div) + if (div >= max_clk_div) goto err; return 0; From 032220ba310204be9cb2ddbbf848020fadc63ce6 Mon Sep 17 00:00:00 2001 From: Andres Salomon Date: Tue, 31 Mar 2009 15:25:33 -0700 Subject: [PATCH 457/478] asiliantfb: fix cmap memory leaks - fix cmap leak in removal path - fix cmap leak when register_framebuffer fails - check return value of fb_alloc_cmap - don't continue with driver setup if register_framebuffer fails [krzysztof.h1@wp.pl: spotted missing iounmap] [randy.dunlap@oracle.com: move data declaration before any code] Signed-off-by: Andres Salomon Cc: Krzysztof Helt Signed-off-by: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/video/asiliantfb.c | 26 +++++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git a/drivers/video/asiliantfb.c b/drivers/video/asiliantfb.c index 1fd22f460b0f..1a1f946d8fef 100644 --- a/drivers/video/asiliantfb.c +++ b/drivers/video/asiliantfb.c @@ -505,19 +505,27 @@ static struct fb_var_screeninfo asiliantfb_var __devinitdata = { .vsync_len = 2, }; -static void __devinit init_asiliant(struct fb_info *p, unsigned long addr) +static int __devinit init_asiliant(struct fb_info *p, unsigned long addr) { + int err; + p->fix = asiliantfb_fix; p->fix.smem_start = addr; p->var = asiliantfb_var; p->fbops = &asiliantfb_ops; p->flags = FBINFO_DEFAULT; - fb_alloc_cmap(&p->cmap, 256, 0); + err = fb_alloc_cmap(&p->cmap, 256, 0); + if (err) { + printk(KERN_ERR "C&T 69000 fb failed to alloc cmap memory\n"); + return err; + } - if (register_framebuffer(p) < 0) { + err = register_framebuffer(p); + if (err < 0) { printk(KERN_ERR "C&T 69000 framebuffer failed to register\n"); - return; + fb_dealloc_cmap(&p->cmap); + return err; } printk(KERN_INFO "fb%d: Asiliant 69000 frame buffer (%dK RAM detected)\n", @@ -532,6 +540,7 @@ asiliantfb_pci_init(struct pci_dev *dp, const struct pci_device_id *ent) { unsigned long addr, size; struct fb_info *p; + int err; if ((dp->resource[0].flags & IORESOURCE_MEM) == 0) return -ENODEV; @@ -560,7 +569,13 @@ asiliantfb_pci_init(struct pci_dev *dp, const struct pci_device_id *ent) pci_write_config_dword(dp, 4, 0x02800083); writeb(3, p->screen_base + 0x400784); - init_asiliant(p, addr); + err = init_asiliant(p, addr); + if (err) { + iounmap(p->screen_base); + release_mem_region(addr, size); + framebuffer_release(p); + return err; + } pci_set_drvdata(dp, p); return 0; @@ -571,6 +586,7 @@ static void __devexit asiliantfb_remove(struct pci_dev *dp) struct fb_info *p = pci_get_drvdata(dp); unregister_framebuffer(p); + fb_dealloc_cmap(&p->cmap); iounmap(p->screen_base); release_mem_region(pci_resource_start(dp, 0), pci_resource_len(dp, 0)); pci_set_drvdata(dp, NULL); From b935257b1f98291ec1c8cbf7dbccbe0b20665bf6 Mon Sep 17 00:00:00 2001 From: Roel Kluin Date: Tue, 31 Mar 2009 15:25:34 -0700 Subject: [PATCH 458/478] arkfb: fix misplaced parentheses Signed-off-by: Roel Kluin Acked-by: Ondrej Zajicek Cc: Krzysztof Helt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/video/arkfb.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/video/arkfb.c b/drivers/video/arkfb.c index 314d18694b6a..d583bea608fd 100644 --- a/drivers/video/arkfb.c +++ b/drivers/video/arkfb.c @@ -470,7 +470,7 @@ static void ark_dac_read_regs(void *data, u8 *code, int count) while (count != 0) { - vga_wseq(NULL, 0x1C, regval | (code[0] & 4) ? 0x80 : 0); + vga_wseq(NULL, 0x1C, regval | (code[0] & 4 ? 0x80 : 0)); code[1] = vga_r(NULL, dac_regs[code[0] & 3]); count--; code += 2; @@ -485,7 +485,7 @@ static void ark_dac_write_regs(void *data, u8 *code, int count) while (count != 0) { - vga_wseq(NULL, 0x1C, regval | (code[0] & 4) ? 0x80 : 0); + vga_wseq(NULL, 0x1C, regval | (code[0] & 4 ? 0x80 : 0)); vga_w(NULL, dac_regs[code[0] & 3], code[1]); count--; code += 2; From 1cc9fb6dbf915e5c7e7e59bb7fab10572ddbb349 Mon Sep 17 00:00:00 2001 From: Roel Kluin Date: Tue, 31 Mar 2009 15:25:35 -0700 Subject: [PATCH 459/478] uvesafb: bitwise OR has higher precedence than ?: Signed-off-by: Roel Kluin Acked-by: Michal Januszewski Cc: Krzysztof Helt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/video/uvesafb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/video/uvesafb.c b/drivers/video/uvesafb.c index 398fd25e0810..c288270e42f7 100644 --- a/drivers/video/uvesafb.c +++ b/drivers/video/uvesafb.c @@ -1552,7 +1552,7 @@ static void __devinit uvesafb_init_info(struct fb_info *info, } info->flags = FBINFO_FLAG_DEFAULT | - (par->ypan) ? FBINFO_HWACCEL_YPAN : 0; + (par->ypan ? FBINFO_HWACCEL_YPAN : 0); if (!par->ypan) info->fbops->fb_pan_display = NULL; From b83734ec0975e1f53420b7a2d454612fc905a9d0 Mon Sep 17 00:00:00 2001 From: Roel Kluin Date: Tue, 31 Mar 2009 15:25:36 -0700 Subject: [PATCH 460/478] vesafb: bitwise OR has higher precedence than ?: Signed-off-by: Roel Kluin Acked-by: Michal Januszewski Cc: Krzysztof Helt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/video/vesafb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/video/vesafb.c b/drivers/video/vesafb.c index e16322d157d0..d6856f43d241 100644 --- a/drivers/video/vesafb.c +++ b/drivers/video/vesafb.c @@ -438,7 +438,7 @@ static int __init vesafb_probe(struct platform_device *dev) info->var = vesafb_defined; info->fix = vesafb_fix; info->flags = FBINFO_FLAG_DEFAULT | - (ypan) ? FBINFO_HWACCEL_YPAN : 0; + (ypan ? FBINFO_HWACCEL_YPAN : 0); if (!ypan) info->fbops->fb_pan_display = NULL; From 2bd8c47597b2522795f5eb2e61c22dcfec5dfa6a Mon Sep 17 00:00:00 2001 From: Roel Kluin Date: Tue, 31 Mar 2009 15:25:36 -0700 Subject: [PATCH 461/478] viafb: returns 0 two too early Otherwise this will already return 0 if iteration MAXLOOP-2 occurs in the first loop. Signed-off-by: Roel Kluin Cc: Joseph Chan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/video/via/accel.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/video/via/accel.c b/drivers/video/via/accel.c index 632523ff1fb7..45c54bfe99bb 100644 --- a/drivers/video/via/accel.c +++ b/drivers/video/via/accel.c @@ -267,13 +267,17 @@ int viafb_wait_engine_idle(void) int loop = 0; while (!(readl(viaparinfo->io_virt + VIA_REG_STATUS) & - VIA_VR_QUEUE_BUSY) && (loop++ < MAXLOOP)) + VIA_VR_QUEUE_BUSY) && (loop < MAXLOOP)) { + loop++; cpu_relax(); + } while ((readl(viaparinfo->io_virt + VIA_REG_STATUS) & (VIA_CMD_RGTR_BUSY | VIA_2D_ENG_BUSY | VIA_3D_ENG_BUSY)) && - (loop++ < MAXLOOP)) + (loop < MAXLOOP)) { + loop++; cpu_relax(); + } return loop >= MAXLOOP; } From 4c8714310afbaabd94ac30db1e499a90e4a69c4e Mon Sep 17 00:00:00 2001 From: Herton Ronaldo Krzesinski Date: Tue, 31 Mar 2009 15:25:37 -0700 Subject: [PATCH 462/478] n411: add missing Makefile entry There is no entry for n411.c to be built, include one in Makefile. Signed-off-by: Herton Ronaldo Krzesinski Cc: Jaya Kumar Cc: Krzysztof Helt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/video/Makefile | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/video/Makefile b/drivers/video/Makefile index bb265eca7d57..77cfa807d169 100644 --- a/drivers/video/Makefile +++ b/drivers/video/Makefile @@ -76,6 +76,7 @@ obj-$(CONFIG_FB_ATARI) += atafb.o c2p_iplan2.o atafb_mfb.o \ atafb_iplan2p2.o atafb_iplan2p4.o atafb_iplan2p8.o obj-$(CONFIG_FB_MAC) += macfb.o obj-$(CONFIG_FB_HECUBA) += hecubafb.o +obj-$(CONFIG_FB_N411) += n411.o obj-$(CONFIG_FB_HGA) += hgafb.o obj-$(CONFIG_FB_XVR500) += sunxvr500.o obj-$(CONFIG_FB_XVR2500) += sunxvr2500.o From ec549a0fdc32171b26677f1ef0b5309faa743362 Mon Sep 17 00:00:00 2001 From: Ben Dooks Date: Tue, 31 Mar 2009 15:25:39 -0700 Subject: [PATCH 463/478] fb: add s3c-fb driver for newer Samsung SoC framebuffer devices Add support for the newer Samsung devices, such as found in the S3C2443, S3C6400 or S3C6410 series SoC. It currently does not support all the alpha- or chroma-key options but it will support more exporting more than one framebuffer ready for adding overlay and blending functions. Signed-off-by: Ben Dooks Cc: Krzysztof Helt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/video/Kconfig | 24 + drivers/video/Makefile | 1 + drivers/video/s3c-fb.c | 1036 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 1061 insertions(+) create mode 100644 drivers/video/s3c-fb.c diff --git a/drivers/video/Kconfig b/drivers/video/Kconfig index 41c27a44bd82..c19f6feb4e53 100644 --- a/drivers/video/Kconfig +++ b/drivers/video/Kconfig @@ -1920,6 +1920,30 @@ config FB_TMIO_ACCELL depends on FB_TMIO default y +config FB_S3C + tristate "Samsung S3C framebuffer support" + depends on FB && ARCH_S3C64XX + select FB_CFB_FILLRECT + select FB_CFB_COPYAREA + select FB_CFB_IMAGEBLIT + ---help--- + Frame buffer driver for the built-in FB controller in the Samsung + SoC line from the S3C2443 onwards, including the S3C2416, S3C2450, + and the S3C64XX series such as the S3C6400 and S3C6410. + + These chips all have the same basic framebuffer design with the + actual capabilities depending on the chip. For instance the S3C6400 + and S3C6410 support 4 hardware windows whereas the S3C24XX series + currently only have two. + + Currently the support is only for the S3C6400 and S3C6410 SoCs. + +config FB_S3C_DEBUG_REGWRITE + bool "Debug register writes" + depends on FB_S3C + ---help--- + Show all register writes via printk(KERN_DEBUG) + config FB_S3C2410 tristate "S3C2410 LCD framebuffer support" depends on FB && ARCH_S3C2410 diff --git a/drivers/video/Makefile b/drivers/video/Makefile index 77cfa807d169..0dbd6c68d76b 100644 --- a/drivers/video/Makefile +++ b/drivers/video/Makefile @@ -111,6 +111,7 @@ obj-$(CONFIG_FB_BROADSHEET) += broadsheetfb.o obj-$(CONFIG_FB_S1D13XXX) += s1d13xxxfb.o obj-$(CONFIG_FB_SH7760) += sh7760fb.o obj-$(CONFIG_FB_IMX) += imxfb.o +obj-$(CONFIG_FB_S3C) += s3c-fb.o obj-$(CONFIG_FB_S3C2410) += s3c2410fb.o obj-$(CONFIG_FB_FSL_DIU) += fsl-diu-fb.o obj-$(CONFIG_FB_COBALT) += cobalt_lcdfb.o diff --git a/drivers/video/s3c-fb.c b/drivers/video/s3c-fb.c new file mode 100644 index 000000000000..5e9c6302433b --- /dev/null +++ b/drivers/video/s3c-fb.c @@ -0,0 +1,1036 @@ +/* linux/drivers/video/s3c-fb.c + * + * Copyright 2008 Openmoko Inc. + * Copyright 2008 Simtec Electronics + * Ben Dooks + * http://armlinux.simtec.co.uk/ + * + * Samsung SoC Framebuffer driver + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. +*/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +/* This driver will export a number of framebuffer interfaces depending + * on the configuration passed in via the platform data. Each fb instance + * maps to a hardware window. Currently there is no support for runtime + * setting of the alpha-blending functions that each window has, so only + * window 0 is actually useful. + * + * Window 0 is treated specially, it is used for the basis of the LCD + * output timings and as the control for the output power-down state. +*/ + +/* note, some of the functions that get called are derived from including + * as they are specific to the architecture that the code + * is being built for. +*/ + +#ifdef CONFIG_FB_S3C_DEBUG_REGWRITE +#undef writel +#define writel(v, r) do { \ + printk(KERN_DEBUG "%s: %08x => %p\n", __func__, (unsigned int)v, r); \ + __raw_writel(v, r); } while(0) +#endif /* FB_S3C_DEBUG_REGWRITE */ + +struct s3c_fb; + +/** + * struct s3c_fb_win - per window private data for each framebuffer. + * @windata: The platform data supplied for the window configuration. + * @parent: The hardware that this window is part of. + * @fbinfo: Pointer pack to the framebuffer info for this window. + * @palette_buffer: Buffer/cache to hold palette entries. + * @pseudo_palette: For use in TRUECOLOUR modes for entries 0..15/ + * @index: The window number of this window. + * @palette: The bitfields for changing r/g/b into a hardware palette entry. + */ +struct s3c_fb_win { + struct s3c_fb_pd_win *windata; + struct s3c_fb *parent; + struct fb_info *fbinfo; + struct s3c_fb_palette palette; + + u32 *palette_buffer; + u32 pseudo_palette[16]; + unsigned int index; +}; + +/** + * struct s3c_fb - overall hardware state of the hardware + * @dev: The device that we bound to, for printing, etc. + * @regs_res: The resource we claimed for the IO registers. + * @bus_clk: The clk (hclk) feeding our interface and possibly pixclk. + * @regs: The mapped hardware registers. + * @enabled: A bitmask of enabled hardware windows. + * @pdata: The platform configuration data passed with the device. + * @windows: The hardware windows that have been claimed. + */ +struct s3c_fb { + struct device *dev; + struct resource *regs_res; + struct clk *bus_clk; + void __iomem *regs; + + unsigned char enabled; + + struct s3c_fb_platdata *pdata; + struct s3c_fb_win *windows[S3C_FB_MAX_WIN]; +}; + +/** + * s3c_fb_win_has_palette() - determine if a mode has a palette + * @win: The window number being queried. + * @bpp: The number of bits per pixel to test. + * + * Work out if the given window supports palletised data at the specified bpp. + */ +static int s3c_fb_win_has_palette(unsigned int win, unsigned int bpp) +{ + return s3c_fb_win_pal_size(win) <= (1 << bpp); +} + +/** + * s3c_fb_check_var() - framebuffer layer request to verify a given mode. + * @var: The screen information to verify. + * @info: The framebuffer device. + * + * Framebuffer layer call to verify the given information and allow us to + * update various information depending on the hardware capabilities. + */ +static int s3c_fb_check_var(struct fb_var_screeninfo *var, + struct fb_info *info) +{ + struct s3c_fb_win *win = info->par; + struct s3c_fb_pd_win *windata = win->windata; + struct s3c_fb *sfb = win->parent; + + dev_dbg(sfb->dev, "checking parameters\n"); + + var->xres_virtual = max((unsigned int)windata->virtual_x, var->xres); + var->yres_virtual = max((unsigned int)windata->virtual_y, var->yres); + + if (!s3c_fb_validate_win_bpp(win->index, var->bits_per_pixel)) { + dev_dbg(sfb->dev, "win %d: unsupported bpp %d\n", + win->index, var->bits_per_pixel); + return -EINVAL; + } + + /* always ensure these are zero, for drop through cases below */ + var->transp.offset = 0; + var->transp.length = 0; + + switch (var->bits_per_pixel) { + case 1: + case 2: + case 4: + case 8: + if (!s3c_fb_win_has_palette(win->index, var->bits_per_pixel)) { + /* non palletised, A:1,R:2,G:3,B:2 mode */ + var->red.offset = 4; + var->green.offset = 2; + var->blue.offset = 0; + var->red.length = 5; + var->green.length = 3; + var->blue.length = 2; + var->transp.offset = 7; + var->transp.length = 1; + } else { + var->red.offset = 0; + var->red.length = var->bits_per_pixel; + var->green = var->red; + var->blue = var->red; + } + break; + + case 19: + /* 666 with one bit alpha/transparency */ + var->transp.offset = 18; + var->transp.length = 1; + case 18: + var->bits_per_pixel = 32; + + /* 666 format */ + var->red.offset = 12; + var->green.offset = 6; + var->blue.offset = 0; + var->red.length = 6; + var->green.length = 6; + var->blue.length = 6; + break; + + case 16: + /* 16 bpp, 565 format */ + var->red.offset = 11; + var->green.offset = 5; + var->blue.offset = 0; + var->red.length = 5; + var->green.length = 6; + var->blue.length = 5; + break; + + case 28: + case 25: + var->transp.length = var->bits_per_pixel - 24; + var->transp.offset = 24; + /* drop through */ + case 24: + /* our 24bpp is unpacked, so 32bpp */ + var->bits_per_pixel = 32; + case 32: + var->red.offset = 16; + var->red.length = 8; + var->green.offset = 8; + var->green.length = 8; + var->blue.offset = 0; + var->blue.length = 8; + break; + + default: + dev_err(sfb->dev, "invalid bpp\n"); + } + + dev_dbg(sfb->dev, "%s: verified parameters\n", __func__); + return 0; +} + +/** + * s3c_fb_calc_pixclk() - calculate the divider to create the pixel clock. + * @sfb: The hardware state. + * @pixclock: The pixel clock wanted, in picoseconds. + * + * Given the specified pixel clock, work out the necessary divider to get + * close to the output frequency. + */ +static int s3c_fb_calc_pixclk(struct s3c_fb *sfb, unsigned int pixclk) +{ + unsigned long clk = clk_get_rate(sfb->bus_clk); + unsigned long long tmp; + unsigned int result; + + tmp = (unsigned long long)clk; + tmp *= pixclk; + + do_div(tmp, 1000000000UL); + result = (unsigned int)tmp / 1000; + + dev_dbg(sfb->dev, "pixclk=%u, clk=%lu, div=%d (%lu)\n", + pixclk, clk, result, clk / result); + + return result; +} + +/** + * s3c_fb_align_word() - align pixel count to word boundary + * @bpp: The number of bits per pixel + * @pix: The value to be aligned. + * + * Align the given pixel count so that it will start on an 32bit word + * boundary. + */ +static int s3c_fb_align_word(unsigned int bpp, unsigned int pix) +{ + int pix_per_word; + + if (bpp > 16) + return pix; + + pix_per_word = (8 * 32) / bpp; + return ALIGN(pix, pix_per_word); +} + +/** + * s3c_fb_set_par() - framebuffer request to set new framebuffer state. + * @info: The framebuffer to change. + * + * Framebuffer layer request to set a new mode for the specified framebuffer + */ +static int s3c_fb_set_par(struct fb_info *info) +{ + struct fb_var_screeninfo *var = &info->var; + struct s3c_fb_win *win = info->par; + struct s3c_fb *sfb = win->parent; + void __iomem *regs = sfb->regs; + int win_no = win->index; + u32 data; + u32 pagewidth; + int clkdiv; + + dev_dbg(sfb->dev, "setting framebuffer parameters\n"); + + switch (var->bits_per_pixel) { + case 32: + case 24: + case 16: + case 12: + info->fix.visual = FB_VISUAL_TRUECOLOR; + break; + case 8: + if (s3c_fb_win_has_palette(win_no, 8)) + info->fix.visual = FB_VISUAL_PSEUDOCOLOR; + else + info->fix.visual = FB_VISUAL_TRUECOLOR; + break; + case 1: + info->fix.visual = FB_VISUAL_MONO01; + break; + default: + info->fix.visual = FB_VISUAL_PSEUDOCOLOR; + break; + } + + info->fix.line_length = (var->xres_virtual * var->bits_per_pixel) / 8; + + /* disable the window whilst we update it */ + writel(0, regs + WINCON(win_no)); + + /* use window 0 as the basis for the lcd output timings */ + + if (win_no == 0) { + clkdiv = s3c_fb_calc_pixclk(sfb, var->pixclock); + + data = sfb->pdata->vidcon0; + data &= ~(VIDCON0_CLKVAL_F_MASK | VIDCON0_CLKDIR); + + if (clkdiv > 1) + data |= VIDCON0_CLKVAL_F(clkdiv-1) | VIDCON0_CLKDIR; + else + data &= ~VIDCON0_CLKDIR; /* 1:1 clock */ + + /* write the timing data to the panel */ + + data |= VIDCON0_ENVID | VIDCON0_ENVID_F; + writel(data, regs + VIDCON0); + + data = VIDTCON0_VBPD(var->upper_margin - 1) | + VIDTCON0_VFPD(var->lower_margin - 1) | + VIDTCON0_VSPW(var->vsync_len - 1); + + writel(data, regs + VIDTCON0); + + data = VIDTCON1_HBPD(var->left_margin - 1) | + VIDTCON1_HFPD(var->right_margin - 1) | + VIDTCON1_HSPW(var->hsync_len - 1); + + writel(data, regs + VIDTCON1); + + data = VIDTCON2_LINEVAL(var->yres - 1) | + VIDTCON2_HOZVAL(var->xres - 1); + writel(data, regs + VIDTCON2); + } + + /* write the buffer address */ + + writel(info->fix.smem_start, regs + VIDW_BUF_START(win_no)); + + data = info->fix.smem_start + info->fix.line_length * var->yres; + writel(data, regs + VIDW_BUF_END(win_no)); + + pagewidth = (var->xres * var->bits_per_pixel) >> 3; + data = VIDW_BUF_SIZE_OFFSET(info->fix.line_length - pagewidth) | + VIDW_BUF_SIZE_PAGEWIDTH(pagewidth); + writel(data, regs + VIDW_BUF_SIZE(win_no)); + + /* write 'OSD' registers to control position of framebuffer */ + + data = VIDOSDxA_TOPLEFT_X(0) | VIDOSDxA_TOPLEFT_Y(0); + writel(data, regs + VIDOSD_A(win_no)); + + data = VIDOSDxB_BOTRIGHT_X(s3c_fb_align_word(var->bits_per_pixel, + var->xres - 1)) | + VIDOSDxB_BOTRIGHT_Y(var->yres - 1); + + writel(data, regs + VIDOSD_B(win_no)); + + data = var->xres * var->yres; + if (s3c_fb_has_osd_d(win_no)) { + writel(data, regs + VIDOSD_D(win_no)); + writel(0, regs + VIDOSD_C(win_no)); + } else + writel(data, regs + VIDOSD_C(win_no)); + + data = WINCONx_ENWIN; + + /* note, since we have to round up the bits-per-pixel, we end up + * relying on the bitfield information for r/g/b/a to work out + * exactly which mode of operation is intended. */ + + switch (var->bits_per_pixel) { + case 1: + data |= WINCON0_BPPMODE_1BPP; + data |= WINCONx_BITSWP; + data |= WINCONx_BURSTLEN_4WORD; + break; + case 2: + data |= WINCON0_BPPMODE_2BPP; + data |= WINCONx_BITSWP; + data |= WINCONx_BURSTLEN_8WORD; + break; + case 4: + data |= WINCON0_BPPMODE_4BPP; + data |= WINCONx_BITSWP; + data |= WINCONx_BURSTLEN_8WORD; + break; + case 8: + if (var->transp.length != 0) + data |= WINCON1_BPPMODE_8BPP_1232; + else + data |= WINCON0_BPPMODE_8BPP_PALETTE; + data |= WINCONx_BURSTLEN_8WORD; + data |= WINCONx_BYTSWP; + break; + case 16: + if (var->transp.length != 0) + data |= WINCON1_BPPMODE_16BPP_A1555; + else + data |= WINCON0_BPPMODE_16BPP_565; + data |= WINCONx_HAWSWP; + data |= WINCONx_BURSTLEN_16WORD; + break; + case 24: + case 32: + if (var->red.length == 6) { + if (var->transp.length != 0) + data |= WINCON1_BPPMODE_19BPP_A1666; + else + data |= WINCON1_BPPMODE_18BPP_666; + } else if (var->transp.length != 0) + data |= WINCON1_BPPMODE_25BPP_A1888; + else + data |= WINCON0_BPPMODE_24BPP_888; + + data |= WINCONx_BURSTLEN_16WORD; + break; + } + + writel(data, regs + WINCON(win_no)); + writel(0x0, regs + WINxMAP(win_no)); + + return 0; +} + +/** + * s3c_fb_update_palette() - set or schedule a palette update. + * @sfb: The hardware information. + * @win: The window being updated. + * @reg: The palette index being changed. + * @value: The computed palette value. + * + * Change the value of a palette register, either by directly writing to + * the palette (this requires the palette RAM to be disconnected from the + * hardware whilst this is in progress) or schedule the update for later. + * + * At the moment, since we have no VSYNC interrupt support, we simply set + * the palette entry directly. + */ +static void s3c_fb_update_palette(struct s3c_fb *sfb, + struct s3c_fb_win *win, + unsigned int reg, + u32 value) +{ + void __iomem *palreg; + u32 palcon; + + palreg = sfb->regs + s3c_fb_pal_reg(win->index, reg); + + dev_dbg(sfb->dev, "%s: win %d, reg %d (%p): %08x\n", + __func__, win->index, reg, palreg, value); + + win->palette_buffer[reg] = value; + + palcon = readl(sfb->regs + WPALCON); + writel(palcon | WPALCON_PAL_UPDATE, sfb->regs + WPALCON); + + if (s3c_fb_pal_is16(win->index)) + writew(value, palreg); + else + writel(value, palreg); + + writel(palcon, sfb->regs + WPALCON); +} + +static inline unsigned int chan_to_field(unsigned int chan, + struct fb_bitfield *bf) +{ + chan &= 0xffff; + chan >>= 16 - bf->length; + return chan << bf->offset; +} + +/** + * s3c_fb_setcolreg() - framebuffer layer request to change palette. + * @regno: The palette index to change. + * @red: The red field for the palette data. + * @green: The green field for the palette data. + * @blue: The blue field for the palette data. + * @trans: The transparency (alpha) field for the palette data. + * @info: The framebuffer being changed. + */ +static int s3c_fb_setcolreg(unsigned regno, + unsigned red, unsigned green, unsigned blue, + unsigned transp, struct fb_info *info) +{ + struct s3c_fb_win *win = info->par; + struct s3c_fb *sfb = win->parent; + unsigned int val; + + dev_dbg(sfb->dev, "%s: win %d: %d => rgb=%d/%d/%d\n", + __func__, win->index, regno, red, green, blue); + + switch (info->fix.visual) { + case FB_VISUAL_TRUECOLOR: + /* true-colour, use pseudo-palette */ + + if (regno < 16) { + u32 *pal = info->pseudo_palette; + + val = chan_to_field(red, &info->var.red); + val |= chan_to_field(green, &info->var.green); + val |= chan_to_field(blue, &info->var.blue); + + pal[regno] = val; + } + break; + + case FB_VISUAL_PSEUDOCOLOR: + if (regno < s3c_fb_win_pal_size(win->index)) { + val = chan_to_field(red, &win->palette.r); + val |= chan_to_field(green, &win->palette.g); + val |= chan_to_field(blue, &win->palette.b); + + s3c_fb_update_palette(sfb, win, regno, val); + } + + break; + + default: + return 1; /* unknown type */ + } + + return 0; +} + +/** + * s3c_fb_enable() - Set the state of the main LCD output + * @sfb: The main framebuffer state. + * @enable: The state to set. + */ +static void s3c_fb_enable(struct s3c_fb *sfb, int enable) +{ + u32 vidcon0 = readl(sfb->regs + VIDCON0); + + if (enable) + vidcon0 |= VIDCON0_ENVID | VIDCON0_ENVID_F; + else { + /* see the note in the framebuffer datasheet about + * why you cannot take both of these bits down at the + * same time. */ + + if (!(vidcon0 & VIDCON0_ENVID)) + return; + + vidcon0 |= VIDCON0_ENVID; + vidcon0 &= ~VIDCON0_ENVID_F; + } + + writel(vidcon0, sfb->regs + VIDCON0); +} + +/** + * s3c_fb_blank() - blank or unblank the given window + * @blank_mode: The blank state from FB_BLANK_* + * @info: The framebuffer to blank. + * + * Framebuffer layer request to change the power state. + */ +static int s3c_fb_blank(int blank_mode, struct fb_info *info) +{ + struct s3c_fb_win *win = info->par; + struct s3c_fb *sfb = win->parent; + unsigned int index = win->index; + u32 wincon; + + dev_dbg(sfb->dev, "blank mode %d\n", blank_mode); + + wincon = readl(sfb->regs + WINCON(index)); + + switch (blank_mode) { + case FB_BLANK_POWERDOWN: + wincon &= ~WINCONx_ENWIN; + sfb->enabled &= ~(1 << index); + /* fall through to FB_BLANK_NORMAL */ + + case FB_BLANK_NORMAL: + /* disable the DMA and display 0x0 (black) */ + writel(WINxMAP_MAP | WINxMAP_MAP_COLOUR(0x0), + sfb->regs + WINxMAP(index)); + break; + + case FB_BLANK_UNBLANK: + writel(0x0, sfb->regs + WINxMAP(index)); + wincon |= WINCONx_ENWIN; + sfb->enabled |= (1 << index); + break; + + case FB_BLANK_VSYNC_SUSPEND: + case FB_BLANK_HSYNC_SUSPEND: + default: + return 1; + } + + writel(wincon, sfb->regs + WINCON(index)); + + /* Check the enabled state to see if we need to be running the + * main LCD interface, as if there are no active windows then + * it is highly likely that we also do not need to output + * anything. + */ + + /* We could do something like the following code, but the current + * system of using framebuffer events means that we cannot make + * the distinction between just window 0 being inactive and all + * the windows being down. + * + * s3c_fb_enable(sfb, sfb->enabled ? 1 : 0); + */ + + /* we're stuck with this until we can do something about overriding + * the power control using the blanking event for a single fb. + */ + if (index == 0) + s3c_fb_enable(sfb, blank_mode != FB_BLANK_POWERDOWN ? 1 : 0); + + return 0; +} + +static struct fb_ops s3c_fb_ops = { + .owner = THIS_MODULE, + .fb_check_var = s3c_fb_check_var, + .fb_set_par = s3c_fb_set_par, + .fb_blank = s3c_fb_blank, + .fb_setcolreg = s3c_fb_setcolreg, + .fb_fillrect = cfb_fillrect, + .fb_copyarea = cfb_copyarea, + .fb_imageblit = cfb_imageblit, +}; + +/** + * s3c_fb_alloc_memory() - allocate display memory for framebuffer window + * @sfb: The base resources for the hardware. + * @win: The window to initialise memory for. + * + * Allocate memory for the given framebuffer. + */ +static int __devinit s3c_fb_alloc_memory(struct s3c_fb *sfb, + struct s3c_fb_win *win) +{ + struct s3c_fb_pd_win *windata = win->windata; + unsigned int real_size, virt_size, size; + struct fb_info *fbi = win->fbinfo; + dma_addr_t map_dma; + + dev_dbg(sfb->dev, "allocating memory for display\n"); + + real_size = windata->win_mode.xres * windata->win_mode.yres; + virt_size = windata->virtual_x * windata->virtual_y; + + dev_dbg(sfb->dev, "real_size=%u (%u.%u), virt_size=%u (%u.%u)\n", + real_size, windata->win_mode.xres, windata->win_mode.yres, + virt_size, windata->virtual_x, windata->virtual_y); + + size = (real_size > virt_size) ? real_size : virt_size; + size *= (windata->max_bpp > 16) ? 32 : windata->max_bpp; + size /= 8; + + fbi->fix.smem_len = size; + size = PAGE_ALIGN(size); + + dev_dbg(sfb->dev, "want %u bytes for window\n", size); + + fbi->screen_base = dma_alloc_writecombine(sfb->dev, size, + &map_dma, GFP_KERNEL); + if (!fbi->screen_base) + return -ENOMEM; + + dev_dbg(sfb->dev, "mapped %x to %p\n", + (unsigned int)map_dma, fbi->screen_base); + + memset(fbi->screen_base, 0x0, size); + fbi->fix.smem_start = map_dma; + + return 0; +} + +/** + * s3c_fb_free_memory() - free the display memory for the given window + * @sfb: The base resources for the hardware. + * @win: The window to free the display memory for. + * + * Free the display memory allocated by s3c_fb_alloc_memory(). + */ +static void s3c_fb_free_memory(struct s3c_fb *sfb, struct s3c_fb_win *win) +{ + struct fb_info *fbi = win->fbinfo; + + dma_free_writecombine(sfb->dev, PAGE_ALIGN(fbi->fix.smem_len), + fbi->screen_base, fbi->fix.smem_start); +} + +/** + * s3c_fb_release_win() - release resources for a framebuffer window. + * @win: The window to cleanup the resources for. + * + * Release the resources that where claimed for the hardware window, + * such as the framebuffer instance and any memory claimed for it. + */ +static void s3c_fb_release_win(struct s3c_fb *sfb, struct s3c_fb_win *win) +{ + fb_dealloc_cmap(&win->fbinfo->cmap); + unregister_framebuffer(win->fbinfo); + s3c_fb_free_memory(sfb, win); +} + +/** + * s3c_fb_probe_win() - register an hardware window + * @sfb: The base resources for the hardware + * @res: Pointer to where to place the resultant window. + * + * Allocate and do the basic initialisation for one of the hardware's graphics + * windows. + */ +static int __devinit s3c_fb_probe_win(struct s3c_fb *sfb, unsigned int win_no, + struct s3c_fb_win **res) +{ + struct fb_var_screeninfo *var; + struct fb_videomode *initmode; + struct s3c_fb_pd_win *windata; + struct s3c_fb_win *win; + struct fb_info *fbinfo; + int palette_size; + int ret; + + dev_dbg(sfb->dev, "probing window %d\n", win_no); + + palette_size = s3c_fb_win_pal_size(win_no); + + fbinfo = framebuffer_alloc(sizeof(struct s3c_fb_win) + + palette_size * sizeof(u32), sfb->dev); + if (!fbinfo) { + dev_err(sfb->dev, "failed to allocate framebuffer\n"); + return -ENOENT; + } + + windata = sfb->pdata->win[win_no]; + initmode = &windata->win_mode; + + WARN_ON(windata->max_bpp == 0); + WARN_ON(windata->win_mode.xres == 0); + WARN_ON(windata->win_mode.yres == 0); + + win = fbinfo->par; + var = &fbinfo->var; + win->fbinfo = fbinfo; + win->parent = sfb; + win->windata = windata; + win->index = win_no; + win->palette_buffer = (u32 *)(win + 1); + + ret = s3c_fb_alloc_memory(sfb, win); + if (ret) { + dev_err(sfb->dev, "failed to allocate display memory\n"); + goto err_framebuffer; + } + + /* setup the r/b/g positions for the window's palette */ + s3c_fb_init_palette(win_no, &win->palette); + + /* setup the initial video mode from the window */ + fb_videomode_to_var(&fbinfo->var, initmode); + + fbinfo->fix.type = FB_TYPE_PACKED_PIXELS; + fbinfo->fix.accel = FB_ACCEL_NONE; + fbinfo->var.activate = FB_ACTIVATE_NOW; + fbinfo->var.vmode = FB_VMODE_NONINTERLACED; + fbinfo->var.bits_per_pixel = windata->default_bpp; + fbinfo->fbops = &s3c_fb_ops; + fbinfo->flags = FBINFO_FLAG_DEFAULT; + fbinfo->pseudo_palette = &win->pseudo_palette; + + /* prepare to actually start the framebuffer */ + + ret = s3c_fb_check_var(&fbinfo->var, fbinfo); + if (ret < 0) { + dev_err(sfb->dev, "check_var failed on initial video params\n"); + goto err_alloc_mem; + } + + /* create initial colour map */ + + ret = fb_alloc_cmap(&fbinfo->cmap, s3c_fb_win_pal_size(win_no), 1); + if (ret == 0) + fb_set_cmap(&fbinfo->cmap, fbinfo); + else + dev_err(sfb->dev, "failed to allocate fb cmap\n"); + + s3c_fb_set_par(fbinfo); + + dev_dbg(sfb->dev, "about to register framebuffer\n"); + + /* run the check_var and set_par on our configuration. */ + + ret = register_framebuffer(fbinfo); + if (ret < 0) { + dev_err(sfb->dev, "failed to register framebuffer\n"); + goto err_alloc_mem; + } + + *res = win; + dev_info(sfb->dev, "window %d: fb %s\n", win_no, fbinfo->fix.id); + + return 0; + +err_alloc_mem: + s3c_fb_free_memory(sfb, win); + +err_framebuffer: + unregister_framebuffer(fbinfo); + return ret; +} + +/** + * s3c_fb_clear_win() - clear hardware window registers. + * @sfb: The base resources for the hardware. + * @win: The window to process. + * + * Reset the specific window registers to a known state. + */ +static void s3c_fb_clear_win(struct s3c_fb *sfb, int win) +{ + void __iomem *regs = sfb->regs; + + writel(0, regs + WINCON(win)); + writel(0xffffff, regs + WxKEYCONy(win, 0)); + writel(0xffffff, regs + WxKEYCONy(win, 1)); + + writel(0, regs + VIDOSD_A(win)); + writel(0, regs + VIDOSD_B(win)); + writel(0, regs + VIDOSD_C(win)); +} + +static int __devinit s3c_fb_probe(struct platform_device *pdev) +{ + struct device *dev = &pdev->dev; + struct s3c_fb_platdata *pd; + struct s3c_fb *sfb; + struct resource *res; + int win; + int ret = 0; + + pd = pdev->dev.platform_data; + if (!pd) { + dev_err(dev, "no platform data specified\n"); + return -EINVAL; + } + + sfb = kzalloc(sizeof(struct s3c_fb), GFP_KERNEL); + if (!sfb) { + dev_err(dev, "no memory for framebuffers\n"); + return -ENOMEM; + } + + sfb->dev = dev; + sfb->pdata = pd; + + sfb->bus_clk = clk_get(dev, "lcd"); + if (IS_ERR(sfb->bus_clk)) { + dev_err(dev, "failed to get bus clock\n"); + goto err_sfb; + } + + clk_enable(sfb->bus_clk); + + res = platform_get_resource(pdev, IORESOURCE_MEM, 0); + if (!res) { + dev_err(dev, "failed to find registers\n"); + ret = -ENOENT; + goto err_clk; + } + + sfb->regs_res = request_mem_region(res->start, resource_size(res), + dev_name(dev)); + if (!sfb->regs_res) { + dev_err(dev, "failed to claim register region\n"); + ret = -ENOENT; + goto err_clk; + } + + sfb->regs = ioremap(res->start, resource_size(res)); + if (!sfb->regs) { + dev_err(dev, "failed to map registers\n"); + ret = -ENXIO; + goto err_req_region; + } + + dev_dbg(dev, "got resources (regs %p), probing windows\n", sfb->regs); + + /* setup gpio and output polarity controls */ + + pd->setup_gpio(); + + writel(pd->vidcon1, sfb->regs + VIDCON1); + + /* zero all windows before we do anything */ + + for (win = 0; win < S3C_FB_MAX_WIN; win++) + s3c_fb_clear_win(sfb, win); + + /* we have the register setup, start allocating framebuffers */ + + for (win = 0; win < S3C_FB_MAX_WIN; win++) { + if (!pd->win[win]) + continue; + + ret = s3c_fb_probe_win(sfb, win, &sfb->windows[win]); + if (ret < 0) { + dev_err(dev, "failed to create window %d\n", win); + for (; win >= 0; win--) + s3c_fb_release_win(sfb, sfb->windows[win]); + goto err_ioremap; + } + } + + platform_set_drvdata(pdev, sfb); + + return 0; + +err_ioremap: + iounmap(sfb->regs); + +err_req_region: + release_resource(sfb->regs_res); + kfree(sfb->regs_res); + +err_clk: + clk_disable(sfb->bus_clk); + clk_put(sfb->bus_clk); + +err_sfb: + kfree(sfb); + return ret; +} + +/** + * s3c_fb_remove() - Cleanup on module finalisation + * @pdev: The platform device we are bound to. + * + * Shutdown and then release all the resources that the driver allocated + * on initialisation. + */ +static int __devexit s3c_fb_remove(struct platform_device *pdev) +{ + struct s3c_fb *sfb = platform_get_drvdata(pdev); + int win; + + for (win = 0; win <= S3C_FB_MAX_WIN; win++) + s3c_fb_release_win(sfb, sfb->windows[win]); + + iounmap(sfb->regs); + + clk_disable(sfb->bus_clk); + clk_put(sfb->bus_clk); + + release_resource(sfb->regs_res); + kfree(sfb->regs_res); + + kfree(sfb); + + return 0; +} + +#ifdef CONFIG_PM +static int s3c_fb_suspend(struct platform_device *pdev, pm_message_t state) +{ + struct s3c_fb *sfb = platform_get_drvdata(pdev); + struct s3c_fb_win *win; + int win_no; + + for (win_no = S3C_FB_MAX_WIN; win_no >= 0; win_no--) { + win = sfb->windows[win_no]; + if (!win) + continue; + + /* use the blank function to push into power-down */ + s3c_fb_blank(FB_BLANK_POWERDOWN, win->fbinfo); + } + + clk_disable(sfb->bus_clk); + return 0; +} + +static int s3c_fb_resume(struct platform_device *pdev) +{ + struct s3c_fb *sfb = platform_get_drvdata(pdev); + struct s3c_fb_win *win; + int win_no; + + clk_enable(sfb->bus_clk); + + for (win_no = 0; win_no < S3C_FB_MAX_WIN; win_no++) { + win = sfb->windows[win_no]; + if (!win) + continue; + + dev_dbg(&pdev->dev, "resuming window %d\n", win_no); + s3c_fb_set_par(win->fbinfo); + } + + return 0; +} +#else +#define s3c_fb_suspend NULL +#define s3c_fb_resume NULL +#endif + +static struct platform_driver s3c_fb_driver = { + .probe = s3c_fb_probe, + .remove = s3c_fb_remove, + .suspend = s3c_fb_suspend, + .resume = s3c_fb_resume, + .driver = { + .name = "s3c-fb", + .owner = THIS_MODULE, + }, +}; + +static int __init s3c_fb_init(void) +{ + return platform_driver_register(&s3c_fb_driver); +} + +static void __exit s3c_fb_cleanup(void) +{ + platform_driver_unregister(&s3c_fb_driver); +} + +module_init(s3c_fb_init); +module_exit(s3c_fb_cleanup); + +MODULE_AUTHOR("Ben Dooks "); +MODULE_DESCRIPTION("Samsung S3C SoC Framebuffer driver"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("platform:s3c-fb"); From ddb53d48da5b0e691f35e703ac29118747f86c99 Mon Sep 17 00:00:00 2001 From: Krzysztof Helt Date: Tue, 31 Mar 2009 15:25:40 -0700 Subject: [PATCH 464/478] fbdev: remove cyblafb driver A tridentfb driver has all the functionality of the cyblafb driver without the bugs of the latter. Changes to the tridentfb driver: - FBINFO_READS_FAST added to the tridentfb. The cyblafb used a blitter for scrolling which is faster than color expansion on Cyberblade chipsets. The blitter is slower on a discrete Blade3D core. Use the blitter for scrolling in the tridentfb only for integrated Blade3D cores. Now, scrolling speed is about equal for the tridentfb and the cyblafb. - a copyright notice addition is done on request of Jani Monoses (the first author of the tridentfb). Tested on AGP Blade3D card and PCChips M787CLR motherboard: VIA C3 cpu + VT8601 north bridge (aka Cyberblade/i1). Signed-off-by: Krzysztof Helt Cc: "Jani Monoses" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/fb/00-INDEX | 2 - Documentation/fb/cyblafb/bugs | 13 - Documentation/fb/cyblafb/credits | 7 - Documentation/fb/cyblafb/documentation | 17 - Documentation/fb/cyblafb/fb.modes | 154 --- Documentation/fb/cyblafb/performance | 79 -- Documentation/fb/cyblafb/todo | 31 - Documentation/fb/cyblafb/usage | 217 --- Documentation/fb/cyblafb/whatsnew | 29 - Documentation/fb/cyblafb/whycyblafb | 85 -- drivers/video/Kconfig | 28 +- drivers/video/cyblafb.c | 1683 ------------------------ drivers/video/tridentfb.c | 6 +- 13 files changed, 7 insertions(+), 2344 deletions(-) delete mode 100644 Documentation/fb/cyblafb/bugs delete mode 100644 Documentation/fb/cyblafb/credits delete mode 100644 Documentation/fb/cyblafb/documentation delete mode 100644 Documentation/fb/cyblafb/fb.modes delete mode 100644 Documentation/fb/cyblafb/performance delete mode 100644 Documentation/fb/cyblafb/todo delete mode 100644 Documentation/fb/cyblafb/usage delete mode 100644 Documentation/fb/cyblafb/whatsnew delete mode 100644 Documentation/fb/cyblafb/whycyblafb delete mode 100644 drivers/video/cyblafb.c diff --git a/Documentation/fb/00-INDEX b/Documentation/fb/00-INDEX index caabbd395e61..a618fd99c9f0 100644 --- a/Documentation/fb/00-INDEX +++ b/Documentation/fb/00-INDEX @@ -11,8 +11,6 @@ aty128fb.txt - info on the ATI Rage128 frame buffer driver. cirrusfb.txt - info on the driver for Cirrus Logic chipsets. -cyblafb/ - - directory with documentation files related to the cyblafb driver. deferred_io.txt - an introduction to deferred IO. fbcon.txt diff --git a/Documentation/fb/cyblafb/bugs b/Documentation/fb/cyblafb/bugs deleted file mode 100644 index 9443a6d72cdd..000000000000 --- a/Documentation/fb/cyblafb/bugs +++ /dev/null @@ -1,13 +0,0 @@ -Bugs -==== - -I currently don't know of any bug. Please do send reports to: - - linux-fbdev-devel@lists.sourceforge.net - - Knut_Petersen@t-online.de. - - -Untested features -================= - -All LCD stuff is untested. If it worked in tridentfb, it should work in -cyblafb. Please test and report the results to Knut_Petersen@t-online.de. diff --git a/Documentation/fb/cyblafb/credits b/Documentation/fb/cyblafb/credits deleted file mode 100644 index 0eb3b443dc2b..000000000000 --- a/Documentation/fb/cyblafb/credits +++ /dev/null @@ -1,7 +0,0 @@ -Thanks to -========= - * Alan Hourihane, for writing the X trident driver - * Jani Monoses, for writing the tridentfb driver - * Antonino A. Daplas, for review of the first published - version of cyblafb and some code - * Jochen Hein, for testing and a helpfull bug report diff --git a/Documentation/fb/cyblafb/documentation b/Documentation/fb/cyblafb/documentation deleted file mode 100644 index bb1aac048425..000000000000 --- a/Documentation/fb/cyblafb/documentation +++ /dev/null @@ -1,17 +0,0 @@ -Available Documentation -======================= - -Apollo PLE 133 Chipset VT8601A North Bridge Datasheet, Rev. 1.82, October 22, -2001, available from VIA: - - http://www.viavpsd.com/product/6/15/DS8601A182.pdf - -The datasheet is incomplete, some registers that need to be programmed are not -explained at all and important bits are listed as "reserved". But you really -need the datasheet to understand the code. "p. xxx" comments refer to page -numbers of this document. - -XFree/XOrg drivers are available and of good quality, looking at the code -there is a good idea if the datasheet does not provide enough information -or if the datasheet seems to be wrong. - diff --git a/Documentation/fb/cyblafb/fb.modes b/Documentation/fb/cyblafb/fb.modes deleted file mode 100644 index fe0e5223ba86..000000000000 --- a/Documentation/fb/cyblafb/fb.modes +++ /dev/null @@ -1,154 +0,0 @@ -# -# Sample fb.modes file -# -# Provides an incomplete list of working modes for -# the cyberblade/i1 graphics core. -# -# The value 4294967256 is used instead of -40. Of course, -40 is not -# a really reasonable value, but chip design does not always follow -# logic. Believe me, it's ok, and it's the way the BIOS does it. -# -# fbset requires 4294967256 in fb.modes and -40 as an argument to -# the -t parameter. That's also not too reasonable, and it might change -# in the future or might even be differt for your current version. -# - -mode "640x480-50" - geometry 640 480 2048 4096 8 - timings 47619 4294967256 24 17 0 216 3 -endmode - -mode "640x480-60" - geometry 640 480 2048 4096 8 - timings 39682 4294967256 24 17 0 216 3 -endmode - -mode "640x480-70" - geometry 640 480 2048 4096 8 - timings 34013 4294967256 24 17 0 216 3 -endmode - -mode "640x480-72" - geometry 640 480 2048 4096 8 - timings 33068 4294967256 24 17 0 216 3 -endmode - -mode "640x480-75" - geometry 640 480 2048 4096 8 - timings 31746 4294967256 24 17 0 216 3 -endmode - -mode "640x480-80" - geometry 640 480 2048 4096 8 - timings 29761 4294967256 24 17 0 216 3 -endmode - -mode "640x480-85" - geometry 640 480 2048 4096 8 - timings 28011 4294967256 24 17 0 216 3 -endmode - -mode "800x600-50" - geometry 800 600 2048 4096 8 - timings 30303 96 24 14 0 136 11 -endmode - -mode "800x600-60" - geometry 800 600 2048 4096 8 - timings 25252 96 24 14 0 136 11 -endmode - -mode "800x600-70" - geometry 800 600 2048 4096 8 - timings 21645 96 24 14 0 136 11 -endmode - -mode "800x600-72" - geometry 800 600 2048 4096 8 - timings 21043 96 24 14 0 136 11 -endmode - -mode "800x600-75" - geometry 800 600 2048 4096 8 - timings 20202 96 24 14 0 136 11 -endmode - -mode "800x600-80" - geometry 800 600 2048 4096 8 - timings 18939 96 24 14 0 136 11 -endmode - -mode "800x600-85" - geometry 800 600 2048 4096 8 - timings 17825 96 24 14 0 136 11 -endmode - -mode "1024x768-50" - geometry 1024 768 2048 4096 8 - timings 19054 144 24 29 0 120 3 -endmode - -mode "1024x768-60" - geometry 1024 768 2048 4096 8 - timings 15880 144 24 29 0 120 3 -endmode - -mode "1024x768-70" - geometry 1024 768 2048 4096 8 - timings 13610 144 24 29 0 120 3 -endmode - -mode "1024x768-72" - geometry 1024 768 2048 4096 8 - timings 13232 144 24 29 0 120 3 -endmode - -mode "1024x768-75" - geometry 1024 768 2048 4096 8 - timings 12703 144 24 29 0 120 3 -endmode - -mode "1024x768-80" - geometry 1024 768 2048 4096 8 - timings 11910 144 24 29 0 120 3 -endmode - -mode "1024x768-85" - geometry 1024 768 2048 4096 8 - timings 11209 144 24 29 0 120 3 -endmode - -mode "1280x1024-50" - geometry 1280 1024 2048 4096 8 - timings 11114 232 16 39 0 160 3 -endmode - -mode "1280x1024-60" - geometry 1280 1024 2048 4096 8 - timings 9262 232 16 39 0 160 3 -endmode - -mode "1280x1024-70" - geometry 1280 1024 2048 4096 8 - timings 7939 232 16 39 0 160 3 -endmode - -mode "1280x1024-72" - geometry 1280 1024 2048 4096 8 - timings 7719 232 16 39 0 160 3 -endmode - -mode "1280x1024-75" - geometry 1280 1024 2048 4096 8 - timings 7410 232 16 39 0 160 3 -endmode - -mode "1280x1024-80" - geometry 1280 1024 2048 4096 8 - timings 6946 232 16 39 0 160 3 -endmode - -mode "1280x1024-85" - geometry 1280 1024 2048 4096 8 - timings 6538 232 16 39 0 160 3 -endmode diff --git a/Documentation/fb/cyblafb/performance b/Documentation/fb/cyblafb/performance deleted file mode 100644 index 8d15d5dfc6b3..000000000000 --- a/Documentation/fb/cyblafb/performance +++ /dev/null @@ -1,79 +0,0 @@ -Speed -===== - -CyBlaFB is much faster than tridentfb and vesafb. Compare the performance data -for mode 1280x1024-[8,16,32]@61 Hz. - -Test 1: Cat a file with 2000 lines of 0 characters. -Test 2: Cat a file with 2000 lines of 80 characters. -Test 3: Cat a file with 2000 lines of 160 characters. - -All values show system time use in seconds, kernel 2.6.12 was used for -the measurements. 2.6.13 is a bit slower, 2.6.14 hopefully will include a -patch that speeds up kernel bitblitting a lot ( > 20%). - -+-----------+-----------------------------------------------------+ -| | not accelerated | -| TRIDENTFB +-----------------+-----------------+-----------------+ -| of 2.6.12 | 8 bpp | 16 bpp | 32 bpp | -| | noypan | ypan | noypan | ypan | noypan | ypan | -+-----------+--------+--------+--------+--------+--------+--------+ -| Test 1 | 4.31 | 4.33 | 6.05 | 12.81 | ---- | ---- | -| Test 2 | 67.94 | 5.44 | 123.16 | 14.79 | ---- | ---- | -| Test 3 | 131.36 | 6.55 | 240.12 | 16.76 | ---- | ---- | -+-----------+--------+--------+--------+--------+--------+--------+ -| Comments | | | completely bro- | -| | | | ken, monitor | -| | | | switches off | -+-----------+-----------------+-----------------+-----------------+ - - -+-----------+-----------------------------------------------------+ -| | accelerated | -| TRIDENTFB +-----------------+-----------------+-----------------+ -| of 2.6.12 | 8 bpp | 16 bpp | 32 bpp | -| | noypan | ypan | noypan | ypan | noypan | ypan | -+-----------+--------+--------+--------+--------+--------+--------+ -| Test 1 | ---- | ---- | 20.62 | 1.22 | ---- | ---- | -| Test 2 | ---- | ---- | 22.61 | 3.19 | ---- | ---- | -| Test 3 | ---- | ---- | 24.59 | 5.16 | ---- | ---- | -+-----------+--------+--------+--------+--------+--------+--------+ -| Comments | broken, writing | broken, ok only | completely bro- | -| | to wrong places | if bgcolor is | ken, monitor | -| | on screen + bug | black, bug in | switches off | -| | in fillrect() | fillrect() | | -+-----------+-----------------+-----------------+-----------------+ - - -+-----------+-----------------------------------------------------+ -| | not accelerated | -| VESAFB +-----------------+-----------------+-----------------+ -| of 2.6.12 | 8 bpp | 16 bpp | 32 bpp | -| | noypan | ypan | noypan | ypan | noypan | ypan | -+-----------+--------+--------+--------+--------+--------+--------+ -| Test 1 | 4.26 | 3.76 | 5.99 | 7.23 | ---- | ---- | -| Test 2 | 65.65 | 4.89 | 120.88 | 9.08 | ---- | ---- | -| Test 3 | 126.91 | 5.94 | 235.77 | 11.03 | ---- | ---- | -+-----------+--------+--------+--------+--------+--------+--------+ -| Comments | vga=0x307 | vga=0x31a | vga=0x31b not | -| | fh=80kHz | fh=80kHz | supported by | -| | fv=75kHz | fv=75kHz | video BIOS and | -| | | | hardware | -+-----------+-----------------+-----------------+-----------------+ - - -+-----------+-----------------------------------------------------+ -| | accelerated | -| CYBLAFB +-----------------+-----------------+-----------------+ -| | 8 bpp | 16 bpp | 32 bpp | -| | noypan | ypan | noypan | ypan | noypan | ypan | -+-----------+--------+--------+--------+--------+--------+--------+ -| Test 1 | 8.02 | 0.23 | 19.04 | 0.61 | 57.12 | 2.74 | -| Test 2 | 8.38 | 0.55 | 19.39 | 0.92 | 57.54 | 3.13 | -| Test 3 | 8.73 | 0.86 | 19.74 | 1.24 | 57.95 | 3.51 | -+-----------+--------+--------+--------+--------+--------+--------+ -| Comments | | | | -| | | | | -| | | | | -| | | | | -+-----------+-----------------+-----------------+-----------------+ diff --git a/Documentation/fb/cyblafb/todo b/Documentation/fb/cyblafb/todo deleted file mode 100644 index c5f6d0eae545..000000000000 --- a/Documentation/fb/cyblafb/todo +++ /dev/null @@ -1,31 +0,0 @@ -TODO / Missing features -======================= - -Verify LCD stuff "stretch" and "center" options are - completely untested ... this code needs to be - verified. As I don't have access to such - hardware, please contact me if you are - willing run some tests. - -Interlaced video modes The reason that interleaved - modes are disabled is that I do not know - the meaning of the vertical interlace - parameter. Also the datasheet mentions a - bit d8 of a horizontal interlace parameter, - but nowhere the lower 8 bits. Please help - if you can. - -low-res double scan modes Who needs it? - -accelerated color blitting Who needs it? The console driver does use color - blitting for nothing but drawing the penguine, - everything else is done using color expanding - blitting of 1bpp character bitmaps. - -ioctls Who needs it? - -TV-out Will be done later. Use "vga= " at boot time - to set a suitable video mode. - -??? Feel free to contact me if you have any - feature requests diff --git a/Documentation/fb/cyblafb/usage b/Documentation/fb/cyblafb/usage deleted file mode 100644 index a39bb3d402a2..000000000000 --- a/Documentation/fb/cyblafb/usage +++ /dev/null @@ -1,217 +0,0 @@ -CyBlaFB is a framebuffer driver for the Cyberblade/i1 graphics core integrated -into the VIA Apollo PLE133 (aka vt8601) south bridge. It is developed and -tested using a VIA EPIA 5000 board. - -Cyblafb - compiled into the kernel or as a module? -================================================== - -You might compile cyblafb either as a module or compile it permanently into the -kernel. - -Unless you have a real reason to do so you should not compile both vesafb and -cyblafb permanently into the kernel. It's possible and it helps during the -developement cycle, but it's useless and will at least block some otherwise -usefull memory for ordinary users. - -Selecting Modes -=============== - - Startup Mode - ============ - - First of all, you might use the "vga=???" boot parameter as it is - documented in vesafb.txt and svga.txt. Cyblafb will detect the video - mode selected and will use the geometry and timings found by - inspecting the hardware registers. - - video=cyblafb vga=0x317 - - Alternatively you might use a combination of the mode, ref and bpp - parameters. If you compiled the driver into the kernel, add something - like this to the kernel command line: - - video=cyblafb:1280x1024,bpp=16,ref=50 ... - - If you compiled the driver as a module, the same mode would be - selected by the following command: - - modprobe cyblafb mode=1280x1024 bpp=16 ref=50 ... - - None of the modes possible to select as startup modes are affected by - the problems described at the end of the next subsection. - - For all startup modes cyblafb chooses a virtual x resolution of 2048, - the only exception is mode 1280x1024 in combination with 32 bpp. This - allows ywrap scrolling for all those modes if rotation is 0 or 2, and - also fast scrolling if rotation is 1 or 3. The default virtual y reso- - lution is 4096 for bpp == 8, 2048 for bpp==16 and 1024 for bpp == 32, - again with the only exception of 1280x1024 at 32 bpp. - - Please do set your video memory size to 8 Mb in the Bios setup. Other - values will work, but performace is decreased for a lot of modes. - - Mode changes using fbset - ======================== - - You might use fbset to change the video mode, see "man fbset". Cyblafb - generally does assume that you know what you are doing. But it does - some checks, especially those that are needed to prevent you from - damaging your hardware. - - - only 8, 16, 24 and 32 bpp video modes are accepted - - interlaced video modes are not accepted - - double scan video modes are not accepted - - if a flat panel is found, cyblafb does not allow you - to program a resolution higher than the physical - resolution of the flat panel monitor - - cyblafb does not allow vclk to exceed 230 MHz. As 32 bpp - and (currently) 24 bit modes use a doubled vclk internally, - the dotclock limit as seen by fbset is 115 MHz for those - modes and 230 MHz for 8 and 16 bpp modes. - - cyblafb will allow you to select very high resolutions as - long as the hardware can be programmed to these modes. The - documented limit 1600x1200 is not enforced, but don't expect - perfect signal quality. - - Any request that violates the rules given above will be either changed - to something the hardware supports or an error value will be returned. - - If you program a virtual y resolution higher than the hardware limit, - cyblafb will silently decrease that value to the highest possible - value. The same is true for a virtual x resolution that is not - supported by the hardware. Cyblafb tries to adapt vyres first because - vxres decides if ywrap scrolling is possible or not. - - Attempts to disable acceleration are ignored, I believe that this is - safe. - - Some video modes that should work do not work as expected. If you use - the standard fb.modes, fbset 640x480-60 will program that mode, but - you will see a vertical area, about two characters wide, with only - much darker characters than the other characters on the screen. - Cyblafb does allow that mode to be set, as it does not violate the - official specifications. It would need a lot of code to reliably sort - out all invalid modes, playing around with the margin values will - give a valid mode quickly. And if cyblafb would detect such an invalid - mode, should it silently alter the requested values or should it - report an error? Both options have some pros and cons. As stated - above, none of the startup modes are affected, and if you set - verbosity to 1 or higher, cyblafb will print the fbset command that - would be needed to program that mode using fbset. - - -Other Parameters -================ - - -crt don't autodetect, assume monitor connected to - standard VGA connector - -fp don't autodetect, assume flat panel display - connected to flat panel monitor interface - -nativex inform driver about native x resolution of - flat panel monitor connected to special - interface (should be autodetected) - -stretch stretch image to adapt low resolution modes to - higer resolutions of flat panel monitors - connected to special interface - -center center image to adapt low resolution modes to - higer resolutions of flat panel monitors - connected to special interface - -memsize use if autodetected memsize is wrong ... - should never be necessary - -nopcirr disable PCI read retry -nopciwr disable PCI write retry -nopcirb disable PCI read bursts -nopciwb disable PCI write bursts - -bpp bpp for specified modes - valid values: 8 || 16 || 24 || 32 - -ref refresh rate for specified mode - valid values: 50 <= ref <= 85 - -mode 640x480 or 800x600 or 1024x768 or 1280x1024 - if not specified, the startup mode will be detected - and used, so you might also use the vga=??? parameter - described in vesafb.txt. If you do not specify a mode, - bpp and ref parameters are ignored. - -verbosity 0 is the default, increase to at least 2 for every - bug report! - -Development hints -================= - -It's much faster do compile a module and to load the new version after -unloading the old module than to compile a new kernel and to reboot. So if you -try to work on cyblafb, it might be a good idea to use cyblafb as a module. -In real life, fast often means dangerous, and that's also the case here. If -you introduce a serious bug when cyblafb is compiled into the kernel, the -kernel will lock or oops with a high probability before the file system is -mounted, and the danger for your data is low. If you load a broken own version -of cyblafb on a running system, the danger for the integrity of the file -system is much higher as you might need a hard reset afterwards. Decide -yourself. - -Module unloading, the vfb method -================================ - -If you want to unload/reload cyblafb using the virtual framebuffer, you need -to enable vfb support in the kernel first. After that, load the modules as -shown below: - - modprobe vfb vfb_enable=1 - modprobe fbcon - modprobe cyblafb - fbset -fb /dev/fb1 1280x1024-60 -vyres 2662 - con2fb /dev/fb1 /dev/tty1 - ... - -If you now made some changes to cyblafb and want to reload it, you might do it -as show below: - - con2fb /dev/fb0 /dev/tty1 - ... - rmmod cyblafb - modprobe cyblafb - con2fb /dev/fb1 /dev/tty1 - ... - -Of course, you might choose another mode, and most certainly you also want to -map some other /dev/tty* to the real framebuffer device. You might also choose -to compile fbcon as a kernel module or place it permanently in the kernel. - -I do not know of any way to unload fbcon, and fbcon will prevent the -framebuffer device loaded first from unloading. [If there is a way, then -please add a description here!] - -Module unloading, the vesafb method -=================================== - -Configure the kernel: - - <*> Support for frame buffer devices - [*] VESA VGA graphics support - Cyberblade/i1 support - -Add e.g. "video=vesafb:ypan vga=0x307" to the kernel parameters. The ypan -parameter is important, choose any vga parameter you like as long as it is -a graphics mode. - -After booting, load cyblafb without any mode and bpp parameter and assign -cyblafb to individual ttys using con2fb, e.g.: - - modprobe cyblafb - con2fb /dev/fb1 /dev/tty1 - -Unloading cyblafb works without problems after you assign vesafb to all -ttys again, e.g.: - - con2fb /dev/fb0 /dev/tty1 - rmmod cyblafb diff --git a/Documentation/fb/cyblafb/whatsnew b/Documentation/fb/cyblafb/whatsnew deleted file mode 100644 index 76c07a26e044..000000000000 --- a/Documentation/fb/cyblafb/whatsnew +++ /dev/null @@ -1,29 +0,0 @@ -0.62 -==== - - - the vesafb parameter has been removed as I decided to allow the - feature without any special parameter. - - - Cyblafb does not use the vga style of panning any longer, now the - "right view" register in the graphics engine IO space is used. Without - that change it was impossible to use all available memory, and without - access to all available memory it is impossible to ywrap. - - - The imageblit function now uses hardware acceleration for all font - widths. Hardware blitting across pixel column 2048 is broken in the - cyberblade/i1 graphics core, but we work around that hardware bug. - - - modes with vxres != xres are supported now. - - - ywrap scrolling is supported now and the default. This is a big - performance gain. - - - default video modes use vyres > yres and vxres > xres to allow - almost optimal scrolling speed for normal and rotated screens - - - some features mainly usefull for debugging the upper layers of the - framebuffer system have been added, have a look at the code - - - fixed: Oops after unloading cyblafb when reading /proc/io* - - - we work around some bugs of the higher framebuffer layers. diff --git a/Documentation/fb/cyblafb/whycyblafb b/Documentation/fb/cyblafb/whycyblafb deleted file mode 100644 index a123bc11e698..000000000000 --- a/Documentation/fb/cyblafb/whycyblafb +++ /dev/null @@ -1,85 +0,0 @@ -I tried the following framebuffer drivers: - - - TRIDENTFB is full of bugs. Acceleration is broken for Blade3D - graphics cores like the cyberblade/i1. It claims to support a great - number of devices, but documentation for most of these devices is - unfortunately not available. There is _no_ reason to use tridentfb - for cyberblade/i1 + CRT users. VESAFB is faster, and the one - advantage, mode switching, is broken in tridentfb. - - - VESAFB is used by many distributions as a standard. Vesafb does - not support mode switching. VESAFB is a bit faster than the working - configurations of TRIDENTFB, but it is still too slow, even if you - use ypan. - - - EPIAFB (you'll find it on sourceforge) supports the Cyberblade/i1 - graphics core, but it still has serious bugs and developement seems - to have stopped. This is the one driver with TV-out support. If you - do need this feature, try epiafb. - -None of these drivers was a real option for me. - -I believe that is unreasonable to change code that announces to support 20 -devices if I only have more or less sufficient documentation for exactly one -of these. The risk of breaking device foo while fixing device bar is too high. - -So I decided to start CyBlaFB as a stripped down tridentfb. - -All code specific to other Trident chips has been removed. After that there -were a lot of cosmetic changes to increase the readability of the code. All -register names were changed to those mnemonics used in the datasheet. Function -and macro names were changed if they hindered easy understanding of the code. - -After that I debugged the code and implemented some new features. I'll try to -give a little summary of the main changes: - - - calculation of vertical and horizontal timings was fixed - - - video signal quality has been improved dramatically - - - acceleration: - - - fillrect and copyarea were fixed and reenabled - - - color expanding imageblit was newly implemented, color - imageblit (only used to draw the penguine) still uses the - generic code. - - - init of the acceleration engine was improved and moved to a - place where it really works ... - - - sync function has a timeout now and tries to reset and - reinit the accel engine if necessary - - - fewer slow copyarea calls when doing ypan scrolling by using - undocumented bit d21 of screen start address stored in - CR2B[5]. BIOS does use it also, so this should be safe. - - - cyblafb rejects any attempt to set modes that would cause vclk - values above reasonable 230 MHz. 32bit modes use a clock - multiplicator of 2, so fbset does show the correct values for - pixclock but not for vclk in this case. The fbset limit is 115 MHz - for 32 bpp modes. - - - cyblafb rejects modes known to be broken or unimplemented (all - interlaced modes, all doublescan modes for now) - - - cyblafb now works independant of the video mode in effect at startup - time (tridentfb does not init all needed registers to reasonable - values) - - - switching between video modes does work reliably now - - - the first video mode now is the one selected on startup using the - vga=???? mechanism or any of - - 640x480, 800x600, 1024x768, 1280x1024 - - 8, 16, 24 or 32 bpp - - refresh between 50 Hz and 85 Hz, 1 Hz steps (1280x1024-32 - is limited to 63Hz) - - - pci retry and pci burst mode are settable (try to disable if you - experience latency problems) - - - built as a module cyblafb might be unloaded and reloaded using - the vfb module and con2vt or might be used together with vesafb - diff --git a/drivers/video/Kconfig b/drivers/video/Kconfig index c19f6feb4e53..db7a4f42edad 100644 --- a/drivers/video/Kconfig +++ b/drivers/video/Kconfig @@ -1597,30 +1597,6 @@ config FB_VT8623 Driver for CastleRock integrated graphics core in the VIA VT8623 [Apollo CLE266] chipset. -config FB_CYBLA - tristate "Cyberblade/i1 support" - depends on FB && PCI && X86_32 && !64BIT - select FB_CFB_IMAGEBLIT - ---help--- - This driver is supposed to support the Trident Cyberblade/i1 - graphics core integrated in the VIA VT8601A North Bridge, - also known as VIA Apollo PLE133. - - Status: - - Developed, tested and working on EPIA 5000 and EPIA 800. - - Does work reliable on all systems with CRT/LCD connected to - normal VGA ports. - - Should work on systems that do use the internal LCD port, but - this is absolutely not tested. - - Character imageblit, copyarea and rectangle fill are hw accelerated, - ypan scrolling is used by default. - - Please do read . - - To compile this driver as a module, choose M here: the - module will be called cyblafb. - config FB_TRIDENT tristate "Trident support" depends on FB && PCI @@ -1633,8 +1609,8 @@ config FB_TRIDENT and Blade XP. There are also integrated versions of these chips called CyberXXXX, CyberImage or CyberBlade. These chips are mostly found in laptops - but also on some motherboards. For more information, read - + but also on some motherboards including early VIA EPIA motherboards. + For more information, read Say Y if you have such a graphics board. diff --git a/drivers/video/cyblafb.c b/drivers/video/cyblafb.c deleted file mode 100644 index 9704b73135f5..000000000000 --- a/drivers/video/cyblafb.c +++ /dev/null @@ -1,1683 +0,0 @@ -/* - * Frame buffer driver for Trident Cyberblade/i1 graphics core - * - * Copyright 2005 Knut Petersen - * - * CREDITS: - * tridentfb.c by Jani Monoses - * see files above for further credits - * - */ - -#define CYBLAFB_DEBUG 0 -#define CYBLAFB_KD_GRAPHICS_QUIRK 1 - -#define CYBLAFB_PIXMAPSIZE 8192 - -#include -#include -#include -#include -#include -#include -#include