Received: by oss.sgi.com id ; Tue, 20 Feb 2001 20:40:33 -0800 Received: from tux.mkp.net ([130.225.60.11]:2829 "EHLO tux.mkp.net") by oss.sgi.com with ESMTP id ; Tue, 20 Feb 2001 20:40:13 -0800 Received: from tux.mkp.net ([130.225.60.11] helo=jaguar.mkp.net) by tux.mkp.net with esmtp (Exim 3.16 #1) id 14VR4P-00038a-00; Wed, 21 Feb 2001 05:40:06 +0100 Received: (from mkp@localhost) by jaguar.mkp.net (8.9.3/8.9.3) id SAA06233; Tue, 20 Feb 2001 18:39:58 -0500 X-Authentication-Warning: jaguar.mkp.net: mkp set sender to mkp@mkp.net using -f To: Kalvinder Singh Cc: linux-xfs@oss.sgi.com Subject: Re: XFS and SW RAID References: <3A9343EA.2040404@oz.agile.tv> From: "Martin K. Petersen" Organization: Linuxcare, Inc. Date: 20 Feb 2001 18:39:58 -0500 In-Reply-To: <3A9343EA.2040404@oz.agile.tv> Message-ID: Lines: 23 User-Agent: Gnus/5.0808 (Gnus v5.8.8) XEmacs/21.1 (Canyonlands) MIME-Version: 1.0 Content-Type: multipart/mixed; boundary="=-=-=" Sender: owner-linux-xfs@oss.sgi.com Precedence: bulk Return-Path: X-Orcpt: rfc822;linux-xfs-outgoing --=-=-= >>>>> "Kalvinder" == Kalvinder Singh writes: Kalvinder> I did read the FAQ, and from the sounds of it XFS will not Kalvinder> work with software RAID, however I went through the archive Kalvinder> and noticed that some people are working on it. Kalvinder> Is this correct? Kalvinder> And how much extra work needs to be done to get it working? We believe we have fixed the RAID5 corruption problems. However, the resync workaround hurts performance badly and I'll have to fix this properly. I've attached the current patch below. -- Martin K. Petersen, Principal Linux Consultant, Linuxcare, Inc. mkp@linuxcare.com, http://www.linuxcare.com/ SGI XFS for Linux Developer, http://oss.sgi.com/projects/xfs/ --=-=-= Content-Type: text/x-patch Content-Disposition: attachment; filename=raid5.patch =========================================================================== Index: linux/drivers/md/md.c =========================================================================== --- /usr/tmp/TmpDir.3925-0/linux/drivers/md/md.c_1.6 Thu Feb 15 21:45:48 2001 +++ linux/drivers/md/md.c Thu Feb 15 13:46:59 2001 @@ -2033,65 +2033,69 @@ struct { int set; int noautodetect; -} raid_setup_args md__initdata; -void md_setup_drive (void) md__init; +} raid_setup_args md__initdata = { 0, 0 }; + +void md_setup_drive(void) md__init; /* * Searches all registered partitions for autorun RAID arrays * at boot time. */ -static int detected_devices[128] md__initdata; -static int dev_cnt; - +#define CONFIG_AUTODETECT_RAID +#ifdef CONFIG_AUTODETECT_RAID +static int detected_devices[128] md__initdata = { 0, }; +static int dev_cnt=0; void md_autodetect_dev(kdev_t dev) { if (dev_cnt >= 0 && dev_cnt < 127) detected_devices[dev_cnt++] = dev; } +#endif - -static void autostart_arrays (void) +int md__init md_run_setup(void) { +#ifdef CONFIG_AUTODETECT_RAID mdk_rdev_t *rdev; int i; - printk(KERN_INFO "autodetecting RAID arrays\n"); + if (raid_setup_args.noautodetect) + printk(KERN_INFO "skipping autodetection of RAID arrays\n"); + else { - for (i=0; ifaulty) { - MD_BUG(); - continue; + for (i=0; ifaulty) { + MD_BUG(); + continue; + } + md_list_add(&rdev->pending, &pending_raid_disks); } - md_list_add(&rdev->pending, &pending_raid_disks); - } - autorun_devices(-1); -} + autorun_devices(-1); + } -int md__init md_run_setup(void) -{ - if (raid_setup_args.noautodetect) - printk(KERN_INFO "skipping autodetection of RAID arrays\n"); - else - autostart_arrays(); dev_cnt = -1; /* make sure further calls to md_autodetect_dev are ignored */ +#endif +#ifdef CONFIG_MD_BOOT md_setup_drive(); +#endif return 0; } @@ -2555,11 +2559,6 @@ md_print_devices(); goto done_unlock; - case RAID_AUTORUN: - err = 0; - autostart_arrays(); - goto done; - case BLKGETSIZE: /* Return device size */ if (!arg) { err = -EINVAL; @@ -2713,6 +2712,7 @@ case BLKSETSIZE: set_blocksize (mddev, (int *) arg); goto done_unlock; + /* * We have a problem here : there is no easy way to give a CHS @@ -3056,9 +3056,11 @@ int sz = 0; unsigned long max_blocks, resync, res, dt, db, rt; - resync = mddev->curr_resync - atomic_read(&mddev->recovery_active); + resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active)) >> 1; max_blocks = mddev->sb->size; + printk ("Res: %ld, Max Blocks: %ld\n", resync, max_blocks); + /* * Should not happen. */ @@ -3103,7 +3105,7 @@ if (!dt) dt++; db = resync - mddev->resync_mark_cnt; rt = (dt * ((max_blocks-resync) / (db/100+1)))/100; - + sz += sprintf(page + sz, " finish=%lu.%lumin", rt / 60, (rt % 60)/6); sz += sprintf(page + sz, " speed=%ldK/sec", db/dt); @@ -3274,10 +3276,10 @@ MD_DECLARE_WAIT_QUEUE_HEAD(resync_wait); -void md_done_sync(mddev_t *mddev, int blocks, int ok) +void md_done_sync(mddev_t *mddev, int sectors, int ok) { - /* another "blocks" (1K) blocks have been synced */ - atomic_sub(blocks, &mddev->recovery_active); + /* another chunk of sectors has been synced */ + atomic_sub(sectors, &mddev->recovery_active); wake_up(&mddev->recovery_wait); if (!ok) { // stop recovery, signal do_sync .... @@ -3289,7 +3291,7 @@ int md_do_sync(mddev_t *mddev, mdp_disk_t *spare) { mddev_t *mddev2; - unsigned int max_blocks, currspeed, + unsigned int max_sectors, currspeed, j, window, err, serialize; kdev_t read_disk = mddev_to_kdev(mddev); unsigned long mark[SYNC_MARKS]; @@ -3326,7 +3328,7 @@ mddev->curr_resync = 1; - max_blocks = mddev->sb->size; + max_sectors = mddev->sb->size << 1; printk(KERN_INFO "md: syncing RAID array md%d\n", mdidx(mddev)); printk(KERN_INFO "md: minimum _guaranteed_ reconstruction speed: %d KB/sec/disc.\n", @@ -3351,22 +3353,22 @@ * Tune reconstruction: */ window = MAX_READAHEAD*(PAGE_SIZE/1024); - printk(KERN_INFO "md: using %dk window, over a total of %d blocks.\n",window,max_blocks); + printk(KERN_INFO "md: using %dk window, over a total of %d sectors.\n",window, max_sectors); atomic_set(&mddev->recovery_active, 0); init_waitqueue_head(&mddev->recovery_wait); last_check = 0; - for (j = 0; j < max_blocks;) { - int blocks; + for (j = 0; j < max_sectors;) { + int sectors; - blocks = mddev->pers->sync_request(mddev, j); + sectors = mddev->pers->sync_request(mddev, j); - if (blocks < 0) { - err = blocks; + if (sectors < 0) { + err = sectors; goto out; } - atomic_add(blocks, &mddev->recovery_active); - j += blocks; + atomic_add(sectors, &mddev->recovery_active); + j += sectors; mddev->curr_resync = j; if (last_check + window > j) @@ -3384,7 +3386,7 @@ mark_cnt[next] = j - atomic_read(&mddev->recovery_active); last_mark = next; } - + if (md_signal_pending(current)) { /* @@ -3626,7 +3628,7 @@ &md_fops, NULL); /* forward all md request to md_make_request */ - blk_queue_make_request(BLK_DEFAULT_QUEUE(MAJOR_NR), md_make_request); + blk_queue_make_request(BLK_DEFAULT_QUEUE(MAJOR_NR), (void *) md_make_request); read_ahead[MAJOR_NR] = INT_MAX; @@ -3645,12 +3647,14 @@ return (0); } -static struct { - char device_set [MAX_MD_DEVS]; - int pers[MAX_MD_DEVS]; - int chunk[MAX_MD_DEVS]; - kdev_t devices[MAX_MD_DEVS][MD_SB_DISKS]; -} md_setup_args md__initdata; +#ifdef CONFIG_MD_BOOT +#define MAX_MD_BOOT_DEVS 8 +struct { + unsigned long set; + int pers[MAX_MD_BOOT_DEVS]; + int chunk[MAX_MD_BOOT_DEVS]; + kdev_t devices[MAX_MD_BOOT_DEVS][MD_SB_DISKS]; +} md_setup_args md__initdata = { 0, }; /* * Parse the command-line parameters given our kernel, but do not @@ -3680,10 +3684,10 @@ printk("md: Too few arguments supplied to md=.\n"); return 0; } - if (minor >= MAX_MD_DEVS) { + if (minor >= MAX_MD_BOOT_DEVS) { printk ("md: Minor device number too high.\n"); return 0; - } else if (md_setup_args.device_set[minor]) { + } else if (md_setup_args.set & (1 << minor)) { printk ("md: Warning - md=%d,... has been specified twice;\n" " will discard the first definition.\n", minor); } @@ -3741,7 +3745,7 @@ printk ("md: Will configure md%d (%s) from %s, below.\n", minor, pername, devnames); md_setup_args.devices[minor][i] = (kdev_t) 0; - md_setup_args.device_set[minor] = 1; + md_setup_args.set |= (1 << minor); return 1; } @@ -3751,11 +3755,10 @@ kdev_t dev; mddev_t*mddev; - for (minor = 0; minor < MAX_MD_DEVS; minor++) { + for (minor = 0; minor < MAX_MD_BOOT_DEVS; minor++) { mdu_disk_info_t dinfo; - - int err = 0; - if (!md_setup_args.device_set[minor]) + int err=0; + if (!(md_setup_args.set & (1 << minor))) continue; printk("md: Loading md%d.\n", minor); if (mddev_map[minor].mddev) { @@ -3781,7 +3784,7 @@ ainfo.layout = 0; ainfo.chunk_size = md_setup_args.chunk[minor]; err = set_array_info(mddev, &ainfo); - for (i = 0; !err && (dev = md_setup_args.devices[minor][i]); i++) { + for (i=0; !err && (dev = md_setup_args.devices[minor][i]); i++) { dinfo.number = i; dinfo.raid_disk = i; dinfo.state = (1< %d\n", oldsize, size); + printk("raid5: conf->buffer_size = %d\n", conf->buffer_size); shrink_stripe_cache(conf); if (size==0) BUG(); conf->buffer_size = size; @@ -714,16 +715,19 @@ break; } spin_unlock_irq(&conf->device_lock); - if (count>1) { - xor_block(count, bh_ptr); - count = 1; - } - + if (count>1) { + xor_block(count, bh_ptr); + count = 1; + } for (i = disks; i--;) if (chosen[i]) { struct buffer_head *bh = sh->bh_cache[i]; char *bdata; - mark_buffer_clean(chosen[i]); /* NO FIXME */ + if(!(test_bit(BH_End_io, &(chosen[i]->b_state)) + || chosen[i]->b_next_free == NULL + || chosen[i]->b_prev_free == NULL )){ + mark_buffer_clean(chosen[i]); + } bdata = bh_kmap(chosen[i]); memcpy(bh->b_data, bdata,sh->size); @@ -888,7 +892,7 @@ } spin_unlock_irq(&conf->device_lock); if (syncing) { - md_done_sync(conf->mddev, (sh->size>>10) - sh->sync_redone,0); + md_done_sync(conf->mddev, (sh->size>>9) - sh->sync_redone, 0); clear_bit(STRIPE_SYNCING, &sh->state); syncing = 0; } @@ -1063,7 +1067,7 @@ } } if (syncing && locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) { - md_done_sync(conf->mddev, (sh->size>>10) - sh->sync_redone,1); + md_done_sync(conf->mddev, (sh->size>>9) - sh->sync_redone, 1); clear_bit(STRIPE_SYNCING, &sh->state); } @@ -1159,13 +1163,13 @@ return correct_size; } -static int raid5_sync_request (mddev_t *mddev, unsigned long block_nr) +static int raid5_sync_request (mddev_t *mddev, unsigned long sector) { raid5_conf_t *conf = (raid5_conf_t *) mddev->private; struct stripe_head *sh; int sectors_per_chunk = conf->chunk_size >> 9; - unsigned long stripe = (block_nr<<1)/sectors_per_chunk; - int chunk_offset = (block_nr<<1) % sectors_per_chunk; + unsigned long stripe = sector/sectors_per_chunk; + int chunk_offset = sector % sectors_per_chunk; int dd_idx, pd_idx; unsigned long first_sector; int raid_disks = conf->raid_disks; @@ -1173,9 +1177,9 @@ int redone = 0; int bufsize; - sh = get_active_stripe(conf, block_nr<<1, 0, 0); + sh = get_active_stripe(conf, sector, 0, 0); bufsize = sh->size; - redone = block_nr-(sh->sector>>1); + redone = sector - sh->sector; first_sector = raid5_compute_sector(stripe*data_disks*sectors_per_chunk + chunk_offset, raid_disks, data_disks, &dd_idx, &pd_idx, conf); sh->pd_idx = pd_idx; @@ -1188,7 +1192,7 @@ handle_stripe(sh); release_stripe(sh); - return (bufsize>>10)-redone; + return (bufsize >> 9) - redone; } /* --=-=-=--