checkpoint: Mostly working

This seems to work except for I/O timing.  Reads are sync and
writes are async, so this sequence fails:
  - Object x flushes
  - Object x is reused as y
  - Another object is taken as x
  - New object x reads
  -> Stale data is read

Two potential solutions from here:

1. Implement async reads.

2. Hold ref to object until write completes.

(1) is complicated, but more correct.  The drawback of (2) is that
writes may stay in buffers for quite some time (typically up to 5
seconds), during which the dm-compress object cannot be released.
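
To make the failing sequence concrete, here is a toy single-threaded C
model (an illustration for this note, not code from the tree): the
backing store applies writes late, the way buffered async writes do,
while reads hit it immediately.

#include <stdio.h>
#include <string.h>

static char disk[16] = "old";                     /* one "physical block" */
static struct { char data[16]; int pending; } wq; /* async write buffer */

static void write_async(const char* data)
{
	strcpy(wq.data, data);       /* queued only; not yet on "disk" */
	wq.pending = 1;
}

static void write_completes(void)    /* ...up to ~5 seconds later */
{
	if (wq.pending) {
		strcpy(disk, wq.data);
		wq.pending = 0;
	}
}

int main(void)
{
	write_async("new");          /* object x flushes               */
	                             /* x is reused as y; a fresh      */
	                             /* object is taken as x           */
	printf("%s\n", disk);        /* new x reads -> "old": stale    */
	write_completes();
	printf("%s\n", disk);        /* -> "new"                       */
	return 0;
}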
Tom Marshall 2019-10-21 19:39:27 -07:00
parent 8ff29f0262
commit 495d191d16
12 changed files with 2958 additions and 1410 deletions

Makefile

@@ -84,7 +84,12 @@ LIB_SRCS := \
BIN_SRCS := \
cbd.c
KMOD_SRCS := \
util.c \
pbat.c \
lbatpage.c \
lbatview.c \
lbd.c \
compress.c
# Intermediates
LIB_OBJDIR := $(OUT_OBJ)/$(LIB_NAME)
@@ -151,7 +156,7 @@ $(BIN_OBJS): $(BIN_OBJDIR)/%.o: $(BIN_NAME)/%.c
$(OUT_BIN)/$(BIN_NAME)$(EXE_EXT): $(BIN_OBJS) $(OUT_LIB)/$(LIB_NAME)$(LIB_EXT)
$(link-executable)
$(KMOD_NAME)/$(KMOD_NAME)$(KMOD_EXT): $(addprefix $(KMOD_NAME)/,$(KMOD_SRCS))
make -C $(KDIR) M=$(TOP)/$(KMOD_NAME) modules
$(OUT_KMOD)/$(KMOD_NAME)$(KMOD_EXT): $(KMOD_NAME)/$(KMOD_NAME)$(KMOD_EXT)

TODO

@@ -1,3 +1,45 @@
In lbd, atomic writes will require a pblk array in the lbd object. Not sure
how to roll back partial allocations yet but it should be doable.
For async reads:
- lbd_read() is called by compress_read() and compress_write().
lbd may have multiple simultaneous callers.
lbd calls lbatview_read() and reads its own data.
- lbatview_read() is called by lbd.
lbatview may have multiple simultaneous callers.
lbatview calls pbat_read() and reads its own data.
- pbat_read() is called by lbatview_alloc_pblk() and lbatview_free_pblk().
pbat may have multiple simultaneous callers.
pbat reads its own data.
Rework cache ownership:
- compress_open() should alloc only lbdcache.
- lbdcache should alloc only lbatviewcache.
- lbatviewcache should alloc lbatpagecache and pbatcache.
Cache object sizing:
- lbdcache size: multiple of num_online_cpus().
- lbatviewcache:
Min: one.
Max: one per lbd.
Avg: 1/2 lbdcache size.
=> Alloc 1/2 lbdcache size.
- lbatpagecache:
Min: lbatviewcache size.
Max: 2 * lbatviewcache size.
Avg: 1.5 * lbatviewcache size.
=> alloc 1.5 * lbatviewcache size.
- pbatcache size:
Min: 1
Max: lbatviewcache size.
Avg: 1/2 lbatviewcache size.
=> alloc ???
1/2 lbatviewcache size is way too large.
Ratio of lbatview to pbat is 1:lbat_per_pbat.
Cache objects should dynamically expand.
TODO:
- Move back to module based build system.
- Make compression algorithm and speed/level selectable.
@@ -8,6 +50,4 @@ TODO:
- Compressed device must be large enough.
- Backing device must be large enough.
- Remove workqueue.
- (?) Write / flush once per second. What about sync?
- (?) Don't cache lblk_alloc.
- (?) Function ptrs for reading and writing lblk_alloc.
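
The cache-ownership rework listed above is what the code in this commit
implements: compress_open() allocates only the lbdcache, the lbdcache
constructs the lbatviewcache, and the lbatviewcache constructs the
pbatcache and lbatpagecache. A condensed view of the constructor chain
(simplified from lbd.c and lbatview.c below; memset, locking, and error
messages omitted):

bool lbdcache_ctr(struct lbdcache* lc, struct cbd_params* params)
{
	lc->lvc = kzalloc(lbatviewcache_size(), GFP_KERNEL);
	return lc->lvc && lbatviewcache_ctr(lc->lvc, params) &&
	       lbdcache_realloc(lc, 1);
}

bool lbatviewcache_ctr(struct lbatviewcache* lvc, struct cbd_params* params)
{
	lvc->pc = kmalloc(pbatcache_size(), GFP_KERNEL);
	lvc->lpc = kmalloc(lbatpagecache_size(), GFP_KERNEL);
	return lvc->pc && lvc->lpc &&
	       pbatcache_ctr(lvc->pc, params) &&
	       lbatpagecache_ctr(lvc->lpc, params) &&
	       lbatviewcache_realloc(lvc, 1);
}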

dm-compress/Makefile

@@ -1,5 +1,13 @@
# Makefile for dm-compress kernel module
dm-compress-y += \
util.o \
pbat.o \
lbatpage.o \
lbatview.o \
lbd.o \
compress.o
obj-m += dm-compress.o
ccflags-y := -I$(M)/../include

dm-compress/compress.c (new file)

@@ -0,0 +1,485 @@
/*
* Copyright (c) 2019 Tom Marshall <tdm.code@gmail.com>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version 2
* of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
* 02110-1301, USA.
*/
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/bio.h>
#include <linux/device-mapper.h>
#include <linux/workqueue.h>
#include <linux/mutex.h>
#include <linux/lz4.h>
#include <linux/dm-compress.h>
// XXX: find a better name for this, something about storage vs. speed.
// XXX: should this be in cbd_params?
// #define CBD_DETECT_ZERO_BLOCKS
/*
* XXX
* If we don't use a workqueue, pblk_read() stalls. Why?
*/
#define USE_WORKQUEUE 1
struct compress;
/* per bio private data */
struct compress_io {
struct compress* c;
struct bio* bio;
struct work_struct work;
};
struct compress
{
struct dm_dev* dev;
struct cbd_params params;
struct lbdcache* lc;
struct mutex io_mutex;
struct workqueue_struct* io_workq;
bool io_failed;
};
static inline u64
blkdev_pblk_size(struct block_device *bdev)
{
return i_size_read(bdev->bd_inode) >> PBLK_SHIFT;
}
static inline u64
dm_target_pblk_size(struct dm_target* ti)
{
return ti->len >> (PBLK_SHIFT - SECTOR_SHIFT);
}
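/*
 * Unit check (illustrative numbers, not from the source): ti->len is in
 * 512-byte sectors and PBLK_SHIFT - SECTOR_SHIFT = 12 - 9 = 3, so a
 * 1 GiB target (ti->len == 2097152 sectors) maps to
 * 2097152 >> 3 == 262144 physical blocks of 4 KiB each.
 */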
/**************************************
* Main functions
**************************************/
static int
compress_open(struct compress* c, u64 dev_nr_pblks)
{
int err;
struct page* pblkpage;
u8 *pblkbuf;
struct page* iopagev[1];
struct cbd_header header;
u64 max_nr_zones;
pblkpage = cbd_alloc_page();
if (!pblkpage) {
return -ENOMEM;
}
pblkbuf = page_address(pblkpage);
iopagev[0] = pblkpage;
memset(&header, 0, sizeof(header));
header.params.priv = c->dev->bdev;
err = pblk_read_wait(&header.params, 0, 1, iopagev);
if (err) {
printk(KERN_ERR "%s: failed to read header\n", __func__);
cbd_free_page(pblkpage);
return err;
}
cbd_header_get(pblkbuf, &header);
cbd_free_page(pblkpage);
if (memcmp(header.magic, CBD_MAGIC, sizeof(header.magic)) != 0) {
printk(KERN_ERR "%s: bad magic\n", __func__);
err = -EINVAL;
goto out;
}
if (header.version_major != CBD_VERSION_MAJOR) {
printk(KERN_ERR "%s: bad version\n", __func__);
err = -EINVAL;
goto out;
}
if (header.version_minor != CBD_VERSION_MINOR) {
printk(KERN_ERR "%s: bad version\n", __func__);
err = -EINVAL;
goto out;
}
if (header.params.lblk_shift < LBLK_SHIFT_MIN ||
header.params.lblk_shift > LBLK_SHIFT_MAX) {
printk(KERN_ERR "%s: bad lblk_shift\n", __func__);
err = -EINVAL;
goto out;
}
/* XXX: validate minimum pblk using zone_off(max_zone+1) */
if (header.params.nr_pblk > dev_nr_pblks) {
printk(KERN_ERR "%s: bad nr_pblk\n", __func__);
err = -EINVAL;
goto out;
}
max_nr_zones = (dev_nr_pblks - CBD_HEADER_BLOCKS) / zone_len(&header.params);
if (header.params.nr_zones > max_nr_zones) {
printk(KERN_ERR "%s: bad nr_zones\n", __func__);
err = -EINVAL;
goto out;
}
/* XXX: validate lblk_per_zone */
printk(KERN_INFO "%s: parameters...\n", __func__);
printk(KERN_INFO " algorithm=%hu\n", (unsigned short)header.params.algorithm);
printk(KERN_INFO " compression=%hu\n", (unsigned short)header.params.compression);
printk(KERN_INFO " lblk_shift=%hu\n", (unsigned short)header.params.lblk_shift);
printk(KERN_INFO " nr_pblk=%lu\n", (unsigned long)header.params.nr_pblk);
printk(KERN_INFO " nr_zones=%u\n", (unsigned int)header.params.nr_zones);
printk(KERN_INFO " lblk_per_zone=%u\n", (unsigned int)header.params.lblk_per_zone);
memcpy(&c->params, &header.params, sizeof(header.params));
c->lc = kmalloc(lbdcache_size(), GFP_KERNEL);
if (!c->lc) {
err = -ENOMEM;
printk(KERN_ERR "Failed to alloc lbdcache\n");
goto out;
}
if (!lbdcache_ctr(c->lc, &c->params)) {
err = -ENOMEM;
printk(KERN_ERR "Failed to init logical block cache\n");
goto out;
}
mutex_init(&c->io_mutex);
c->io_workq = alloc_workqueue("compress_io", WQ_HIGHPRI | WQ_MEM_RECLAIM, 1);
if (!c->io_workq) {
printk(KERN_ERR "%s: failed to alloc io_workq\n", __func__);
err = -ENOMEM;
goto out;
}
c->io_failed = false;
out:
/* XXX: cleanup on error */
return err;
}
static struct lbd*
compress_lbdcache_swap(struct compress* c, u64 lblk, struct lbd* oldlbd)
{
struct lbd* lbd;
/* Get new data before putting old data to avoid flush */
lbd = lbdcache_get(c->lc, lblk);
if (!lbd) {
printk(KERN_ERR "%s: lbdcache_get failed\n", __func__);
lbdcache_put(c->lc, oldlbd);
return NULL;
}
if (lbd_read(lbd) != 0) {
printk(KERN_ERR "%s: lbd_read failed\n", __func__);
lbdcache_put(c->lc, lbd);
lbdcache_put(c->lc, oldlbd);
return NULL;
}
if (lbdcache_put(c->lc, oldlbd) != 0) {
printk(KERN_ERR "%s: failed to put oldlbd\n", __func__);
lbdcache_put(c->lc, lbd);
return NULL;
}
return lbd;
}
static int
compress_read(struct compress *c, struct bio *bio)
{
struct lbd* lbd = NULL;
struct bio_vec bv;
struct bvec_iter iter;
int ret;
u32 lblk_per_sector = lblk_per_pblk(&c->params) * PBLK_PER_SECTOR;
u64 last_lblk = LBLK_NONE;
bio_for_each_segment(bv, bio, iter) {
sector_t lblk = iter.bi_sector / lblk_per_sector;
u32 lblk_off = (iter.bi_sector - lblk * lblk_per_sector) * SECTOR_SIZE;
unsigned long flags;
char* data;
if (lblk != last_lblk) {
lbd = compress_lbdcache_swap(c, lblk, lbd);
if (!lbd) {
return -EIO;
}
last_lblk = lblk;
}
data = bvec_kmap_irq(&bv, &flags);
lbd_data_read(lbd, lblk_off, bv.bv_len, data);
bvec_kunmap_irq(data, &flags);
}
ret = lbdcache_put(c->lc, lbd);
return ret;
}
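/*
 * Worked example for the mapping above (illustrative numbers). Despite
 * its name, lblk_per_sector counts sectors per logical block (and
 * PBLK_PER_SECTOR counts sectors per pblk, i.e. 8). With
 * lblk_per_pblk() == 4 (16 KiB logical blocks), lblk_per_sector is
 * 4 * 8 = 32, so bi_sector 100 maps to lblk = 100 / 32 = 3 with
 * byte offset lblk_off = (100 - 3 * 32) * 512 = 2048.
 */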
static int
compress_write(struct compress *c, struct bio *bio)
{
struct lbd* lbd = NULL;
struct bio_vec bv;
struct bvec_iter iter;
int ret;
u32 lblk_per_sector = lblk_per_pblk(&c->params) * PBLK_PER_SECTOR;
u64 last_lblk = LBLK_NONE;
bio_for_each_segment(bv, bio, iter) {
sector_t lblk = iter.bi_sector / lblk_per_sector;
u32 lblk_off = (iter.bi_sector - lblk * lblk_per_sector) * SECTOR_SIZE;
unsigned long flags;
char* data;
if (lblk != last_lblk) {
lbd = compress_lbdcache_swap(c, lblk, lbd);
if (!lbd) {
return -EIO;
}
last_lblk = lblk;
}
data = bvec_kmap_irq(&bv, &flags);
lbd_data_write(lbd, lblk_off, bv.bv_len, data);
bvec_kunmap_irq(data, &flags);
}
ret = lbdcache_put(c->lc, lbd);
return ret;
}
static void
compress_io(struct compress_io* cio)
{
int ret;
struct compress* c = cio->c;
struct bio* bio = cio->bio;
mutex_lock(&c->io_mutex);
switch (bio_op(bio)) {
case REQ_OP_READ:
ret = compress_read(c, bio);
break;
case REQ_OP_WRITE:
ret = compress_write(c, bio);
break;
default:
printk(KERN_ERR "%s: unknown op in bio: %u\n", __func__, bio_op(bio));
ret = -EINVAL;
}
mutex_unlock(&c->io_mutex);
if (ret) {
printk(KERN_ERR "%s: failed, ret=%d\n", __func__, ret);
}
bio->bi_status = (ret == 0 ? BLK_STS_OK : BLK_STS_IOERR); /* XXX */
bio_endio(bio);
}
#ifdef USE_WORKQUEUE
static void
compress_io_work(struct work_struct* work)
{
struct compress_io* cio = container_of(work, struct compress_io, work);
compress_io(cio);
}
#endif
/*
* Usage:
* echo "<start_sector> <end_sector> compress <backing_device> <args...>" | dmsetup create <compress_name>
* Where:
* start_sector is the starting sector of the backing device.
* end_sector is the ending sector of the backing device.
* compress is the name of this module.
backing_device is the name of the backing device.
* args is:
* create [lblk_shift=#]
* open
* compress_name is the name of the compress device.
*/
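/*
 * Example invocation (illustrative; assumes /dev/loop0 is a 1 GiB
 * backing device already formatted with the cbd header):
 *
 *   echo "0 2097152 compress /dev/loop0" | dmsetup create cbd0
 *
 * where 2097152 is the device length in 512-byte sectors.
 */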
static int
compress_ctr(struct dm_target *ti, unsigned int argc, char **argv)
{
int err;
unsigned int argn;
struct compress *c = NULL;
u64 dev_nr_pblks;
printk(KERN_INFO "%s: enter: argc=%u\n", __func__, argc);
for (argn = 0; argn < argc; ++argn) {
printk(KERN_INFO " ... arg[%u]=\"%s\"\n", argn, argv[argn]);
}
if (argc == 0) {
ti->error = "No device specified";
return -EINVAL;
}
argn = 1;
while (argn < argc) {
const char* arg = argv[argn++];
const char* eq = strchr(arg, '=');
if (!eq) {
ti->error = "Invalid argument format";
return -EINVAL;
}
#if 0
if (!memcmp(arg, "verbose", 7)) {
err = kstrtouint(eq + 1, 0, &verbose_level);
if (err) {
ti->error = "Failed to parse verbose";
return -EINVAL;
}
continue;
}
#endif
ti->error = "Unrecognized argument";
return -EINVAL;
}
c = kzalloc(sizeof(struct compress), GFP_KERNEL);
if (!c) {
ti->error = "Failed to allocate target";
return -ENOMEM;
}
if (dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &c->dev)) {
ti->error = "Device lookup failed";
kfree(c);
return -EINVAL;
}
ti->private = c;
dev_nr_pblks = dm_target_pblk_size(ti);
if (get_order(dev_nr_pblks) >= 48) {
ti->error = "Device too large";
kfree(c);
return -EINVAL;
}
ti->per_io_data_size = ALIGN(sizeof(struct compress_io), ARCH_KMALLOC_MINALIGN);
err = compress_open(c, dev_nr_pblks);
if (err) {
dm_put_device(ti, c->dev);
kfree(c);
return err;
}
printk(KERN_INFO "%s: success\n", __func__);
return 0;
}
static void
compress_dtr(struct dm_target *ti)
{
struct compress *c;
printk(KERN_INFO "%s: enter\n", __func__);
c = ti->private;
lbdcache_dtr(c->lc);
kfree(c->lc);
if (c->io_workq) {
destroy_workqueue(c->io_workq);
}
dm_put_device(ti, c->dev);
kfree(c);
}
static int
compress_map(struct dm_target *ti, struct bio *bio)
{
struct compress *c = ti->private;
struct compress_io *cio;
if (c->io_failed) {
return DM_MAPIO_KILL;
}
/* from dm-crypt.c */
if (unlikely(bio->bi_opf & REQ_PREFLUSH || bio_op(bio) == REQ_OP_DISCARD)) {
bio_set_dev(bio, c->dev->bdev);
if (bio_sectors(bio)) {
/* XXX: remap to underlying data */
}
return DM_MAPIO_REMAPPED;
}
/* Synchronous I/O operations deadlock, so queue them. */
/* XXX: clone the bio? */
cio = dm_per_bio_data(bio, ti->per_io_data_size);
cio->c = c;
cio->bio = bio;
#ifdef USE_WORKQUEUE
INIT_WORK(&cio->work, compress_io_work);
queue_work(c->io_workq, &cio->work);
#else
compress_io(cio);
#endif
return DM_MAPIO_SUBMITTED;
}
static struct target_type compress_target = {
.name = "compress",
.version = { 1, 0, 0 },
.module = THIS_MODULE,
.ctr = compress_ctr,
.dtr = compress_dtr,
.map = compress_map,
};
static int __init
dm_compress_init(void)
{
int res;
res = dm_register_target(&compress_target);
if (res < 0) {
printk(KERN_ERR "Failed to register dm-compress: %d\n", res);
}
return res;
}
static void __exit
dm_compress_exit(void)
{
dm_unregister_target(&compress_target);
}
module_init(dm_compress_init);
module_exit(dm_compress_exit);
MODULE_DESCRIPTION("compress target for transparent compression");
MODULE_AUTHOR("Tom Marshall <tdm.code@gmail.com>");
MODULE_LICENSE("GPL");
MODULE_VERSION("1.0");

(File diff suppressed because it is too large.)

dm-compress/lbatpage.c (new file)

@@ -0,0 +1,315 @@
/*
* Copyright (c) 2019 Tom Marshall <tdm.code@gmail.com>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version 2
* of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
* 02110-1301, USA.
*/
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/bio.h>
#include <linux/device-mapper.h>
#include <linux/workqueue.h>
#include <linux/mutex.h>
#include <linux/lz4.h>
#include <linux/dm-compress.h>
struct lbatpage {
u64 pblk;
struct mutex reflock;
unsigned int ref;
struct mutex lock;
enum cache_state state;
struct cbd_params* params;
struct page* page;
u8* buf;
bool dirty;
};
bool
lbatpage_ctr(struct lbatpage* lp, struct cbd_params* params)
{
lp->pblk = PBLK_NONE;
mutex_init(&lp->reflock);
lp->ref = 0;
mutex_init(&lp->lock);
lp->state = CACHE_STATE_UNCACHED;
lp->params = params;
lp->page = cbd_alloc_page();
if (!lp->page) {
return false;
}
lp->buf = page_address(lp->page);
lp->dirty = false;
return true;
}
void
lbatpage_dtr(struct lbatpage* lp)
{
lp->buf = NULL;
cbd_free_page(lp->page);
lp->page = NULL;
}
static void
lbatpage_flush_endio(struct bio* bio)
{
int ret;
cbd_free_page(bio->bi_io_vec[0].bv_page);
ret = pblk_endio(bio);
if (ret) {
/* XXX: ...? */
printk(KERN_ERR "%s: I/O failed\n", __func__);
}
}
int
lbatpage_flush(struct lbatpage* lp)
{
int ret = 0;
struct page* iopagev[1];
mutex_lock(&lp->lock);
if (lp->state != CACHE_STATE_DIRTY) {
goto out;
}
iopagev[0] = cbd_alloc_page();
if (!iopagev[0]) {
printk(KERN_ERR "%s: out of memory\n", __func__);
ret = -ENOMEM;
goto out;
}
memcpy(page_address(iopagev[0]), lp->buf, PAGE_SIZE);
pblk_write(lp->params, lp->pblk, 1, iopagev, lbatpage_flush_endio, lp);
lp->state = CACHE_STATE_CLEAN;
out:
mutex_unlock(&lp->lock);
return ret;
}
int
lbatpage_read(struct lbatpage* lp)
{
int ret = 0;
struct page* pagev[1];
ret = lbatpage_flush(lp);
if (ret) {
return ret;
}
mutex_lock(&lp->lock);
if (lp->state == CACHE_STATE_CLEAN) {
goto out;
}
pagev[0] = lp->page;
ret = pblk_read_wait(lp->params, lp->pblk, 1, pagev);
if (ret) {
goto out;
}
lp->state = CACHE_STATE_CLEAN;
out:
mutex_unlock(&lp->lock);
return ret;
}
void
lbatpage_reset(struct lbatpage* lp, u64 pblk)
{
BUG_ON(lp->pblk == pblk);
lp->pblk = pblk;
lp->state = CACHE_STATE_UNCACHED;
}
u8*
lbatpage_get_buf(struct lbatpage* lp, bool rw)
{
mutex_lock(&lp->lock);
if (rw) {
lp->state = CACHE_STATE_DIRTY;
}
return lp->buf;
}
void
lbatpage_put_buf(struct lbatpage* lp)
{
mutex_unlock(&lp->lock);
}
struct lbatpagecache {
struct mutex lock;
struct cbd_params* params;
unsigned int len;
struct lbatpage** cache;
};
size_t
lbatpagecache_size(void)
{
return sizeof(struct lbatpagecache);
}
static bool
lbatpagecache_realloc(struct lbatpagecache* lpc, unsigned int len)
{
struct lbatpage** cache;
unsigned int n;
struct lbatpage* lp;
cache = kzalloc(len * sizeof(struct lbatpage*), GFP_KERNEL);
if (!cache) {
return false;
}
n = 0;
if (lpc->len) {
memcpy(cache, lpc->cache, lpc->len * sizeof(struct lbatpage*));
n = lpc->len;
kfree(lpc->cache);
}
lpc->len = len;
lpc->cache = cache;
while (n < len) {
lp = kmalloc(sizeof(struct lbatpage), GFP_KERNEL);
if (!lp) {
return false;
}
cache[n++] = lp;
if (!lbatpage_ctr(lp, lpc->params)) {
return false;
}
}
return true;
}
bool
lbatpagecache_ctr(struct lbatpagecache* lpc,
struct cbd_params* params)
{
memset(lpc, 0, sizeof(struct lbatpagecache));
mutex_init(&lpc->lock);
lpc->params = params;
return lbatpagecache_realloc(lpc, 1);
}
void
lbatpagecache_dtr(struct lbatpagecache* lpc)
{
unsigned int n;
struct lbatpage* lp;
for (n = 0; n < lpc->len; ++n) {
lp = lpc->cache[n];
if (!lp) {
continue;
}
lbatpage_dtr(lp);
if (lp->ref) {
printk(KERN_ERR "%s: lbatpage ref leak: n=%u ref=%u\n", __func__, n, lp->ref);
}
kfree(lp);
}
kfree(lpc->cache);
lpc->cache = NULL;
lpc->len = 0;
lpc->params = NULL;
}
struct lbatpage*
lbatpagecache_get(struct lbatpagecache* lpc, u64 pblk)
{
unsigned int n;
struct lbatpage* lp;
mutex_lock(&lpc->lock);
for (n = 0; n < lpc->len; ++n) {
lp = lpc->cache[n];
mutex_lock(&lp->reflock);
if (lp->pblk == pblk) {
++lp->ref;
mutex_unlock(&lp->reflock);
goto out;
}
mutex_unlock(&lp->reflock);
}
for (n = 0; n < lpc->len; ++n) {
lp = lpc->cache[n];
mutex_lock(&lp->reflock);
if (lp->pblk == PBLK_NONE) {
goto found;
}
mutex_unlock(&lp->reflock);
}
for (n = 0; n < lpc->len; ++n) {
lp = lpc->cache[n];
mutex_lock(&lp->reflock);
if (lp->ref == 0) {
goto found;
}
mutex_unlock(&lp->reflock);
}
printk(KERN_INFO "%s: all pages in use, realloc...\n", __func__);
n = lpc->len;
if (!lbatpagecache_realloc(lpc, lpc->len * 2)) {
printk(KERN_ERR "%s: realloc failed\n", __func__);
lp = NULL;
goto out;
}
printk(KERN_INFO "%s: realloc done, using n=%u\n", __func__, n);
lp = lpc->cache[n];
mutex_lock(&lp->reflock);
found:
lbatpage_reset(lp, pblk);
lp->ref = 1;
mutex_unlock(&lp->reflock);
out:
mutex_unlock(&lpc->lock);
return lp;
}
int
lbatpagecache_put(struct lbatpagecache* lpc, struct lbatpage* lp)
{
int ret = 0;
if (!lp) {
return 0;
}
mutex_lock(&lpc->lock);
mutex_lock(&lp->reflock);
if (--lp->ref == 0) {
ret = lbatpage_flush(lp);
if (ret) {
printk(KERN_ERR "%s: lbatpage_flush failed\n", __func__);
}
}
mutex_unlock(&lp->reflock);
mutex_unlock(&lpc->lock);
return ret;
}

dm-compress/lbatview.c (new file)

@@ -0,0 +1,681 @@
/*
* Copyright (c) 2019 Tom Marshall <tdm.code@gmail.com>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version 2
* of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
* 02110-1301, USA.
*/
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/bio.h>
#include <linux/device-mapper.h>
#include <linux/workqueue.h>
#include <linux/mutex.h>
#include <linux/lz4.h>
#include <linux/dm-compress.h>
struct lbatview {
u64 pblk;
struct mutex reflock;
unsigned int ref;
struct mutex lock;
enum cache_state state;
struct cbd_params* params;
struct pbatcache* pbatcache;
struct pbat* pbat;
struct lbatpagecache* lpc;
struct lbatpage* pages[2];
};
bool
lbatview_ctr(struct lbatview* lv,
struct cbd_params* params,
struct pbatcache* pbatcache,
struct lbatpagecache* lpc)
{
memset(lv, 0, sizeof(struct lbatview));
lv->pblk = PBLK_NONE;
mutex_init(&lv->reflock);
lv->ref = 0;
mutex_init(&lv->lock);
lv->state = CACHE_STATE_UNCACHED;
lv->params = params;
lv->pbatcache = pbatcache;
lv->pbat = NULL;
lv->lpc = lpc;
lv->pages[0] = lv->pages[1] = NULL;
return true;
}
void
lbatview_dtr(struct lbatview* lv)
{
if (pbatcache_put(lv->pbatcache, lv->pbat) != 0) {
printk(KERN_ERR "%s: pbatcache_put failed\n", __func__);
}
lv->pbat = NULL;
lbatpagecache_put(lv->lpc, lv->pages[0]);
lbatpagecache_put(lv->lpc, lv->pages[1]);
lv->pages[0] = lv->pages[1] = NULL;
lv->lpc = NULL;
}
int
lbatview_flush(struct lbatview* lv)
{
int ret = 0;
mutex_lock(&lv->lock);
if (lv->state != CACHE_STATE_DIRTY) {
goto out;
}
BUG_ON(!lv->pages[0]);
BUG_ON(lv->pblk == PBLK_NONE);
if (lv->pages[0]) {
ret = lbatpage_flush(lv->pages[0]);
if (ret) {
goto out;
}
}
if (lv->pages[1]) {
ret = lbatpage_flush(lv->pages[1]);
if (ret) {
goto out;
}
}
lv->state = CACHE_STATE_CLEAN;
out:
mutex_unlock(&lv->lock);
return ret;
}
int
lbatview_read(struct lbatview* lv)
{
int ret = 0;
ret = lbatview_flush(lv);
if (ret) {
return ret;
}
mutex_lock(&lv->lock);
if (lv->pages[0]) {
ret = lbatpage_read(lv->pages[0]);
if (ret) {
goto out;
}
}
if (lv->pages[1]) {
ret = lbatpage_read(lv->pages[1]);
if (ret) {
goto out;
}
}
lv->state = CACHE_STATE_CLEAN;
out:
mutex_unlock(&lv->lock);
return ret;
}
bool
lbatview_reset(struct lbatview* lv, u64 pblk, u32 count)
{
bool ret = true;
u32 zone = (pblk - CBD_HEADER_BLOCKS) / zone_len(lv->params);
BUG_ON(lv->pblk == pblk);
lv->pblk = pblk;
lv->state = CACHE_STATE_UNCACHED;
if (pbatcache_put(lv->pbatcache, lv->pbat) != 0) {
printk(KERN_ERR "%s: pbatcache_put failed\n", __func__);
ret = false;
}
lv->pbat = pbatcache_get(lv->pbatcache, zone);
if (!lv->pbat) {
ret = false;
}
if (lbatpagecache_put(lv->lpc, lv->pages[0]) != 0) {
ret = false;
}
lv->pages[0] = NULL;
if (lbatpagecache_put(lv->lpc, lv->pages[1]) != 0) {
ret = false;
}
lv->pages[1] = NULL;
if (count > 0) {
lv->pages[0] = lbatpagecache_get(lv->lpc, pblk + 0);
}
if (count > 1) {
lv->pages[1] = lbatpagecache_get(lv->lpc, pblk + 1);
}
return ret;
}
static u64
lbatview_alloc_pblk(struct lbatview* lv)
{
int ret = 0;
u32 zone = (lv->pblk - CBD_HEADER_BLOCKS) / zone_len(lv->params);
u64 pblk;
u32 zone_off;
struct pbat* pbat;
if (!lv->pbat) {
printk(KERN_ERR "%s: *** lv->pbat is NULL\n", __func__);
return PBLK_NONE;
}
pblk = pbat_alloc(lv->pbat);
if (pblk != PBLK_NONE) {
return pblk;
}
printk(KERN_INFO "%s: alloc failed for current zone\n", __func__);
ret = pbatcache_put(lv->pbatcache, lv->pbat);
if (ret) {
printk(KERN_ERR "%s: pbatcache_put failed\n", __func__);
return PBLK_NONE;
}
lv->pbat = NULL;
for (zone_off = 1;
zone_off <= zone || zone + zone_off < lv->params->nr_zones;
++zone_off) {
if (zone_off <= zone) {
pbat = pbatcache_get(lv->pbatcache, zone - zone_off);
if (!pbat) {
printk(KERN_ERR "%s: pbatcache_get failed\n", __func__);
return PBLK_NONE;
}
if (pbat_read(pbat) != 0) {
printk(KERN_ERR "%s: pbat_read failed\n", __func__);
return PBLK_NONE;
}
pblk = pbat_alloc(pbat);
if (pblk != PBLK_NONE) {
printk(KERN_INFO "%s: using zone %u, alloc=%lu\n", __func__, (zone - zone_off), (unsigned long)pblk);
lv->pbat = pbat;
return pblk;
}
ret = pbatcache_put(lv->pbatcache, pbat);
if (ret) {
printk(KERN_ERR "%s: pbatcache_put failed\n", __func__);
return PBLK_NONE;
}
}
if (zone + zone_off < lv->params->nr_zones) {
pbat = pbatcache_get(lv->pbatcache, zone + zone_off);
if (!pbat) {
printk(KERN_ERR "%s: pbatcache_get failed\n", __func__);
return PBLK_NONE;
}
if (pbat_read(pbat) != 0) {
printk(KERN_ERR "%s: pbat_read failed\n", __func__);
return PBLK_NONE;
}
pblk = pbat_alloc(pbat);
if (pblk != PBLK_NONE) {
printk(KERN_INFO "%s: using zone %u, alloc=%lu\n", __func__, (zone + zone_off), (unsigned long)pblk);
lv->pbat = pbat;
return pblk;
}
ret = pbatcache_put(lv->pbatcache, pbat);
if (ret) {
printk(KERN_ERR "%s: pbatcache_put failed\n", __func__);
return PBLK_NONE;
}
}
}
printk(KERN_ERR "%s: fail, all zones full\n", __func__);
return PBLK_NONE;
}
static int
lbatview_free_pblk(struct lbatview* lv, u64 pblk)
{
int ret = 0;
u32 zone = (lv->pblk - CBD_HEADER_BLOCKS) / zone_len(lv->params);
u32 pblk_zone;
struct pbat* pbat;
if (!lv->pbat) {
printk(KERN_ERR "%s: *** lv->pbat is NULL\n", __func__);
return -EINVAL;
}
if (pblk < CBD_HEADER_BLOCKS) {
printk(KERN_ERR "%s: pblk index is in header\n", __func__);
return -EINVAL;
}
pblk_zone = (pblk - CBD_HEADER_BLOCKS) / zone_len(lv->params);
if (pblk_zone >= lv->params->nr_zones) {
printk(KERN_ERR "%s: pblk zone out of bounds\n", __func__);
return -EINVAL;
}
pbat = pbatcache_get(lv->pbatcache, pblk_zone);
if (!pbat) {
printk(KERN_ERR "%s: pbatcache_get failed\n", __func__);
return -EINVAL;
}
ret = pbat_read(pbat);
if (ret != 0) {
printk(KERN_ERR "%s: pbat_read failed\n", __func__);
return ret;
}
ret = pbat_free(pbat, pblk);
if (pblk_zone == zone && pbat_zone(lv->pbat) != zone) {
printk(KERN_INFO "%s: freed block %lu in zone %u switching back\n", __func__, (unsigned long)pblk, zone);
ret = pbatcache_put(lv->pbatcache, lv->pbat);
if (ret) {
printk(KERN_ERR "%s: pbatcache_put failed\n", __func__);
}
lv->pbat = pbat;
}
else {
ret = pbatcache_put(lv->pbatcache, pbat);
if (ret) {
printk(KERN_ERR "%s: pbatcache_put failed\n", __func__);
}
}
return ret;
}
static u32
lbatview_elem_off(struct lbatview* lv, u64 lblk)
{
u32 lv_zone = (lv->pblk - CBD_HEADER_BLOCKS) / zone_len(lv->params);
/* The relative lblk in the zone. */
u32 zone_rel_lblk = lblk - (lv_zone * lv->params->lblk_per_zone);
/* The offset of the element in the (full) lbat. */
u32 lbat_elem_off = zone_rel_lblk * lba_len(lv->params);
/* The offset of the first view pblk. */
u32 lbatview_off = PBLK_SIZE * (lv->pblk - lbat_off(lv->params, lv_zone));
return lbat_elem_off - lbatview_off;
}
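/*
 * Example with made-up numbers: if lblk_per_zone == 1024 and
 * lba_len() == 10, a view starting at the second lbat pblk of zone 0
 * (lv->pblk == lbat_off(params, 0) + 1) has lbatview_off == 4096.
 * For zone-relative lblk 500, lbat_elem_off = 500 * 10 = 5000, so the
 * element starts at byte 5000 - 4096 = 904 of pages[0].
 */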
static void
lbatview_rmem(struct lbatview* lv, u32 off, u32 len, void* buf)
{
/* XXX: Convert below to a BUG_ON */
if (off + len > 2 * PAGE_SIZE) {
printk(KERN_ERR "%s: *** out of bounds\n", __func__);
return;
}
if (off < PAGE_SIZE) {
if (!lv->pages[0]) {
printk(KERN_ERR "%s *** no page0\n", __func__);
return;
}
}
if (off + len > PAGE_SIZE) {
if (!lv->pages[1]) {
printk(KERN_ERR "%s *** no page1\n", __func__);
return;
}
}
if (off < PAGE_SIZE && off + len > PAGE_SIZE) {
u32 len0 = PAGE_SIZE - off;
u8* pagebuf0 = lbatpage_get_buf(lv->pages[0], false);
u8* pagebuf1 = lbatpage_get_buf(lv->pages[1], false);
memcpy(buf, pagebuf0 + off, len0);
memcpy(buf + len0, pagebuf1, len - len0);
lbatpage_put_buf(lv->pages[1]);
lbatpage_put_buf(lv->pages[0]);
}
else {
u32 bufidx = off / PAGE_SIZE;
u32 bufoff = off % PAGE_SIZE;
u8* pagebuf = lbatpage_get_buf(lv->pages[bufidx], false);
memcpy(buf, pagebuf + bufoff, len);
lbatpage_put_buf(lv->pages[bufidx]);
}
}
static void
lbatview_wmem(struct lbatview* lv, u32 off, u32 len, void* buf)
{
/* XXX: Convert below to a BUG_ON */
if (off + len > 2 * PAGE_SIZE) {
printk(KERN_ERR "%s: *** out of bounds\n", __func__);
return;
}
if (off < PAGE_SIZE) {
if (!lv->pages[0]) {
printk(KERN_ERR "%s *** no page0\n", __func__);
return;
}
}
if (off + len > PAGE_SIZE) {
if (!lv->pages[1]) {
printk(KERN_ERR "%s *** no page1\n", __func__);
return;
}
}
if (off < PAGE_SIZE && off + len > PAGE_SIZE) {
u32 len0 = PAGE_SIZE - off;
u8* pagebuf0 = lbatpage_get_buf(lv->pages[0], true);
u8* pagebuf1 = lbatpage_get_buf(lv->pages[1], true);
memcpy(pagebuf0 + off, buf, len0);
memcpy(pagebuf1, buf + len0, len - len0);
lbatpage_put_buf(lv->pages[1]);
lbatpage_put_buf(lv->pages[0]);
}
else {
u32 bufidx = off / PAGE_SIZE;
u32 bufoff = off % PAGE_SIZE;
u8* pagebuf = lbatpage_get_buf(lv->pages[bufidx], true);
memcpy(pagebuf + bufoff, buf, len);
lbatpage_put_buf(lv->pages[bufidx]);
}
lv->state = CACHE_STATE_DIRTY;
}
int
lbatview_elem_realloc(struct lbatview* lv, u64 lblk, u32 len)
{
int ret = 0;
u32 off;
u32 n;
u64 pblk;
u32 elem_len_size = (lv->params->lblk_shift + PBLK_SHIFT > 16) ? 4 : 2;
u32 elem_pblk_size = (lv->params->nr_pblk <= 0xffff ? 2 :
(lv->params->nr_pblk <= 0xffffffff ? 4 : 6));
u32 elem_lelen;
u64 elem_lepblk;
mutex_lock(&lv->lock);
off = lbatview_elem_off(lv, lblk);
elem_lelen = __cpu_to_le32(len);
lbatview_wmem(lv, off, elem_len_size, &elem_lelen);
off += elem_len_size;
if (len == CBD_UNCOMPRESSED) {
len = PBLK_SIZE * lblk_per_pblk(lv->params);
}
for (n = 0; n < lblk_per_pblk(lv->params); ++n, off += elem_pblk_size) {
elem_lepblk = 0;
lbatview_rmem(lv, off, elem_pblk_size, &elem_lepblk);
pblk = __le64_to_cpu(elem_lepblk);
if (len > PBLK_SIZE * n) {
if (pblk == 0) {
pblk = lbatview_alloc_pblk(lv);
if (pblk == PBLK_NONE) {
printk(KERN_ERR " lbat_alloc_pblk failed\n");
ret = -ENOSPC;
goto out; /* XXX: undo */
}
elem_lepblk = __cpu_to_le64(pblk);
lbatview_wmem(lv, off, elem_pblk_size, &elem_lepblk);
}
}
else {
if (pblk != 0) {
elem_lepblk = 0;
lbatview_wmem(lv, off, elem_pblk_size, &elem_lepblk);
ret = lbatview_free_pblk(lv, pblk);
if (ret) {
printk(KERN_ERR " lbat_free_pblk failed\n");
goto out; /* XXX: undo */
}
}
}
}
out:
mutex_unlock(&lv->lock);
return ret;
}
u32
lbatview_elem_len(struct lbatview* lv, u64 lblk)
{
u32 off;
u32 elem_len_size = (lv->params->lblk_shift + PBLK_SHIFT > 16) ? 4 : 2;
u32 elem_lelen;
mutex_lock(&lv->lock);
off = lbatview_elem_off(lv, lblk);
elem_lelen = 0;
lbatview_rmem(lv, off, elem_len_size, &elem_lelen);
mutex_unlock(&lv->lock);
return __le32_to_cpu(elem_lelen);
}
u64
lbatview_elem_pblk(struct lbatview* lv, u64 lblk, u32 idx)
{
u32 off;
u32 elem_len_size = (lv->params->lblk_shift + PBLK_SHIFT > 16) ? 4 : 2;
u32 elem_pblk_size = (lv->params->nr_pblk <= 0xffff ? 2 :
(lv->params->nr_pblk <= 0xffffffff ? 4 : 6));
u64 elem_lepblk;
mutex_lock(&lv->lock);
off = lbatview_elem_off(lv, lblk) +
elem_len_size + idx * elem_pblk_size;
elem_lepblk = 0;
lbatview_rmem(lv, off, elem_pblk_size, &elem_lepblk);
mutex_unlock(&lv->lock);
return __le64_to_cpu(elem_lepblk);
}
struct lbatviewcache {
struct mutex lock;
struct cbd_params* params;
struct pbatcache* pc;
struct lbatpagecache* lpc;
unsigned int len;
struct lbatview** cache;
};
size_t
lbatviewcache_size(void)
{
return sizeof(struct lbatviewcache);
}
static bool
lbatviewcache_realloc(struct lbatviewcache* lvc, unsigned int len)
{
struct lbatview** cache;
unsigned int n;
struct lbatview* lv;
cache = kzalloc(len * sizeof(struct lbatview*), GFP_KERNEL);
if (!cache) {
return false;
}
n = 0;
if (lvc->len) {
memcpy(cache, lvc->cache, lvc->len * sizeof(struct lbatview*));
n = lvc->len;
kfree(lvc->cache);
}
lvc->len = len;
lvc->cache = cache;
while (n < len) {
lv = kmalloc(sizeof(struct lbatview), GFP_KERNEL);
if (!lv) {
return false;
}
cache[n++] = lv;
if (!lbatview_ctr(lv, lvc->params, lvc->pc, lvc->lpc)) {
return false;
}
}
return true;
}
bool
lbatviewcache_ctr(struct lbatviewcache* lvc,
struct cbd_params* params)
{
memset(lvc, 0, sizeof(struct lbatviewcache));
mutex_init(&lvc->lock);
lvc->params = params;
lvc->pc = kmalloc(pbatcache_size(), GFP_KERNEL);
if (!lvc->pc) {
return false;
}
if (!pbatcache_ctr(lvc->pc, params)) {
return false;
}
lvc->lpc = kmalloc(lbatpagecache_size(), GFP_KERNEL);
if (!lvc->lpc) {
return false;
}
if (!lbatpagecache_ctr(lvc->lpc, params)) {
return false;
}
return lbatviewcache_realloc(lvc, 1);
}
void
lbatviewcache_dtr(struct lbatviewcache* lvc)
{
unsigned int n;
struct lbatview* lv;
for (n = 0; n < lvc->len; ++n) {
lv = lvc->cache[n];
if (!lv) {
continue;
}
lbatview_dtr(lv);
if (lv->ref) {
printk(KERN_ERR "%s: lbatview ref leak: n=%u ref=%u\n", __func__, n, lv->ref);
}
kfree(lv);
}
kfree(lvc->cache);
lvc->cache = NULL;
lvc->len = 0;
lbatpagecache_dtr(lvc->lpc);
kfree(lvc->lpc);
lvc->lpc = NULL;
pbatcache_dtr(lvc->pc);
kfree(lvc->pc);
lvc->pc = NULL;
lvc->params = NULL;
}
struct lbatview*
lbatviewcache_get(struct lbatviewcache* lvc, u64 lblk)
{
u32 zone;
u64 zone_lbat_pblk;
u32 rel_lblk;
u32 lbat_offset;
u32 rel_pblk;
u64 pblk;
u32 count;
unsigned int n;
struct lbatview* lv;
zone = lblk / lvc->params->lblk_per_zone;
zone_lbat_pblk = lbat_off(lvc->params, zone);
rel_lblk = lblk - lvc->params->lblk_per_zone * zone;
lbat_offset = rel_lblk * lba_len(lvc->params);
rel_pblk = lbat_offset / PBLK_SIZE;
pblk = zone_lbat_pblk + rel_pblk;
count = (rel_pblk == lbat_len(lvc->params) - 1) ? 1 : 2;
mutex_lock(&lvc->lock);
for (n = 0; n < lvc->len; ++n) {
lv = lvc->cache[n];
mutex_lock(&lv->reflock);
if (lv->pblk == pblk) {
++lv->ref;
mutex_unlock(&lv->reflock);
goto out;
}
mutex_unlock(&lv->reflock);
}
for (n = 0; n < lvc->len; ++n) {
lv = lvc->cache[n];
mutex_lock(&lv->reflock);
if (lv->pblk == PBLK_NONE) {
goto found;
}
mutex_unlock(&lv->reflock);
}
for (n = 0; n < lvc->len; ++n) {
lv = lvc->cache[n];
mutex_lock(&lv->reflock);
if (lv->ref == 0) {
goto found;
}
mutex_unlock(&lv->reflock);
}
printk(KERN_INFO "%s: all objects in use, realloc...\n", __func__);
n = lvc->len;
if (!lbatviewcache_realloc(lvc, lvc->len * 2)) {
printk(KERN_ERR "%s: realloc failed\n", __func__);
lv = NULL;
goto out;
}
lv = lvc->cache[n];
mutex_lock(&lv->reflock);
found:
if (!lbatview_reset(lv, pblk, count)) {
mutex_unlock(&lv->reflock);
printk(KERN_ERR "%s: lbatview_reset failed\n", __func__);
lv = NULL;
goto out;
}
lv->ref = 1;
mutex_unlock(&lv->reflock);
out:
mutex_unlock(&lvc->lock);
return lv;
}
int
lbatviewcache_put(struct lbatviewcache* lvc, struct lbatview* lv)
{
int ret = 0;
if (!lv) {
return 0;
}
mutex_lock(&lvc->lock);
mutex_lock(&lv->reflock);
if (--lv->ref == 0) {
ret = lbatview_flush(lv);
if (ret) {
printk(KERN_ERR "%s: lbatview_flush failed\n", __func__);
}
}
mutex_unlock(&lv->reflock);
mutex_unlock(&lvc->lock);
return ret;
}

dm-compress/lbd.c (new file)

@@ -0,0 +1,548 @@
/*
* Copyright (c) 2019 Tom Marshall <tdm.code@gmail.com>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version 2
* of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
* 02110-1301, USA.
*/
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/bio.h>
#include <linux/device-mapper.h>
#include <linux/workqueue.h>
#include <linux/mutex.h>
#include <linux/lz4.h>
#include <linux/dm-compress.h>
struct lbd {
u64 lblk;
struct mutex reflock;
unsigned int ref;
struct mutex lock;
enum cache_state state;
struct cbd_params* params;
struct lbatviewcache* lvc;
struct lbatview* lv;
u8* lz4_wrkmem;
struct page* lz4_cpages;
u8* lz4_cbuf;
struct page* pages;
u8* buf;
};
/*
* Allocating lz4_wrkmem percpu:
*
* If the alloc is per-instance, it would need to be allocated in compress.c
* and passed around. The easiest way to pass it around is likely to make it
* part of a struct. We can't use struct compress because that is private.
* So we would need to create a struct (say, compress_percpu).
*
* If the alloc is global, we can just declare it file-local. But it would
* need to be the largest possible size. Which means we probably don't want
* to use alloc_percpu_gfp() directly, because 1mb chunks are probably not
* that common. So suppose we allocate a percpu vector of page ptrs.
*
* #define COMPRESS_MAX_INPUT_SIZE (1 << LBLK_SHIFT_MAX)
* #define COMPRESS_LZ4_BOUND LZ4_COMPRESSBOUND(COMPRESS_MAX_INPUT_SIZE)
* #define WRKMEM_PAGES DIV_ROUND_UP(COMPRESS_LZ4_BOUND, PAGE_SIZE)
* typedef struct page*[WRKMEM_PAGES] lz4_wrkmem_pagev_t;
*
* g_lz4_wrkmem = alloc_percpu_gfp(lz4_wrkmem_pagev_t, GFP_IO);
*
* That's not bad at all. But how do we alloc (and free) the actual pages?
*
* pagev = get_cpu_var(g_lz4_wrkmem);
* put_cpu_var(pagev);
*
* free_percpu(g_lz4_wrkmem);
*/
static inline bool
lblk_is_zeros(struct cbd_params* params, struct lbd* lbd)
{
#ifdef CBD_DETECT_ZERO_BLOCKS
u32 off;
u32 len = PBLK_SIZE * lblk_per_pblk(params);
for (off = 0; off < len; ++off) {
if (lbd->buf[off]) {
return false;
}
}
return true;
#else
return false;
#endif
}
/*
* Compress dc->lblk into dc->lz4_cbuf
*
* Returns number of bytes in cbuf or 0 for failure.
*/
static size_t
lblk_compress(struct lbd* lbd)
{
int ret;
void *dbuf = lbd->buf;
u32 dlen = PBLK_SIZE * lblk_per_pblk(lbd->params);
void *cbuf = lbd->lz4_cbuf;
u32 clen = PBLK_SIZE * lblk_per_pblk(lbd->params);
ret = LZ4_compress_default(dbuf, cbuf, dlen, clen, lbd->lz4_wrkmem);
if (ret <= 0) {
return 0;
}
return (size_t)ret;
}
/*
* Decompress dc->lz4_cbuf of size clen into dc->lblk
*
* Returns 0 for success, <0 for failure.
*/
static int
lblk_decompress(struct lbd* lbd, u32 clen)
{
int ret;
void *cbuf = lbd->lz4_cbuf;
void *dbuf = lbd->buf;
u32 dlen = PBLK_SIZE * lblk_per_pblk(lbd->params);
ret = LZ4_decompress_safe(cbuf, dbuf, clen, dlen);
if (ret != dlen) {
printk(KERN_ERR "%s: failed, ret=%d (expected %u)\n", __func__, ret, (unsigned int)dlen);
return -1;
}
return 0;
}
bool
lbd_ctr(struct lbd* lbd,
struct cbd_params* params,
struct lbatviewcache* lvc)
{
memset(lbd, 0, sizeof(struct lbd));
lbd->lblk = LBLK_NONE;
mutex_init(&lbd->reflock);
lbd->ref = 0;
mutex_init(&lbd->lock);
lbd->state = CACHE_STATE_UNCACHED;
lbd->params = params;
lbd->lvc = lvc;
lbd->lv = NULL;
lbd->lz4_wrkmem = kmalloc(LZ4_compressBound(PBLK_SIZE * lblk_per_pblk(lbd->params)), GFP_KERNEL);
if (!lbd->lz4_wrkmem) {
return false;
}
lbd->lz4_cpages = cbd_alloc_pages(lblk_per_pblk(lbd->params));
if (!lbd->lz4_cpages) {
return false;
}
lbd->lz4_cbuf = page_address(lbd->lz4_cpages);
lbd->pages = cbd_alloc_pages(lblk_per_pblk(lbd->params));
if (!lbd->pages) {
return false;
}
lbd->buf = page_address(lbd->pages);
return true;
}
void
lbd_dtr(struct lbd* lbd)
{
if (lbatviewcache_put(lbd->lvc, lbd->lv) != 0) {
printk(KERN_ERR "%s: lbatviewcache_put failed\n", __func__);
}
lbd->lv = NULL;
cbd_free_pages(lbd->pages, lblk_per_pblk(lbd->params));
lbd->pages = NULL;
lbd->buf = NULL;
cbd_free_pages(lbd->lz4_cpages, lblk_per_pblk(lbd->params));
lbd->lz4_cpages = NULL;
lbd->lz4_cbuf = NULL;
kfree(lbd->lz4_wrkmem);
lbd->lz4_wrkmem = NULL;
}
static void
lbd_flush_endio(struct bio* bio)
{
int ret;
cbd_free_page(bio->bi_io_vec[0].bv_page);
ret = pblk_endio(bio);
if (ret) {
/* XXX: ...? */
printk(KERN_ERR "%s: I/O failed\n", __func__);
}
}
int
lbd_flush(struct lbd* lbd)
{
int ret = 0;
u32 c_len;
u32 elem_len;
u8* p;
u32 n;
u64 pblk;
struct page* iopagev[1];
mutex_lock(&lbd->lock);
if (lbd->state != CACHE_STATE_DIRTY) {
goto out;
}
if (lblk_is_zeros(lbd->params, lbd)) {
c_len = 0;
elem_len = 0;
p = NULL;
}
else {
c_len = lblk_compress(lbd);
if (c_len > 0) {
size_t c_blkrem = c_len % PBLK_SIZE;
if (c_blkrem) {
memset(lbd->lz4_cbuf + c_len, 0, c_blkrem);
}
elem_len = c_len;
p = lbd->lz4_cbuf;
}
else {
c_len = PBLK_SIZE * lblk_per_pblk(lbd->params);
elem_len = CBD_UNCOMPRESSED;
p = lbd->buf;
}
}
ret = lbatview_elem_realloc(lbd->lv, lbd->lblk, elem_len);
if (ret) {
goto out;
}
for (n = 0; n * PBLK_SIZE < c_len; ++n, p += PBLK_SIZE) {
pblk = lbatview_elem_pblk(lbd->lv, lbd->lblk, n);
iopagev[0] = cbd_alloc_page();
if (!iopagev[0]) {
printk(KERN_ERR "%s: out of memory\n", __func__);
ret = -ENOMEM;
goto out;
}
memcpy(page_address(iopagev[0]), p, PBLK_SIZE);
pblk_write(lbd->params, pblk, 1, iopagev, lbd_flush_endio, lbd);
}
lbd->state = CACHE_STATE_CLEAN;
out:
mutex_unlock(&lbd->lock);
return ret;
}
int
lbd_read(struct lbd* lbd)
{
int ret = 0;
u32 c_len;
u64 pblk;
struct page* iopagev[1];
ret = lbd_flush(lbd);
if (ret) {
return ret;
}
mutex_lock(&lbd->lock);
if (lbd->state == CACHE_STATE_CLEAN) {
goto out;
}
ret = lbatview_read(lbd->lv);
if (ret) {
printk(KERN_ERR "%s: lbat_read failed\n", __func__);
goto out;
}
c_len = lbatview_elem_len(lbd->lv, lbd->lblk);
if (c_len == 0) {
memset(lbd->buf, 0, PBLK_SIZE * lblk_per_pblk(lbd->params));
}
else {
bool is_compressed = true;
u32 d_len = PBLK_SIZE * lblk_per_pblk(lbd->params);
u32 n;
u8* p;
if (c_len == CBD_UNCOMPRESSED) {
is_compressed = false;
c_len = d_len;
}
p = lbd->lz4_cbuf;
for (n = 0; n * PBLK_SIZE < c_len; ++n, p += PBLK_SIZE) {
pblk = lbatview_elem_pblk(lbd->lv, lbd->lblk, n);
if (pblk == 0) {
printk(KERN_ERR "%s: pblk is zero at lblk=%lu n=%u\n", __func__,
(unsigned long)lbd->lblk, n);
ret = -EIO;
goto out;
}
iopagev[0] = virt_to_page(p);
ret = pblk_read_wait(lbd->params, pblk, 1, iopagev);
if (ret) {
goto out;
}
}
if (is_compressed) {
if (lblk_decompress(lbd, c_len) != 0) {
printk(KERN_ERR " decompress failed\n");
ret = -EIO;
goto out;
}
}
else {
memcpy(lbd->buf, lbd->lz4_cbuf, d_len);
}
}
out:
mutex_unlock(&lbd->lock);
return ret;
}
bool
lbd_reset(struct lbd* lbd, u64 lblk)
{
bool ret = true;
BUG_ON(lbd->lblk == lblk);
lbd->lblk = lblk;
lbd->state = CACHE_STATE_UNCACHED;
if (lbatviewcache_put(lbd->lvc, lbd->lv) != 0) {
printk(KERN_ERR "%s: lbatviewcache_put failed\n", __func__);
ret = false;
}
lbd->lv = lbatviewcache_get(lbd->lvc, lblk);
if (!lbd->lv) {
printk(KERN_ERR "%s: lbatviewcache_get failed\n", __func__);
ret = false;
}
return ret;
}
void
lbd_data_read(struct lbd* lbd, u32 off, u32 len, u8* buf)
{
/* XXX: convert to BUG_ON */
if (off + len > PBLK_SIZE * lblk_per_pblk(lbd->params)) {
printk(KERN_ERR "%s: out of bounds\n", __func__);
return;
}
mutex_lock(&lbd->lock);
memcpy(buf, lbd->buf + off, len);
mutex_unlock(&lbd->lock);
}
void
lbd_data_write(struct lbd* lbd, u32 off, u32 len, const u8* buf)
{
/* XXX: convert to BUG_ON */
if (off + len > PBLK_SIZE * lblk_per_pblk(lbd->params)) {
printk(KERN_ERR "%s: out of bounds\n", __func__);
return;
}
mutex_lock(&lbd->lock);
memcpy(lbd->buf + off, buf, len);
lbd->state = CACHE_STATE_DIRTY;
mutex_unlock(&lbd->lock);
}
struct lbdcache
{
struct mutex lock;
struct cbd_params* params;
struct lbatviewcache* lvc;
unsigned int len;
struct lbd** cache;
};
size_t
lbdcache_size(void)
{
return sizeof(struct lbdcache);
}
static bool
lbdcache_realloc(struct lbdcache* lc, unsigned int len)
{
struct lbd** cache;
unsigned int n;
struct lbd* lbd;
cache = kzalloc(len * sizeof(struct lbd*), GFP_KERNEL);
if (!cache) {
return false;
}
n = 0;
if (lc->len) {
memcpy(cache, lc->cache, lc->len * sizeof(struct lbd*));
n = lc->len;
kfree(lc->cache);
}
lc->len = len;
lc->cache = cache;
while (n < len) {
lbd = kmalloc(sizeof(struct lbd), GFP_KERNEL);
if (!lbd) {
return false;
}
cache[n++] = lbd;
if (!lbd_ctr(lbd, lc->params, lc->lvc)) {
return false;
}
}
return true;
}
bool
lbdcache_ctr(struct lbdcache* lc,
struct cbd_params* params)
{
memset(lc, 0, sizeof(struct lbdcache));
mutex_init(&lc->lock);
lc->params = params;
lc->lvc = kzalloc(lbatviewcache_size(), GFP_KERNEL);
if (!lc->lvc) {
return false;
}
if (!lbatviewcache_ctr(lc->lvc, params)) {
return false;
}
return lbdcache_realloc(lc, 1);
}
void
lbdcache_dtr(struct lbdcache* lc)
{
unsigned int n;
struct lbd* lbd;
for (n = 0; n < lc->len; ++n) {
lbd = lc->cache[n];
if (!lbd) {
continue;
}
lbd_dtr(lbd);
if (lbd->ref) {
printk(KERN_ERR "%s: lbd ref leak: n=%u ref=%u\n", __func__, n, lbd->ref);
}
kfree(lbd);
}
kfree(lc->cache);
lc->cache = NULL;
lc->len = 0;
lbatviewcache_dtr(lc->lvc);
kfree(lc->lvc);
lc->lvc = NULL;
lc->params = NULL;
}
struct lbd*
lbdcache_get(struct lbdcache* lc, u64 lblk)
{
unsigned int n;
struct lbd* lbd;
mutex_lock(&lc->lock);
for (n = 0; n < lc->len; ++n) {
lbd = lc->cache[n];
mutex_lock(&lbd->reflock);
if (lbd->lblk == lblk) {
++lbd->ref;
mutex_unlock(&lbd->reflock);
goto out;
}
mutex_unlock(&lbd->reflock);
}
for (n = 0; n < lc->len; ++n) {
lbd = lc->cache[n];
mutex_lock(&lbd->reflock);
if (lbd->lblk == LBLK_NONE) {
goto found;
}
mutex_unlock(&lbd->reflock);
}
for (n = 0; n < lc->len; ++n) {
lbd = lc->cache[n];
mutex_lock(&lbd->reflock);
if (lbd->ref == 0) {
goto found;
}
mutex_unlock(&lbd->reflock);
}
printk(KERN_INFO "%s: all objects in use, realloc...\n", __func__);
n = lc->len;
if (!lbdcache_realloc(lc, lc->len * 2)) {
printk(KERN_ERR "%s: realloc failed\n", __func__);
lbd = NULL;
goto out;
}
lbd = lc->cache[n];
mutex_lock(&lbd->reflock);
found:
if (!lbd_reset(lbd, lblk)) {
mutex_unlock(&lbd->reflock);
printk(KERN_ERR "%s: lbd_reset failed\n", __func__);
lbd = NULL;
goto out;
}
lbd->ref = 1;
mutex_unlock(&lbd->reflock);
out:
mutex_unlock(&lc->lock);
return lbd;
}
int
lbdcache_put(struct lbdcache* lc, struct lbd* lbd)
{
int ret = 0;
if (!lbd) {
return 0;
}
mutex_lock(&lc->lock);
mutex_lock(&lbd->reflock);
if (--lbd->ref == 0) {
ret = lbd_flush(lbd);
if (ret) {
printk(KERN_ERR "%s: lbd_flush failed\n", __func__);
}
}
mutex_unlock(&lbd->reflock);
mutex_unlock(&lc->lock);
return ret;
}

dm-compress/pbat.c (new file)

@@ -0,0 +1,370 @@
/*
* Copyright (c) 2019 Tom Marshall <tdm.code@gmail.com>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version 2
* of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
* 02110-1301, USA.
*/
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/bio.h>
#include <linux/device-mapper.h>
#include <linux/workqueue.h>
#include <linux/mutex.h>
#include <linux/lz4.h>
#include <linux/dm-compress.h>
struct pbat {
u32 zone;
struct mutex reflock;
unsigned int ref;
struct mutex lock;
enum cache_state state;
struct cbd_params* params;
struct page* pages;
u8* buf;
};
bool
pbat_ctr(struct pbat* pbat,
struct cbd_params* params)
{
memset(pbat, 0, sizeof(struct pbat));
pbat->zone = ZONE_NONE;
mutex_init(&pbat->reflock);
pbat->ref = 0;
mutex_init(&pbat->lock);
pbat->state = CACHE_STATE_UNCACHED;
pbat->params = params;
pbat->pages = cbd_alloc_pages(pbat_len(params));
if (!pbat->pages) {
printk(KERN_ERR "%s: Failed to alloc pbat_buf\n", __func__);
return false;
}
pbat->buf = page_address(pbat->pages);
return true;
}
void
pbat_dtr(struct pbat* pbat)
{
pbat->buf = NULL;
cbd_free_pages(pbat->pages, pbat_len(pbat->params));
pbat->pages = NULL;
}
static void
pbat_flush_endio(struct bio* bio)
{
int ret;
unsigned int n;
for (n = 0; n < bio->bi_max_vecs; ++n) {
cbd_free_page(bio->bi_io_vec[n].bv_page);
}
ret = pblk_endio(bio);
if (ret) {
/*
* XXX:
* Set dm_compress.io_error?
* Set pbat.io_error?
* Set pbat.zone = ZONE_ERR?
*/
printk(KERN_ERR "%s: XXX: I/O failed\n", __func__);
}
}
int
pbat_flush(struct pbat* pbat)
{
int ret = 0;
u32 count = pbat_len(pbat->params);
struct page* iopagev[count];
u64 pblk;
u32 n;
u8* iobuf;
mutex_lock(&pbat->lock);
if (pbat->state != CACHE_STATE_DIRTY) {
goto out;
}
pblk = pbat_off(pbat->params, pbat->zone);
if (!cbd_alloc_pagev(iopagev, count)) {
printk(KERN_ERR "%s: out of memory\n", __func__);
ret = -ENOMEM;
goto out;
}
for (n = 0; n < count; ++n) {
iobuf = page_address(iopagev[n]);
memcpy(iobuf, pbat->buf + n * PBLK_SIZE, PBLK_SIZE);
}
pblk_write(pbat->params, pblk, count, iopagev, pbat_flush_endio, pbat);
pbat->state = CACHE_STATE_CLEAN;
out:
mutex_unlock(&pbat->lock);
return ret;
}
int
pbat_read(struct pbat* pbat)
{
int ret = 0;
u32 count = pbat_len(pbat->params);
struct page* pagev[count];
u64 pblk;
u32 n;
ret = pbat_flush(pbat);
if (ret) {
return ret;
}
mutex_lock(&pbat->lock);
if (pbat->state == CACHE_STATE_CLEAN) {
goto out;
}
pblk = pbat_off(pbat->params, pbat->zone);
for (n = 0; n < count; ++n) {
pagev[n] = virt_to_page(pbat->buf + n * PBLK_SIZE);
}
ret = pblk_read_wait(pbat->params, pblk, count, pagev);
if (ret) {
goto out;
}
pbat->state = CACHE_STATE_CLEAN;
out:
mutex_unlock(&pbat->lock);
return ret;
}
void
pbat_reset(struct pbat* pbat, u32 zone)
{
BUG_ON(pbat->zone == zone);
pbat->zone = zone;
pbat->state = CACHE_STATE_UNCACHED;
}
u32
pbat_zone(struct pbat* pbat)
{
return pbat->zone;
}
u64
pbat_alloc(struct pbat* pbat)
{
u32 pblk_count = pbat_len(pbat->params) * PBLK_SIZE_BITS;
u64 idx;
u64 pblk;
mutex_lock(&pbat->lock);
idx = cbd_bitmap_alloc(pbat->buf, pblk_count);
if (idx == pblk_count) {
pblk = PBLK_NONE;
goto out;
}
pbat->state = CACHE_STATE_DIRTY;
pblk = idx + zone_data_off(pbat->params, pbat->zone);
out:
mutex_unlock(&pbat->lock);
return pblk;
}
int
pbat_free(struct pbat* pbat, u64 pblk)
{
u32 zone_pblk_count = pbat_len(pbat->params) * PBLK_SIZE_BITS;
u32 zone;
u32 idx;
BUG_ON(pblk < CBD_HEADER_BLOCKS);
zone = (pblk - CBD_HEADER_BLOCKS) / zone_len(pbat->params);
BUG_ON(zone != pbat->zone);
if (pblk < zone_data_off(pbat->params, zone)) {
printk(KERN_ERR "%s: pblk in metadata\n", __func__);
return -EINVAL;
}
idx = pblk - zone_data_off(pbat->params, zone);
BUG_ON(idx >= zone_pblk_count);
mutex_lock(&pbat->lock);
cbd_bitmap_free(pbat->buf, idx);
pbat->state = CACHE_STATE_DIRTY;
mutex_unlock(&pbat->lock);
return 0;
}
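/*
 * Mapping check (made-up numbers): if zone_data_off(params, 0) == 9,
 * then allocating bit 5 of zone 0's bitmap yields pblk 5 + 9 = 14,
 * and pbat_free(pbat, 14) clears bit 14 - 9 = 5 again.
 */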
struct pbatcache {
struct mutex lock;
struct cbd_params* params;
unsigned int len;
struct pbat** cache;
};
size_t
pbatcache_size(void)
{
return sizeof(struct pbatcache);
}
static bool
pbatcache_realloc(struct pbatcache* pc, unsigned int len)
{
struct pbat** cache;
unsigned int n;
struct pbat* pbat;
cache = kzalloc(len * sizeof(struct pbat*), GFP_KERNEL);
if (!cache) {
return false;
}
n = 0;
if (pc->len) {
memcpy(cache, pc->cache, pc->len * sizeof(struct pbat*));
n = pc->len;
kfree(pc->cache);
}
pc->len = len;
pc->cache = cache;
while (n < len) {
pbat = kmalloc(sizeof(struct pbat), GFP_KERNEL);
if (!pbat) {
return false;
}
cache[n++] = pbat;
if (!pbat_ctr(pbat, pc->params)) {
return false;
}
}
return true;
}
bool
pbatcache_ctr(struct pbatcache* pc,
struct cbd_params* params)
{
memset(pc, 0, sizeof(struct pbatcache));
mutex_init(&pc->lock);
pc->params = params;
return pbatcache_realloc(pc, 1);
}
void
pbatcache_dtr(struct pbatcache* pc)
{
unsigned int n;
struct pbat* pbat;
for (n = 0; n < pc->len; ++n) {
pbat = pc->cache[n];
if (!pbat) {
continue;
}
pbat_dtr(pbat);
if (pbat->ref) {
printk(KERN_ERR "%s: pbat ref leak: n=%u ref=%u\n", __func__, n, pbat->ref);
}
kfree(pbat);
}
kfree(pc->cache);
pc->cache = NULL;
pc->len = 0;
pc->params = NULL;
}
struct pbat*
pbatcache_get(struct pbatcache* pc, u32 zone)
{
unsigned int n;
struct pbat* pbat;
mutex_lock(&pc->lock);
for (n = 0; n < pc->len; ++n) {
pbat = pc->cache[n];
mutex_lock(&pbat->reflock);
if (pbat->zone == zone) {
++pbat->ref;
mutex_unlock(&pbat->reflock);
goto out;
}
mutex_unlock(&pbat->reflock);
}
for (n = 0; n < pc->len; ++n) {
pbat = pc->cache[n];
mutex_lock(&pbat->reflock);
if (pbat->zone == ZONE_NONE) {
goto found;
}
mutex_unlock(&pbat->reflock);
}
for (n = 0; n < pc->len; ++n) {
pbat = pc->cache[n];
mutex_lock(&pbat->reflock);
if (pbat->ref == 0) {
goto found;
}
mutex_unlock(&pbat->reflock);
}
printk(KERN_INFO "%s: all objects in use, realloc...\n", __func__);
n = pc->len;
if (!pbatcache_realloc(pc, pc->len * 2)) {
printk(KERN_ERR "%s: realloc failed\n", __func__);
pbat = NULL;
goto out;
}
pbat = pc->cache[n];
mutex_lock(&pbat->reflock);
found:
pbat_reset(pbat, zone);
pbat->ref = 1;
mutex_unlock(&pbat->reflock);
out:
mutex_unlock(&pc->lock);
return pbat;
}
int
pbatcache_put(struct pbatcache* pc, struct pbat* pbat)
{
int ret = 0;
if (!pbat) {
return 0;
}
mutex_lock(&pc->lock);
mutex_lock(&pbat->reflock);
if (--pbat->ref == 0) {
ret = pbat_flush(pbat);
if (ret) {
printk(KERN_ERR "%s: pbat_flush failed\n", __func__);
}
}
mutex_unlock(&pbat->reflock);
mutex_unlock(&pc->lock);
return ret;
}

dm-compress/util.c (new file)

@@ -0,0 +1,206 @@
/*
* Copyright (c) 2019 Tom Marshall <tdm.code@gmail.com>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version 2
* of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
* 02110-1301, USA.
*/
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/bio.h>
#include <linux/device-mapper.h>
#include <linux/workqueue.h>
#include <linux/mutex.h>
#include <linux/lz4.h>
#include <linux/dm-compress.h>
/**************************************
* Core memory management.
**************************************/
struct page*
cbd_alloc_page(void)
{
return alloc_page(GFP_KERNEL);
}
void
cbd_free_page(struct page* page)
{
__free_page(page);
}
struct page*
cbd_alloc_pages(size_t len)
{
return alloc_pages(GFP_KERNEL, get_order(len * PAGE_SIZE));
}
void
cbd_free_pages(struct page* pages, size_t len)
{
__free_pages(pages, get_order(len * PAGE_SIZE));
}
bool
cbd_alloc_pagev(struct page** pagev, size_t len)
{
size_t n;
for (n = 0; n < len; ++n) {
pagev[n] = cbd_alloc_page();
if (!pagev[n]) {
goto err;
}
}
return true;
err:
while (n--) {
cbd_free_page(pagev[n]);
pagev[n] = NULL;
}
return false;
}
void
cbd_free_pagev(struct page** pagev, size_t len)
{
size_t n;
for (n = 0; n < len; ++n) {
cbd_free_page(pagev[n]);
pagev[n] = NULL;
}
}
/**************************************
* Core low-level I/O.
*
* pblk and count are in units of physical blocks (4096 bytes), NOT sectors.
* data is a page address (obtained via __get_free_pages and friends).
**************************************/
static struct bio*
pblk_io_prepare(struct cbd_params* params, unsigned int op,
u64 pblk, u32 count, struct page** pagev)
{
struct bio* bio;
u32 n;
bio = bio_alloc(GFP_KERNEL, count);
if (!bio) {
printk(KERN_ERR "%s: out of memory\n", __func__);
return NULL;
}
bio_set_dev(bio, (struct block_device*)params->priv);
bio->bi_opf = op;
bio->bi_iter.bi_sector = (pblk << (PBLK_SHIFT - SECTOR_SHIFT));
for (n = 0; n < count; ++n) {
if (bio_add_page(bio, pagev[n], PAGE_SIZE, 0) != PAGE_SIZE) {
BUG();
}
}
return bio;
}
int
pblk_read_wait(struct cbd_params* params,
u64 pblk, u32 count, struct page** pagev)
{
int ret;
struct bio* bio;
bio = pblk_io_prepare(params, REQ_OP_READ, pblk, count, pagev);
if (!bio) {
printk(KERN_ERR "%s: out of memory\n", __func__);
return -ENOMEM;
}
ret = submit_bio_wait(bio);
if (ret != 0) {
printk(KERN_ERR "%s: submit_bio_wait failed: %d\n", __func__, ret);
}
bio_put(bio);
return ret;
}
int
pblk_read(struct cbd_params* params,
u64 pblk, u32 count, struct page** pagev,
pblk_endio_t endio, void* endio_priv)
{
struct bio* bio;
bio = pblk_io_prepare(params, REQ_OP_READ, pblk, count, pagev);
if (!bio) {
printk(KERN_ERR "%s: out of memory\n", __func__);
return -ENOMEM;
}
bio->bi_end_io = endio;
bio->bi_private = endio_priv;
/* The bio will be released by the completion callback via pblk_endio(). */
submit_bio(bio);
return 0;
}
void
pblk_write(struct cbd_params* params,
u64 pblk, u32 count, struct page** pagev,
pblk_endio_t endio, void* endio_priv)
{
struct bio* bio;
bio = pblk_io_prepare(params, REQ_OP_WRITE, pblk, count, pagev);
if (!bio) {
printk(KERN_ERR "%s: out of memory\n", __func__);
return;
}
bio->bi_end_io = endio;
bio->bi_private = endio_priv;
if (pblk < CBD_HEADER_BLOCKS) {
printk(KERN_ERR "%s: *** Attempt to write header\n", __func__);
dump_stack();
bio->bi_status = BLK_STS_IOERR;
endio(bio);
return;
}
submit_bio(bio);
}
int
pblk_endio(struct bio* bio)
{
int ret;
ret = blk_status_to_errno(bio->bi_status);
bio_put(bio);
return ret;
}
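/*
 * A sketch of chaining an async pblk_read() to pblk_endio().  The
 * struct my_ctx, my_read_endio() and my_read() names are hypothetical,
 * and <linux/completion.h> is assumed to be available.
 */
struct my_ctx {
	struct completion done;
	int error;
};

static void
my_read_endio(struct bio* bio)
{
	struct my_ctx* ctx = bio->bi_private;

	/* pblk_endio() maps bi_status to an errno and drops the bio ref. */
	ctx->error = pblk_endio(bio);
	complete(&ctx->done);
}

static int
my_read(struct cbd_params* params, u64 pblk, u32 count,
        struct page** pagev, struct my_ctx* ctx)
{
	init_completion(&ctx->done);
	return pblk_read(params, pblk, count, pagev, my_read_endio, ctx);
}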

include/linux/dm-compress.h

View File

@@ -2,19 +2,19 @@
#define _LINUX_DM_COMPRESS_H
#ifndef SECTOR_SHIFT
#define SECTOR_SHIFT 9
#define SECTOR_SHIFT 9
#endif
#ifndef SECTOR_SIZE
#define SECTOR_SIZE (1 << SECTOR_SHIFT)
#define SECTOR_SIZE (1 << SECTOR_SHIFT)
#endif
#define PBLK_SHIFT 12
#define PBLK_SIZE (1 << PBLK_SHIFT)
#define PBLK_SIZE_BITS (PBLK_SIZE * BITS_PER_BYTE)
#define PBLK_SHIFT 12
#define PBLK_SIZE (1 << PBLK_SHIFT)
#define PBLK_SIZE_BITS (PBLK_SIZE * BITS_PER_BYTE)
#define PBLK_PER_SECTOR (1 << (PBLK_SHIFT - SECTOR_SHIFT))
#define LBLK_SHIFT_MIN 1
#define LBLK_SHIFT_MAX (20 - PBLK_SHIFT)
#define LBLK_SHIFT_MIN 1
#define LBLK_SHIFT_MAX (20 - PBLK_SHIFT)
#define CBD_HEADER_BLOCKS 1
@@ -33,12 +33,13 @@ enum cbd_alg {
};
struct cbd_params {
u8 algorithm; /* cbd_alg */
u8 algorithm; /* enum cbd_alg */
u8 compression; /* 0..9 */
u16 lblk_shift;
u64 nr_pblk;
u32 nr_zones;
u32 lblk_per_zone;
void* priv;
};
struct cbd_header {
@@ -48,10 +49,10 @@ struct cbd_header {
struct cbd_params params;
};
struct lbat_elem
struct lba
{
u32 len; /* Compressed length */
u64 pblk[1]; /* Vector of physical blocks */
u32 len; /* Compressed length */
u64 pblk[1]; /* Vector of physical blocks */
};
static inline void
@@ -151,6 +152,36 @@ put64_le(u8** raw, u64 val)
*raw += sizeof(leval);
}
/* XXX: Use kernel bit functions */
static inline u32
cbd_bitmap_alloc(u8* buf, u32 bitsize)
{
u32 off = 0;
u32 bit = 0;
for (off = 0; off < bitsize / BITS_PER_BYTE; ++off) {
if (buf[off] != 0xff) {
while (buf[off] & (1 << bit)) {
++bit;
}
buf[off] |= (1 << bit);
break;
}
}
return off * BITS_PER_BYTE + bit;
}
/* XXX: Use kernel bit functions */
static inline void
cbd_bitmap_free(u8* buf, u32 idx)
{
u32 off = idx / BITS_PER_BYTE;
u32 bit = idx % BITS_PER_BYTE;
buf[off] &= ~(1 << bit);
}
static inline u32
@@ -166,7 +197,7 @@ pbat_len(const struct cbd_params* params)
}
static inline u32
lbat_elem_len(const struct cbd_params* params)
lba_len(const struct cbd_params* params)
{
u32 elem_len_bytes = (params->lblk_shift + PBLK_SHIFT > 16) ? 4 : 2;
u32 elem_pblk_bytes = (params->nr_pblk <= 0xffff ? 2 :
@@ -177,7 +208,7 @@ lbat_elem_len(const struct cbd_params* params)
static inline u32
lbat_len(const struct cbd_params* params)
{
return DIV_ROUND_UP(params->lblk_per_zone * lbat_elem_len(params), PBLK_SIZE);
return DIV_ROUND_UP(params->lblk_per_zone * lba_len(params), PBLK_SIZE);
}
static inline u32
@@ -255,103 +286,242 @@ cbd_header_put(u8* buf, const struct cbd_header* header)
put32_le(&buf, header->params.lblk_per_zone);
}
/*
* XXX:
* nr_bits = pbat_len(params) * PBLK_SIZE;
* bit = find_next_zero_bit_le(buf, nr_bits);
* if (bit < nr_bits) {
* set_bit_le(bit, buf);
* }
* return bit;
*/
static inline u32
cbd_bitmap_alloc(u8* buf, u32 bitsize)
lba_len_get(const struct cbd_params* params, const u8* buf)
{
u32 off = 0;
u32 bit = 0;
for (off = 0; off < bitsize / BITS_PER_BYTE; ++off) {
if (buf[off] != 0xff) {
while (buf[off] & (1 << bit)) {
++bit;
}
buf[off] |= (1 << bit);
break;
}
if (params->lblk_shift + PBLK_SHIFT > 16) {
return get32_le(&buf);
}
else {
return get16_le(&buf);
}
return off * BITS_PER_BYTE + bit;
}
/*
* XXX:
* clear_bit_le(bit, buf);
*/
static inline void
cbd_bitmap_free(u8* buf, u32 idx)
lba_len_put(const struct cbd_params* params, u8* buf, u32 val)
{
u32 off = idx / BITS_PER_BYTE;
u32 bit = idx % BITS_PER_BYTE;
if (params->lblk_shift + PBLK_SHIFT > 16) {
put32_le(&buf, val);
}
else {
put16_le(&buf, val);
}
}
buf[off] &= ~(1 << bit);
static inline u64
lba_pblk_get(const struct cbd_params* params, const u8* buf, u32 idx)
{
const u8* p = buf;
p += (params->lblk_shift + PBLK_SHIFT > 16) ? 4 : 2;
if (params->nr_pblk <= 0xffff) {
p += 2 * idx;
return get16_le(&p);
}
else if (params->nr_pblk <= 0xffffffff) {
p += 4 * idx;
return get32_le(&p);
}
else {
p += 6 * idx;
return get48_le(&p);
}
}
static inline void
lbat_elem_get(const struct cbd_params* params,
const u8* buf, struct lbat_elem* elem)
lba_pblk_put(const struct cbd_params* params, u8* buf, u32 idx, u64 val)
{
u8* p = buf;
p += (params->lblk_shift + PBLK_SHIFT > 16) ? 4 : 2;
if (params->nr_pblk <= 0xffff) {
p += 2 * idx;
put16_le(&p, val);
}
else if (params->nr_pblk <= 0xffffffff) {
p += 4 * idx;
put32_le(&p, val);
}
else {
p += 6 * idx;
put48_le(&p, val);
}
}
static inline void
lba_get(const struct cbd_params* params,
const u8* buf, struct lba* lba)
{
u32 n;
if (params->lblk_shift + PBLK_SHIFT > 16) {
elem->len = get32_le(&buf);
lba->len = get32_le(&buf);
}
else {
elem->len = get16_le(&buf);
lba->len = get16_le(&buf);
}
if (params->nr_pblk <= 0xffff) {
for (n = 0; n < lblk_per_pblk(params); ++n) {
elem->pblk[n] = get16_le(&buf);
lba->pblk[n] = get16_le(&buf);
}
}
else if (params->nr_pblk <= 0xffffffff) {
for (n = 0; n < lblk_per_pblk(params); ++n) {
elem->pblk[n] = get32_le(&buf);
lba->pblk[n] = get32_le(&buf);
}
}
else {
for (n = 0; n < lblk_per_pblk(params); ++n) {
elem->pblk[n] = get48_le(&buf);
lba->pblk[n] = get48_le(&buf);
}
}
}
static inline void
lbat_elem_put(const struct cbd_params* params,
u8* buf, const struct lbat_elem* elem)
lba_put(const struct cbd_params* params,
u8* buf, const struct lba* lba)
{
u32 n;
if (params->lblk_shift + PBLK_SHIFT > 16) {
put32_le(&buf, elem->len);
put32_le(&buf, lba->len);
}
else {
put16_le(&buf, elem->len);
put16_le(&buf, lba->len);
}
if (params->nr_pblk <= 0xffff) {
for (n = 0; n < lblk_per_pblk(params); ++n) {
put16_le(&buf, elem->pblk[n]);
put16_le(&buf, lba->pblk[n]);
}
}
else if (params->nr_pblk <= 0xffffffff) {
for (n = 0; n < lblk_per_pblk(params); ++n) {
put32_le(&buf, elem->pblk[n]);
put32_le(&buf, lba->pblk[n]);
}
}
else {
for (n = 0; n < lblk_per_pblk(params); ++n) {
put48_le(&buf, elem->pblk[n]);
put48_le(&buf, lba->pblk[n]);
}
}
}
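/*
 * A worked size example for the variable-width encoding above,
 * assuming lblk_per_pblk(params) == 1u << lblk_shift.  With
 * lblk_shift == 4, one logical block spans 16 pblks (64 KiB); the len
 * field takes 2 bytes because 4 + PBLK_SHIFT == 16 is not > 16, and
 * with nr_pblk <= 0xffffffff each pblk entry takes 4 bytes, so one
 * on-disk entry occupies lba_len() == 2 + 16 * 4 == 66 bytes.
 */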
#ifdef __KERNEL__
#define ZONE_NONE (u32)(~0)
#define PBLK_NONE (u64)(~0)
#define LBLK_NONE (u64)(~0)
enum cache_state {
CACHE_STATE_UNCACHED,
CACHE_STATE_CLEAN,
CACHE_STATE_DIRTY,
CACHE_STATE_MAX
};
typedef void (*pblk_endio_t)(struct bio*);
/* Single page allocator */
struct page*
cbd_alloc_page(void);
void cbd_free_page(struct page* page);
/* Multiple page allocator */
struct page*
cbd_alloc_pages(size_t len);
void cbd_free_pages(struct page* pages, size_t len);
/* Vector page allocator */
bool cbd_alloc_pagev(struct page** pagev, size_t len);
void cbd_free_pagev(struct page** pagev, size_t len);
int pblk_read_wait(struct cbd_params* params,
u64 pblk, u32 count, struct page** pagev);
int pblk_read(struct cbd_params* params,
u64 pblk, u32 count, struct page** pagev,
pblk_endio_t endio, void* endio_priv);
void pblk_write(struct cbd_params* params,
u64 pblk, u32 count, struct page** pagev,
pblk_endio_t endio, void* endio_priv);
int pblk_endio(struct bio* bio);
struct pbat;
bool pbat_ctr(struct pbat* pbat,
struct cbd_params* params);
void pbat_dtr(struct pbat* pbat);
int pbat_flush(struct pbat* pbat);
int pbat_read(struct pbat* pbat);
void pbat_reset(struct pbat* pbat, u32 zone);
u32 pbat_zone(struct pbat* pbat);
u64 pbat_alloc(struct pbat* pbat);
int pbat_free(struct pbat* pbat, u64 pblk);
struct pbatcache;
size_t pbatcache_size(void);
bool pbatcache_ctr(struct pbatcache* pbatcache,
struct cbd_params* params);
void pbatcache_dtr(struct pbatcache* pbatcache);
struct pbat*
pbatcache_get(struct pbatcache* pbatcache, u32 zone);
int pbatcache_put(struct pbatcache* pbatcache, struct pbat* pbat);
struct lbatpage;
bool lbatpage_ctr(struct lbatpage* lp, struct cbd_params* params);
void lbatpage_dtr(struct lbatpage* lp);
int lbatpage_flush(struct lbatpage* lp);
int lbatpage_read(struct lbatpage* lp);
void lbatpage_reset(struct lbatpage* lp, u64 pblk);
u8* lbatpage_get_buf(struct lbatpage* lp, bool rw);
void lbatpage_put_buf(struct lbatpage* lp);
struct lbatpagecache;
size_t lbatpagecache_size(void);
bool lbatpagecache_ctr(struct lbatpagecache* lpc,
struct cbd_params* params);
void lbatpagecache_dtr(struct lbatpagecache* lpc);
struct lbatpage*
lbatpagecache_get(struct lbatpagecache* lpc, u64 pblk);
int lbatpagecache_put(struct lbatpagecache* lpc, struct lbatpage* lpi);
struct lbatview;
bool lbatview_ctr(struct lbatview* lv,
struct cbd_params* params,
struct pbatcache* pbatcache,
struct lbatpagecache* lpc);
void lbatview_dtr(struct lbatview* lv);
int lbatview_flush(struct lbatview* lv);
int lbatview_read(struct lbatview* lv);
bool lbatview_reset(struct lbatview* lv, u64 pblk, u32 count);
int lbatview_elem_realloc(struct lbatview* lv, u64 lblk, u32 len);
u32 lbatview_elem_len(struct lbatview* lv, u64 lblk);
u64 lbatview_elem_pblk(struct lbatview* lv, u64 lblk, u32 idx);
struct lbatviewcache;
size_t lbatviewcache_size(void);
bool lbatviewcache_ctr(struct lbatviewcache* lvc,
struct cbd_params* params);
void lbatviewcache_dtr(struct lbatviewcache* lvc);
struct lbatview*
lbatviewcache_get(struct lbatviewcache* lvc, u64 lblk);
int lbatviewcache_put(struct lbatviewcache* lvc, struct lbatview* lbv);
struct lbd;
bool lbd_ctr(struct lbd* lbd,
struct cbd_params* params,
struct lbatviewcache* lvc);
void lbd_dtr(struct lbd* lbd);
int lbd_flush(struct lbd* lbd);
int lbd_read(struct lbd* lbd);
bool lbd_reset(struct lbd* lbd, u64 lblk);
void lbd_data_read(struct lbd* lbd, u32 off, u32 len, u8* buf);
void lbd_data_write(struct lbd* lbd, u32 off, u32 len, const u8* buf);
struct lbdcache;
size_t lbdcache_size(void);
bool lbdcache_ctr(struct lbdcache* lc,
struct cbd_params* params);
void lbdcache_dtr(struct lbdcache* lc);
struct lbd*
lbdcache_get(struct lbdcache* lc, u64 lblk);
int lbdcache_put(struct lbdcache* lc, struct lbd* lbd);
#endif
#endif /* _LINUX_DM_COMPRESS_H */
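The declarations above all follow the same ctr/dtr/get/read/flush/put
shape.  The sketch below shows how a top-level read might thread through
the lbd layer; my_compress_read() is a hypothetical helper, and it
assumes lbdcache_get() only takes a reference (so lbd_read() must still
be called) and that lbdcache_put() returns a flush status:

static int
my_compress_read(struct lbdcache* lc, u64 lblk, u32 off, u32 len, u8* buf)
{
	struct lbd* lbd;
	int ret;

	lbd = lbdcache_get(lc, lblk);		/* take a cache reference */
	if (!lbd)
		return -ENOMEM;
	ret = lbd_read(lbd);			/* make the logical block resident */
	if (ret) {
		lbdcache_put(lc, lbd);
		return ret;
	}
	lbd_data_read(lbd, off, len, buf);	/* copy out of the cached lblk */
	return lbdcache_put(lc, lbd);		/* drop the ref; may flush */
}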

View File

@@ -63,45 +63,69 @@ check_one_lblk(const struct cbd_params* params,
const struct zone_metadata* zm,
u8** pblk_used)
{
struct lbat_elem* elem;
u8* elem_buf;
struct lba* lba;
u8* lba_buf;
u32 c_len;
u32 n;
u64 pblk;
u32 rel_pblk;
elem = calloc(1, offsetof(struct lbat_elem, pblk[lblk_per_pblk(params)]));
elem_buf = zm->lbat + lblk * lbat_elem_len(params);
lbat_elem_get(params, elem_buf, elem);
printf(" lblk[%u]: len=%u\n", lblk, elem->len);
lba = calloc(1, offsetof(struct lba, pblk[lblk_per_pblk(params)]));
lba_buf = zm->lbat + lblk * lba_len(params);
lba_get(params, lba_buf, lba);
if (lba->len) {
if (lba->len == CBD_UNCOMPRESSED) {
printf(" lblk[%u]: UNCOMPRESSED\n", lblk);
}
else {
printf(" lblk[%u]: len=%u\n", lblk, lba->len);
}
}
if (lba->len > PBLK_SIZE * lblk_per_pblk(params)) {
printf(" Length out of bounds\n");
return;
}
c_len = (lba->len == CBD_UNCOMPRESSED) ? PBLK_SIZE * lblk_per_pblk(params) : lba->len;
for (n = 0; n < lblk_per_pblk(params); ++n) {
pblk = elem->pblk[n];
if (elem->len > PBLK_SIZE * n) {
/* XXX: allow out-of-zone allocs for v1.1 */
if (pblk < zone_data_off(params, zone) || pblk >= zone_off(params, zone + 1)) {
printf("Alloc out of bounds for zone %u block %u index %u: %lu\n",
(unsigned int)zone, lblk, n,
(unsigned long)pblk);
pblk = lba->pblk[n];
if (c_len > PBLK_SIZE * n) {
u32 pblk_zone;
u32 rel_pblk;
if (pblk < CBD_HEADER_BLOCKS) {
printf(" [%u] :E: Alloc in header: %lu\n", n, pblk);
continue;
}
rel_pblk = pblk - zone_data_off(params, zone);
printf(" [%u] pblk=%lu rel_pblk=%u\n", n, (unsigned long)pblk, rel_pblk);
if (pblk_used[zone][rel_pblk/8] & (1 << (rel_pblk % 8))) {
printf("Duplicate allocation for zone %u block %u\n",
(unsigned int)zone, (unsigned int)rel_pblk);
pblk_zone = (pblk - CBD_HEADER_BLOCKS) / zone_len(params);
if (pblk_zone >= params->nr_zones) {
printf(" [%u] :E: Alloc beyond end: %lu\n", n, pblk);
continue;
}
pblk_used[zone][rel_pblk/8] |= (1 << (rel_pblk % 8));
if (pblk < zone_data_off(params, pblk_zone)) {
printf(" [%u] :E: Alloc in metadata: %lu\n", n, pblk);
continue;
}
rel_pblk = pblk - zone_data_off(params, pblk_zone);
/* XXX: Cannot happen? */
if (rel_pblk >= pbat_len(params) * PBLK_SIZE_BITS) {
printf(" [%u] :E: Alloc out of zone: %lu\n", n, pblk);
continue;
}
printf(" [%u] pblk=%lu pblk_zone=%u rel_pblk=%u\n", n,
(unsigned long)pblk, pblk_zone, rel_pblk);
if (pblk_used[pblk_zone][rel_pblk/8] & (1 << (rel_pblk % 8))) {
printf(" [%u] :E: Duplicate allocation for zone %u block %u\n",
n, pblk_zone, rel_pblk);
continue;
}
pblk_used[pblk_zone][rel_pblk/8] |= (1 << (rel_pblk % 8));
}
else {
if (pblk) {
printf("Unexpected pblk alloc for zone %u block %u index %u: %lu\n",
(unsigned int)zone, lblk, n,
(unsigned long)pblk);
printf(" [%u] :E: Unexpected pblk alloc: %lu\n", n, pblk);
}
}
}
free(elem);
free(lba);
}
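/*
 * Hypothetical numbers to make the recovery arithmetic above concrete:
 * with CBD_HEADER_BLOCKS == 1 and zone_len(params) == 1024, a data
 * block at pblk 2050 yields pblk_zone = (2050 - 1) / 1024 = 2, and
 * rel_pblk is its offset past that zone's metadata,
 * pblk - zone_data_off(params, 2).
 */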
static void
@@ -110,12 +134,29 @@ check_one_zone(const struct cbd_params* params,
const struct zone_metadata* zm,
u8** pblk_used)
{
u32 lblk_alloc_len;
u32 n;
bool zone_empty;
u32 lblk;
printf("Zone %u: alloc [%lu .. %lu]\n",
printf("Zone %u: lbat=[%lu..%lu] alloc=[%lu .. %lu]\n",
(unsigned int)zone,
(unsigned long)zone_off(params, zone),
(unsigned long)(zone_data_off(params, zone) - 1),
(unsigned long)zone_data_off(params, zone),
(unsigned long)zone_off(params, zone + 1));
zone_empty = true;
lblk_alloc_len = params->lblk_per_zone * lba_len(params);
for (n = 0; n < lblk_alloc_len; ++n) {
if (zm->lbat[n]) {
zone_empty = false;
break;
}
}
if (zone_empty) {
printf(" [empty]\n");
return;
}
for (lblk = 0; lblk < params->lblk_per_zone; ++lblk) {
check_one_lblk(params, zone, lblk, zm, pblk_used);
}
@@ -130,7 +171,7 @@ check_zone_metadata(const struct cbd_params* params,
pblk_used = calloc(params->nr_zones, sizeof(void*));
for (zone = 0; zone < params->nr_zones; ++zone) {
pblk_used[zone] = calloc(1, pbat_len(params));
pblk_used[zone] = calloc(pbat_len(params), PBLK_SIZE);
}
for (zone = 0; zone < params->nr_zones; ++zone) {
@@ -164,7 +205,7 @@ cbd_check(const char* dev,
zmvec = calloc(header.params.nr_zones, sizeof(struct zone_metadata));
for (zone = 0; zone < header.params.nr_zones; ++zone) {
zmvec[zone].pbat = calloc(1, PBLK_SIZE);
zmvec[zone].pbat = calloc(pbat_len(&header.params), PBLK_SIZE);
pblk_read(devfd,
pbat_off(&header.params, zone),
pbat_len(&header.params),