/*
 * Copyright (c) 2019 Tom Marshall
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
 * 02110-1301, USA.
 */

#include
#include
#include
#include
#include
#include
#include
#include
#include
#include

struct lbd {
	u64 lblk;

	struct mutex reflock;
	unsigned int ref;

	struct mutex lock;
	enum cache_state state;
	struct cbd_params* params;
	struct lbatviewcache* lvc;
	struct lbatview* lv;
	void* percpu;
	struct page* pages;
	u8* buf;
};

/*
 * Allocating lz4_wrkmem percpu:
 *
 * If the alloc is per-instance, it would need to be allocated in compress.c
 * and passed around. The easiest way to pass it around is likely to make it
 * part of a struct. We can't use struct compress because that is private.
 * So we would need to create a struct (say, compress_percpu).
 *
 * If the alloc is global, we can just declare it file-local. But it would
 * need to be the largest possible size, which means we probably don't want
 * to use alloc_percpu_gfp() directly, because 1mb chunks are probably not
 * that common. So suppose we allocate a percpu vector of page ptrs:
 *
 *   #define COMPRESS_MAX_INPUT_SIZE (1 << LBLK_SHIFT_MAX)
 *   #define COMPRESS_LZ4_BOUND LZ4_COMPRESSBOUND(COMPRESS_MAX_INPUT_SIZE)
 *   #define WRKMEM_PAGES DIV_ROUND_UP(COMPRESS_LZ4_BOUND, PAGE_SIZE)
 *   typedef struct page* lz4_wrkmem_pagev_t[WRKMEM_PAGES];
 *
 *   g_lz4_wrkmem = alloc_percpu_gfp(lz4_wrkmem_pagev_t, GFP_IO);
 *
 * That's not bad at all. But how do we alloc (and free) the actual pages?
 *
 *   pagev = get_cpu_var(g_lz4_wrkmem);
 *   put_cpu_var(pagev);
 *
 *   free_percpu(g_lz4_wrkmem);
 */
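/*
 * A minimal sketch of one possible answer to that question, assuming the
 * hypothetical g_lz4_wrkmem, lz4_wrkmem_pagev_t and WRKMEM_PAGES names from
 * the notes above (none of which exist in this file): allocate the percpu
 * page vector, then walk every possible cpu and populate it with individually
 * allocated pages, freeing along the same path on teardown.
 *
 *   static lz4_wrkmem_pagev_t __percpu* g_lz4_wrkmem;
 *
 *   static int lz4_wrkmem_init(void)
 *   {
 *       int cpu, i;
 *       struct page** pagev;
 *
 *       g_lz4_wrkmem = alloc_percpu_gfp(lz4_wrkmem_pagev_t, GFP_IO);
 *       if (!g_lz4_wrkmem)
 *           return -ENOMEM;
 *       for_each_possible_cpu(cpu) {
 *           pagev = (struct page**)per_cpu_ptr(g_lz4_wrkmem, cpu);
 *           for (i = 0; i < WRKMEM_PAGES; ++i) {
 *               pagev[i] = alloc_page(GFP_KERNEL);
 *               if (!pagev[i])
 *                   return -ENOMEM;    // caller unwinds via lz4_wrkmem_exit()
 *           }
 *       }
 *       return 0;
 *   }
 *
 *   static void lz4_wrkmem_exit(void)
 *   {
 *       int cpu, i;
 *       struct page** pagev;
 *
 *       for_each_possible_cpu(cpu) {
 *           pagev = (struct page**)per_cpu_ptr(g_lz4_wrkmem, cpu);
 *           for (i = 0; i < WRKMEM_PAGES; ++i)
 *               if (pagev[i])
 *                   __free_page(pagev[i]);
 *       }
 *       free_percpu(g_lz4_wrkmem);
 *   }
 *
 * (The code below takes a related but simpler approach: a percpu pointer to a
 * kzalloc'd lblk_compress_state, filled in by lbdcache_alloc_compress_state().)
 */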
static inline bool lblk_is_zeros(struct cbd_params* params, struct lbd* lbd)
{
#ifdef CBD_DETECT_ZERO_BLOCKS
	u32 off;
	u32 len = PBLK_SIZE * lblk_per_pblk(params);

	for (off = 0; off < len; ++off) {
		if (lbd->buf[off]) {
			return false;
		}
	}
	return true;
#else
	return false;
#endif
}

struct lblk_compress_state {
	struct page* pages;
	u8* buf;
#ifdef COMPRESS_HAVE_LZ4
	u8* lz4_workmem;
#endif
#ifdef COMPRESS_HAVE_ZLIB
	z_stream zlib_cstream;
	z_stream zlib_dstream;
#endif
};

static struct lblk_compress_state*
lblk_get_compress_state(void* percpu, const struct cbd_params* params, int cpu)
{
	struct lblk_compress_state** statep;

	statep = per_cpu_ptr(percpu, cpu);
	return *statep;
}

#ifdef COMPRESS_HAVE_LZ4
static size_t lblk_compress_lz4(struct lbd* lbd)
{
	int clen;
	int cpu;
	struct lblk_compress_state* state;

	cpu = get_cpu();
	state = lblk_get_compress_state(lbd->percpu, lbd->params, cpu);
	if (!state) {
		put_cpu();
		return 0;
	}
	clen = LZ4_compress_fast(lbd->buf, state->buf,
				 PBLK_SIZE * lblk_per_pblk(lbd->params),
				 PBLK_SIZE * (lblk_per_pblk(lbd->params) - 1),
				 lbd->params->compression,
				 state->lz4_workmem);
	if (clen <= 0) {
		put_cpu();
		return 0;
	}
	memcpy(lbd->buf, state->buf, clen);
	put_cpu();
	return (size_t)clen;
}

static bool lblk_decompress_lz4(struct lbd* lbd, u32 clen)
{
	int ret;
	int cpu;
	struct lblk_compress_state* state;
	u32 dlen = PBLK_SIZE * lblk_per_pblk(lbd->params);

	cpu = get_cpu();
	state = lblk_get_compress_state(lbd->percpu, lbd->params, cpu);
	if (!state) {
		put_cpu();
		return false;
	}
	ret = LZ4_decompress_safe(lbd->buf, state->buf, clen, dlen);
	if (ret != dlen) {
		put_cpu();
		return false;
	}
	memcpy(lbd->buf, state->buf, dlen);
	put_cpu();
	return true;
}
#endif

#ifdef COMPRESS_HAVE_ZLIB
static size_t lblk_compress_zlib(struct lbd* lbd)
{
	int ret;
	int cpu;
	struct lblk_compress_state* state;
	z_stream* stream;

	cpu = get_cpu();
	state = lblk_get_compress_state(lbd->percpu, lbd->params, cpu);
	if (!state) {
		put_cpu();
		return 0;
	}
	stream = &state->zlib_cstream;
	ret = zlib_deflateReset(stream);
	BUG_ON(ret != Z_OK);
	stream->next_in = lbd->buf;
	stream->avail_in = PBLK_SIZE * lblk_per_pblk(lbd->params);
	stream->next_out = state->buf;
	stream->avail_out = PBLK_SIZE * (lblk_per_pblk(lbd->params) - 1);
	ret = zlib_deflate(stream, Z_FINISH);
	if (ret != Z_STREAM_END) {
		put_cpu();
		return 0;
	}
	memcpy(lbd->buf, state->buf, stream->total_out);
	put_cpu();
	return stream->total_out;
}

static bool lblk_decompress_zlib(struct lbd* lbd, u32 clen)
{
	int ret;
	int cpu;
	struct lblk_compress_state* state;
	z_stream* stream;
	u32 dlen = PBLK_SIZE * lblk_per_pblk(lbd->params);

	cpu = get_cpu();
	state = lblk_get_compress_state(lbd->percpu, lbd->params, cpu);
	if (!state) {
		put_cpu();
		return false;
	}
	stream = &state->zlib_dstream;
	ret = zlib_inflateReset(stream);
	BUG_ON(ret != Z_OK);
	stream->next_in = lbd->buf;
	stream->avail_in = clen;
	stream->next_out = state->buf;
	stream->avail_out = dlen;
	ret = zlib_inflate(stream, Z_SYNC_FLUSH);
	/* See xxx */
	if (ret == Z_OK && !stream->avail_in && stream->avail_out) {
		/*
		 * All input was consumed but the stream has not signalled
		 * completion: feed a single dummy byte with Z_FINISH so that
		 * Z_STREAM_END and total_out can be validated below.
		 */
		u8 zerostuff = 0;

		stream->next_in = &zerostuff;
		stream->avail_in = 1;
		ret = zlib_inflate(stream, Z_FINISH);
	}
	if (ret != Z_STREAM_END || stream->total_out != dlen) {
		put_cpu();
		return false;
	}
	memcpy(lbd->buf, state->buf, dlen);
	put_cpu();
	return true;
}
#endif
/*
 * Compress lbd->buf using the configured algorithm.
 *
 * Returns the number of compressed bytes (now in lbd->buf), or 0 for failure.
 */
static size_t lblk_compress(struct lbd* lbd)
{
#ifdef COMPRESS_HAVE_LZ4
	if (lbd->params->algorithm == CBD_ALG_LZ4) {
		return lblk_compress_lz4(lbd);
	}
#endif
#ifdef COMPRESS_HAVE_ZLIB
	if (lbd->params->algorithm == CBD_ALG_ZLIB) {
		return lblk_compress_zlib(lbd);
	}
#endif
	return 0;
}

/*
 * Decompress the clen compressed bytes in lbd->buf in place.
 *
 * Returns true for success, false for failure.
 */
static bool lblk_decompress(struct lbd* lbd, u32 clen)
{
#ifdef COMPRESS_HAVE_LZ4
	if (lbd->params->algorithm == CBD_ALG_LZ4) {
		return lblk_decompress_lz4(lbd, clen);
	}
#endif
#ifdef COMPRESS_HAVE_ZLIB
	if (lbd->params->algorithm == CBD_ALG_ZLIB) {
		return lblk_decompress_zlib(lbd, clen);
	}
#endif
	return false;
}

static bool lbd_ctr(struct lbd* lbd,
		    struct cbd_params* params,
		    struct lbatviewcache* lvc,
		    void* percpu)
{
	memset(lbd, 0, sizeof(struct lbd));
	lbd->lblk = LBLK_NONE;
	mutex_init(&lbd->reflock);
	lbd->ref = 0;
	mutex_init(&lbd->lock);
	lbd->state = CACHE_STATE_UNCACHED;
	lbd->params = params;
	lbd->lvc = lvc;
	lbd->lv = NULL;
	lbd->percpu = percpu;
	lbd->pages = cbd_alloc_pages(lblk_per_pblk(lbd->params));
	if (!lbd->pages) {
		return false;
	}
	lbd->buf = page_address(lbd->pages);

	return true;
}

static void lbd_dtr(struct lbd* lbd)
{
	if (lbatviewcache_put(lbd->lvc, lbd->lv) != 0) {
		printk(KERN_ERR "%s: lbatviewcache_put failed\n", __func__);
	}
	lbd->buf = NULL;
	cbd_free_pages(lbd->pages, lblk_per_pblk(lbd->params));
	lbd->pages = NULL;
	lbd->percpu = NULL;
	lbd->lv = NULL;
	lbd->lvc = NULL;
}

static void lbd_flush_endio(struct bio* bio)
{
	struct lbd* lbd = bio->bi_private;
	int ret;

	cbd_free_page(bio->bi_io_vec[0].bv_page);
	ret = pblk_endio(bio);
	if (ret) {
		printk(KERN_ERR "%s: I/O failed\n", __func__);
		lbd->state = CACHE_STATE_ERROR;
	}
}

static int lbd_flush(struct lbd* lbd)
{
	int ret = 0;
	u32 c_len;
	u32 elem_len;
	u8* p;
	u32 n;
	u64 pblk;
	struct page* iopagev[1];

	mutex_lock(&lbd->lock);
	if (lbd->state != CACHE_STATE_DIRTY) {
		if (lbd->state == CACHE_STATE_ERROR) {
			ret = -EIO;
			goto out;
		}
		goto clean;
	}
	if (lblk_is_zeros(lbd->params, lbd)) {
		c_len = 0;
		elem_len = 0;
	} else {
		c_len = lblk_compress(lbd);
		if (c_len > 0) {
			size_t c_blkrem = c_len % PBLK_SIZE;

			if (c_blkrem) {
				/* Zero-pad the tail of the last pblk. */
				memset(lbd->buf + c_len, 0,
				       PBLK_SIZE - c_blkrem);
			}
			elem_len = c_len;
		} else {
			c_len = PBLK_SIZE * lblk_per_pblk(lbd->params);
			elem_len = CBD_UNCOMPRESSED;
		}
	}
	ret = lbatview_elem_realloc(lbd->lv, lbd->lblk, elem_len);
	if (ret) {
		goto out;
	}
	p = lbd->buf;
	for (n = 0; n * PBLK_SIZE < c_len; ++n, p += PBLK_SIZE) {
		pblk = lbatview_elem_pblk(lbd->lv, lbd->lblk, n);
		if (pblk == PBLK_NONE) {
			ret = -EIO;
			goto out;
		}
		iopagev[0] = cbd_alloc_page();
		if (!iopagev[0]) {
			printk(KERN_ERR "%s: out of memory\n", __func__);
			ret = -ENOMEM;
			goto out;
		}
		memcpy(page_address(iopagev[0]), p, PBLK_SIZE);
		pblk_write(lbd->params, pblk, 1, iopagev, lbd_flush_endio, lbd);
	}

clean:
	ret = lbatviewcache_put(lbd->lvc, lbd->lv);
	lbd->lv = NULL;
	if (ret) {
		lbd->state = CACHE_STATE_ERROR;
		goto out;
	}
	lbd->state = CACHE_STATE_CLEAN;

out:
	mutex_unlock(&lbd->lock);
	return ret;
}

static bool lbd_reset(struct lbd* lbd, u64 lblk)
{
	if (lbd->lv) {
		printk(KERN_ERR "%s: lbatview leak\n", __func__);
	}
	lbd->lv = lbatviewcache_get(lbd->lvc, lblk);
	if (!lbd->lv) {
		printk(KERN_ERR "%s: lbatviewcache_get failed\n", __func__);
		return false;
	}
	lbd->lblk = lblk;
	lbd->state = CACHE_STATE_UNCACHED;

	return true;
}
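/*
 * Element length encoding, as used by lbd_flush() above and lbd_read() below:
 *
 *   0                - the logical block is all zeros; no data pblks are
 *                      written and lbd_read() simply zero-fills the buffer.
 *   CBD_UNCOMPRESSED - the data did not compress; lblk_per_pblk() pblks hold
 *                      the raw data.
 *   anything else    - the compressed byte count; the data occupies
 *                      ceil(len / PBLK_SIZE) pblks, with the tail of the
 *                      last pblk zero-padded.
 */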
int lbd_read(struct lbd* lbd)
{
	int ret = 0;
	u32 c_len;
	u64 pblk;
	struct page* iopagev[1];

	mutex_lock(&lbd->lock);
	if (lbd->state != CACHE_STATE_UNCACHED) {
		goto out;
	}
	ret = lbatview_read(lbd->lv);
	if (ret) {
		goto out;
	}
	c_len = lbatview_elem_len(lbd->lv, lbd->lblk);
	if (c_len == 0) {
		memset(lbd->buf, 0, PBLK_SIZE * lblk_per_pblk(lbd->params));
	} else {
		bool is_compressed = true;
		u32 d_len = PBLK_SIZE * lblk_per_pblk(lbd->params);
		u32 n;
		u8* p;

		if (c_len == CBD_UNCOMPRESSED) {
			is_compressed = false;
			c_len = d_len;
		}
		p = lbd->buf;
		for (n = 0; n * PBLK_SIZE < c_len; ++n, p += PBLK_SIZE) {
			pblk = lbatview_elem_pblk(lbd->lv, lbd->lblk, n);
			if (pblk == PBLK_NONE) {
				ret = -EIO;
				goto out;
			}
			/* XXX: check pblk not in metadata? */
			iopagev[0] = virt_to_page(p);
			ret = pblk_read_wait(lbd->params, pblk, 1, iopagev);
			if (ret) {
				goto out;
			}
		}
		if (is_compressed) {
			if (!lblk_decompress(lbd, c_len)) {
				printk(KERN_ERR "%s: decompress failed\n",
				       __func__);
				ret = -EIO;
				goto out;
			}
		}
	}
	lbd->state = CACHE_STATE_CLEAN;

out:
	mutex_unlock(&lbd->lock);
	return ret;
}

void lbd_data_read(struct lbd* lbd, u32 off, u32 len, u8* buf)
{
	/* XXX: convert to BUG_ON */
	if (off + len > PBLK_SIZE * lblk_per_pblk(lbd->params)) {
		printk(KERN_ERR "%s: out of bounds\n", __func__);
		return;
	}
	mutex_lock(&lbd->lock);
	BUG_ON(lbd->state == CACHE_STATE_UNCACHED);
	memcpy(buf, lbd->buf + off, len);
	mutex_unlock(&lbd->lock);
}

void lbd_data_write(struct lbd* lbd, u32 off, u32 len, const u8* buf)
{
	/* XXX: convert to BUG_ON */
	if (off + len > PBLK_SIZE * lblk_per_pblk(lbd->params)) {
		printk(KERN_ERR "%s: out of bounds\n", __func__);
		return;
	}
	mutex_lock(&lbd->lock);
	BUG_ON(lbd->state == CACHE_STATE_UNCACHED);
	memcpy(lbd->buf + off, buf, len);
	lbd->state = CACHE_STATE_DIRTY;
	mutex_unlock(&lbd->lock);
}

struct lbdcache {
	struct mutex lock;
	struct cbd_params* params;
	void* percpu;
	struct lbatviewcache* lvc;
	unsigned int len;
	struct lbd** cache;
};

size_t lbdcache_size(void)
{
	return sizeof(struct lbdcache);
}

static bool lbdcache_realloc(struct lbdcache* lc, unsigned int len)
{
	struct lbd** cache;
	unsigned int n;
	struct lbd* lbd;

	cache = kzalloc(len * sizeof(struct lbd*), GFP_KERNEL);
	if (!cache) {
		return false;
	}
	n = 0;
	if (lc->len) {
		memcpy(cache, lc->cache, lc->len * sizeof(struct lbd*));
		n = lc->len;
		kfree(lc->cache);
	}
	lc->len = len;
	lc->cache = cache;
	while (n < len) {
		lbd = kmalloc(sizeof(struct lbd), GFP_KERNEL);
		if (!lbd) {
			return false;
		}
		cache[n++] = lbd;
		if (!lbd_ctr(lbd, lc->params, lc->lvc, lc->percpu)) {
			return false;
		}
	}

	return true;
}
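/*
 * Per-cpu compression state. lbdcache_ctr() below allocates one
 * lblk_compress_state for every online cpu; lblk_get_compress_state() then
 * resolves the current cpu's state via per_cpu_ptr(), so the compress and
 * decompress paths never allocate memory while handling I/O.
 */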
static bool lbdcache_alloc_compress_state(void* percpu,
					  const struct cbd_params* params,
					  int cpu)
{
	struct lblk_compress_state* state;
	struct lblk_compress_state** statep;
	size_t workmem_len;
#ifdef COMPRESS_HAVE_ZLIB
	int ret;
#endif

	state = kzalloc(sizeof(struct lblk_compress_state), GFP_NOWAIT);
	if (!state) {
		printk(KERN_ERR "%s: failed to alloc state\n", __func__);
		return false;
	}
	statep = per_cpu_ptr(percpu, cpu);
	*statep = state;
	state->pages = cbd_alloc_pages_nowait(lblk_per_pblk(params));
	if (!state->pages) {
		return false;
	}
	state->buf = page_address(state->pages);
#ifdef COMPRESS_HAVE_LZ4
	workmem_len = LZ4_compressBound(PBLK_SIZE * lblk_per_pblk(params));
	state->lz4_workmem = kzalloc(workmem_len, GFP_NOWAIT);
	if (!state->lz4_workmem) {
		return false;
	}
#endif
#ifdef COMPRESS_HAVE_ZLIB
	workmem_len = zlib_deflate_workspacesize(MAX_WBITS, DEF_MEM_LEVEL);
	state->zlib_cstream.workspace = kzalloc(workmem_len, GFP_NOWAIT);
	if (!state->zlib_cstream.workspace) {
		return false;
	}
	ret = zlib_deflateInit2(&state->zlib_cstream, params->compression,
				Z_DEFLATED, MAX_WBITS, DEF_MEM_LEVEL,
				Z_DEFAULT_STRATEGY);
	BUG_ON(ret != Z_OK);
	workmem_len = zlib_inflate_workspacesize();
	state->zlib_dstream.workspace = kzalloc(workmem_len, GFP_NOWAIT);
	if (!state->zlib_dstream.workspace) {
		return false;
	}
	ret = zlib_inflateInit2(&state->zlib_dstream, DEF_WBITS);
	BUG_ON(ret != Z_OK);
#endif

	return true;
}

static void lbdcache_free_compress_state(void* percpu,
					 const struct cbd_params* params,
					 int cpu)
{
	struct lblk_compress_state** statep;
	struct lblk_compress_state* state;

	statep = per_cpu_ptr(percpu, cpu);
	state = *statep;
	if (!state) {
		return;
	}
#ifdef COMPRESS_HAVE_ZLIB
	kfree(state->zlib_dstream.workspace);
	kfree(state->zlib_cstream.workspace);
#endif
#ifdef COMPRESS_HAVE_LZ4
	kfree(state->lz4_workmem);
#endif
	cbd_free_pages(state->pages, lblk_per_pblk(params));
	kfree(state);
}

bool lbdcache_ctr(struct lbdcache* lc, struct cbd_params* params)
{
	int cpu;

	memset(lc, 0, sizeof(struct lbdcache));
	mutex_init(&lc->lock);
	lc->params = params;
	lc->percpu = alloc_percpu(void*);
	if (!lc->percpu) {
		return false;
	}
	for (cpu = 0; cpu < num_online_cpus(); ++cpu) {
		if (!lbdcache_alloc_compress_state(lc->percpu, params, cpu)) {
			return false;
		}
	}
	lc->lvc = kzalloc(lbatviewcache_size(), GFP_KERNEL);
	if (!lc->lvc) {
		return false;
	}
	if (!lbatviewcache_ctr(lc->lvc, params)) {
		return false;
	}

	return lbdcache_realloc(lc, 1024);
}

void lbdcache_dtr(struct lbdcache* lc)
{
	unsigned int n;
	struct lbd* lbd;
	int cpu;

	for (n = 0; n < lc->len; ++n) {
		lbd = lc->cache[n];
		if (!lbd) {
			continue;
		}
		lbd_dtr(lbd);
		if (lbd->ref) {
			printk(KERN_ERR "%s: lbd ref leak: n=%u ref=%u\n",
			       __func__, n, lbd->ref);
		}
		kfree(lbd);
	}
	kfree(lc->cache);
	lc->cache = NULL;
	lc->len = 0;
	lbatviewcache_dtr(lc->lvc);
	kfree(lc->lvc);
	lc->lvc = NULL;
	for (cpu = 0; cpu < num_online_cpus(); ++cpu) {
		lbdcache_free_compress_state(lc->percpu, lc->params, cpu);
	}
	free_percpu(lc->percpu);
	lc->percpu = NULL;
	lc->params = NULL;
}

struct lbd* lbdcache_get(struct lbdcache* lc, u64 lblk)
{
	unsigned int n;
	struct lbd* lbd;

	mutex_lock(&lc->lock);
	for (n = 0; n < lc->len; ++n) {
		lbd = lc->cache[n];
		mutex_lock(&lbd->reflock);
		if (lbd->lblk == lblk) {
			if (lbd->ref == 0) {
				goto found;
			}
			++lbd->ref;
			mutex_unlock(&lbd->reflock);
			goto out;
		}
		mutex_unlock(&lbd->reflock);
	}
	for (n = 0; n < lc->len; ++n) {
		lbd = lc->cache[n];
		mutex_lock(&lbd->reflock);
		if (lbd->lblk == LBLK_NONE) {
			goto found;
		}
		mutex_unlock(&lbd->reflock);
	}
	for (n = 0; n < lc->len; ++n) {
		lbd = lc->cache[n];
		mutex_lock(&lbd->reflock);
		if (lbd->ref == 0 && lbd->state != CACHE_STATE_ERROR) {
			goto found;
		}
		mutex_unlock(&lbd->reflock);
	}
	printk(KERN_INFO "%s: all objects in use, realloc...\n", __func__);
	n = lc->len;
	if (!lbdcache_realloc(lc, lc->len * 2)) {
		printk(KERN_ERR "%s: realloc failed\n", __func__);
		lbd = NULL;
		goto out;
	}
	printk(KERN_INFO "%s: realloc done, using n=%u\n", __func__, n);
	lbd = lc->cache[n];
	mutex_lock(&lbd->reflock);

found:
	if (!lbd_reset(lbd, lblk)) {
		mutex_unlock(&lbd->reflock);
		printk(KERN_ERR "%s: lbd_reset failed\n", __func__);
		lbd = NULL;
		goto out;
	}
	lbd->ref = 1;
	mutex_unlock(&lbd->reflock);

out:
	mutex_unlock(&lc->lock);
	return lbd;
}

int lbdcache_put(struct lbdcache* lc, struct lbd* lbd)
{
	int ret = 0;

	if (!lbd) {
		return 0;
	}
	mutex_lock(&lc->lock);
	mutex_lock(&lbd->reflock);
	if (--lbd->ref == 0) {
		ret = lbd_flush(lbd);
		if (ret) {
			printk(KERN_ERR "%s: lbd_flush failed\n", __func__);
		}
	}
	mutex_unlock(&lbd->reflock);
	mutex_unlock(&lc->lock);

	return ret;
}
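/*
 * Typical caller-side usage, a minimal sketch. The names `lc`, `lblk`, `off`,
 * `len` and `data` are illustrative only; `lc` is assumed to have been set up
 * with lbdcache_ctr().
 *
 *   struct lbd* lbd;
 *   int ret, ret2;
 *
 *   lbd = lbdcache_get(lc, lblk);
 *   if (!lbd)
 *       return -ENOMEM;
 *   ret = lbd_read(lbd);                      // no-op if already cached
 *   if (!ret)
 *       lbd_data_write(lbd, off, len, data);  // marks the lbd dirty
 *   ret2 = lbdcache_put(lc, lbd);             // final put flushes dirty data
 *   return ret ? ret : ret2;
 */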