qemu-devel
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [Qemu-devel] [PATCH] VMDK: add monolithic flat image support


From: Kevin Wolf
Subject: Re: [Qemu-devel] [PATCH] VMDK: add monolithic flat image support
Date: Mon, 30 May 2011 13:41:55 +0200
User-agent: Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.2.17) Gecko/20110428 Fedora/3.1.10-1.fc15 Thunderbird/3.1.10

Am 30.05.2011 09:49, schrieb Fam Zheng:
> VMDK multiple file images can not be recognized for now. This patch is
> adding monolithic flat support to it, that is the image type with two
> files, one text descriptor file and a plain data file. This type of
> image can be created in VMWare, with the options "allocate all disk
> space now" and "store virtual disk as a single file" checked.
> 
> A VmdkExtent structure is introduced to hold the image "extent"
> information, which makes further adding multi extents support of VMDK
> easy. An image creating option "flat" is added for creating flat
> (preallocated) image.
> 
> Signed-off-by: Feiran (Fam) Zheng <address@hidden>
> ---
>  block/vmdk.c |  686 
> +++++++++++++++++++++++++++++++++++++++++++++-------------
>  block_int.h  |    2 +
>  qemu-img.c   |   12 +-
>  3 files changed, 542 insertions(+), 158 deletions(-)
> 
> diff --git a/block/vmdk.c b/block/vmdk.c
> index 8fc9d67..726ad3a 100644
> --- a/block/vmdk.c
> +++ b/block/vmdk.c
> @@ -30,6 +30,8 @@
>  #define VMDK3_MAGIC (('C' << 24) | ('O' << 16) | ('W' << 8) | 'D')
>  #define VMDK4_MAGIC (('K' << 24) | ('D' << 16) | ('M' << 8) | 'V')
> 
> +#define VMDK_FLAT_BACKING 0

Is there any specific reason why it's useful to enable the corresponding
code only conditionally? Generally no code that is #ifdefed out should
be committed.

> +
>  typedef struct {
>      uint32_t version;
>      uint32_t flags;
> @@ -60,7 +62,10 @@ typedef struct {
> 
>  #define L2_CACHE_SIZE 16
> 
> -typedef struct BDRVVmdkState {
> +typedef struct VmdkExtent {
> +    BlockDriverState *file;
> +    int flat;

bool?

And just to make sure I understand the terminology: flat means !sparse,
i.e. doesn't have L1/L2 tables?

> +    int64_t sectors;
>      int64_t l1_table_offset;
>      int64_t l1_backup_table_offset;
>      uint32_t *l1_table;
> @@ -73,8 +78,15 @@ typedef struct BDRVVmdkState {
>      uint32_t l2_cache_offsets[L2_CACHE_SIZE];
>      uint32_t l2_cache_counts[L2_CACHE_SIZE];
> 
> -    unsigned int cluster_sectors;
>      uint32_t parent_cid;
> +    unsigned int cluster_sectors;
> +} VmdkExtent;
> +
> +typedef struct BDRVVmdkState {
> +    int has_descriptor_file;

bool

> +    int extent_size;

Is this the array size of extents? If so, num_extents would be a better
name.

> +    int cid_update;
> +    VmdkExtent *extents;
>  } BDRVVmdkState;
> 
>  typedef struct VmdkMetaData {
> @@ -88,21 +100,30 @@ typedef struct VmdkMetaData {
>  static int vmdk_probe(const uint8_t *buf, int buf_size, const char *filename)
>  {
>      uint32_t magic;
> -
> +    char *cid_p, *ct_p, *extent_p;
> +    char cid_str[] = "CID";
> +    char ct_str[] = "createType";
> +    char extent_str[] = "RW";

Don't use variables for this but just use the literal strings below.

>      if (buf_size < 4)
>          return 0;
>      magic = be32_to_cpu(*(uint32_t *)buf);
>      if (magic == VMDK3_MAGIC ||
> -        magic == VMDK4_MAGIC)
> +        magic == VMDK4_MAGIC) {
>          return 100;
> -    else
> +    } else {
> +        cid_p = strstr((char *)buf, cid_str);
> +        ct_p = strstr((char *)buf, ct_str);
> +        extent_p = strstr((char *)buf, extent_str);
> +        if (cid_p && ct_p && extent_p)
> +            return 100;
> +    }
>          return 0;

The return 0; isn't correctly indented any more.

>  }
> 
>  #define CHECK_CID 1
> 
>  #define SECTOR_SIZE 512
> -#define DESC_SIZE 20*SECTOR_SIZE     // 20 sectors of 512 bytes each
> +#define DESC_SIZE (20 * SECTOR_SIZE)    // 20 sectors of 512 bytes each
>  #define HEADER_SIZE 512                      // first sector of 512 bytes
> 
>  static uint32_t vmdk_read_cid(BlockDriverState *bs, int parent)
> @@ -111,11 +132,11 @@ static uint32_t vmdk_read_cid(BlockDriverState
> *bs, int parent)
>      uint32_t cid;
>      const char *p_name, *cid_str;
>      size_t cid_str_size;
> +    BDRVVmdkState *s = bs->opaque;
> +    int desc_offset = s->has_descriptor_file ? 0 : 0x200;
> 
> -    /* the descriptor offset = 0x200 */
> -    if (bdrv_pread(bs->file, 0x200, desc, DESC_SIZE) != DESC_SIZE)
> +    if (bdrv_pread(bs->file, desc_offset, desc, DESC_SIZE) != DESC_SIZE)
>          return 0;
> -
>      if (parent) {
>          cid_str = "parentCID";
>          cid_str_size = sizeof("parentCID");
> @@ -128,19 +149,52 @@ static uint32_t vmdk_read_cid(BlockDriverState
> *bs, int parent)
>          p_name += cid_str_size;
>          sscanf(p_name,"%x",&cid);
>      }
> -
>      return cid;
>  }
> 
> +#ifdef _WIN32
> +static int64_t get_file_size(const char *filename)
> +{
> +    typedef DWORD (WINAPI * get_compressed_t)
> +                    (const char *filename, DWORD *high);
> +    get_compressed_t get_compressed;
> +    struct _stati64 st;
> +
> +    /* WinNT support GetCompressedFileSize to determine allocate size */
> +    get_compressed = (get_compressed_t)
> +        GetProcAddress(GetModuleHandle("kernel32"), 
> "GetCompressedFileSizeA");
> +    if (get_compressed) {
> +        DWORD high, low;
> +        low = get_compressed(filename, &high);
> +        if (low != 0xFFFFFFFFlu || GetLastError() == NO_ERROR)
> +        return (((int64_t) high) << 32) + low;
> +    }
> +
> +    if (_stati64(filename, &st) < 0)
> +        return -1;
> +    return st.st_size;
> +}
> +#else
> +static int64_t get_file_size(const char *filename)
> +{
> +    struct stat st;
> +    if (stat(filename, &st) < 0)
> +        return -1;
> +    return st.st_size;
> +}
> +#endif

You can't assume that the backend is a file, it might as well be an NBD
connection or whatever. Also there should be no reason for having
platform dependent code in a VMDK driver.

Use bdrv_getlength() instead.

> +
>  static int vmdk_write_cid(BlockDriverState *bs, uint32_t cid)
>  {
>      char desc[DESC_SIZE], tmp_desc[DESC_SIZE];
>      char *p_name, *tmp_str;
> -
> -    /* the descriptor offset = 0x200 */
> -    if (bdrv_pread(bs->file, 0x200, desc, DESC_SIZE) != DESC_SIZE)
> +    BDRVVmdkState * s = bs->opaque;
> +    int desc_offset = s->has_descriptor_file ? 0 : 0x200;
> +    int desc_size = s->has_descriptor_file ?
> +        get_file_size(bs->file->filename) : DESC_SIZE;
> +    memset(desc, 0, DESC_SIZE);
> +    if (bdrv_pread(bs->file, desc_offset, desc, desc_size) != desc_size)
>          return -1;
> -
>      tmp_str = strstr(desc,"parentCID");
>      pstrcpy(tmp_desc, sizeof(tmp_desc), tmp_str);
>      if ((p_name = strstr(desc,"CID")) != NULL) {
> @@ -149,21 +203,20 @@ static int vmdk_write_cid(BlockDriverState *bs,
> uint32_t cid)
>          pstrcat(desc, sizeof(desc), tmp_desc);
>      }
> 
> -    if (bdrv_pwrite_sync(bs->file, 0x200, desc, DESC_SIZE) < 0)
> +    if (bdrv_pwrite_sync(bs->file, desc_offset, desc, desc_size) < 0)
>          return -1;
>      return 0;
>  }
> 
> -static int vmdk_is_cid_valid(BlockDriverState *bs)
> +static int vmdk_is_cid_valid(BlockDriverState *bs, VmdkExtent *extent)
>  {
>  #ifdef CHECK_CID
> -    BDRVVmdkState *s = bs->opaque;
>      BlockDriverState *p_bs = bs->backing_hd;
>      uint32_t cur_pcid;
> 
>      if (p_bs) {
> -        cur_pcid = vmdk_read_cid(p_bs,0);
> -        if (s->parent_cid != cur_pcid)
> +        cur_pcid = vmdk_read_cid(p_bs, 0);
> +        if (extent->parent_cid != cur_pcid)
>              // CID not valid
>              return 0;
>      }
> @@ -198,7 +251,9 @@ static int vmdk_snapshot_create(const char
> *filename, const char *backing_file)
>      "#DDB\n"
>      "\n";
> 
> -    snp_fd = open(filename, O_RDWR | O_CREAT | O_TRUNC | O_BINARY |
> O_LARGEFILE, 0644);
> +    snp_fd = open(filename,
> +                O_RDWR | O_CREAT | O_TRUNC | O_BINARY | O_LARGEFILE,
> +                0644);

This hunk looks like it's not required for monolithic flat support.

>      if (snp_fd < 0)
>          return -errno;
>      p_fd = open(backing_file, O_RDONLY | O_BINARY | O_LARGEFILE);
> @@ -277,7 +332,8 @@ static int vmdk_snapshot_create(const char
> *filename, const char *backing_file)
>       * Each GDE span 32M disk, means:
>       * 512 GTE per GT, each GTE points to grain
>       */
> -    gt_size = (int64_t)header.num_gtes_per_gte * header.granularity *
> SECTOR_SIZE;
> +    gt_size =
> +        (int64_t)header.num_gtes_per_gte * header.granularity * SECTOR_SIZE;

Same here.

>      if (!gt_size) {
>          ret = -EINVAL;
>          goto fail;
> @@ -338,9 +394,9 @@ static int vmdk_parent_open(BlockDriverState *bs)
>  {
>      char *p_name;
>      char desc[DESC_SIZE];
> -
> -    /* the descriptor offset = 0x200 */
> -    if (bdrv_pread(bs->file, 0x200, desc, DESC_SIZE) != DESC_SIZE)
> +    BDRVVmdkState *s = bs->opaque;
> +    int desc_offset = s->has_descriptor_file ? 0 : 0x200;
> +    if (bdrv_pread(bs->file, desc_offset, desc, DESC_SIZE) != DESC_SIZE)
>          return -1;

Please add braces.

> 
>      if ((p_name = strstr(desc,"parentFileNameHint")) != NULL) {
> @@ -358,106 +414,242 @@ static int vmdk_parent_open(BlockDriverState *bs)
>      return 0;
>  }
> 
> +/* find an option value out of descriptor file */
> +static int vmdk_parse_description(const char *desc, const char *opt_name,
> +        char *buf, int buf_size)
> +{
> +    char *opt_pos = strstr(desc, opt_name);
> +    int r;
> +    if (!opt_pos) return -1;

if (!opt_pos) {
    return -1;
}

> +    opt_pos += strlen(opt_name) + 2;
> +    r = sscanf(opt_pos, "%[^\"]s", buf);
> +    assert(r <= buf_size);
> +    return r <= 0;
> +}
> +
> +static int vmdk_parse_extents(const char *desc, VmdkExtent extents[],
> +        const char *desc_file_path)
> +{
> +    const char *p = desc;
> +    int ret = 0;
> +    int r;
> +    while (*p) {
> +        if (strncmp(p, "RW", strlen("RW")) == 0) {
> +            /* parse extent line:
> +             * RW [size in sectors] FLAT "file-name.vmdk" OFFSET
> +             * or
> +             * RW [size in sectors] SPARSE "file-name.vmdk"
> +             */
> +            char access[11] = "";
> +            int64_t sectors = 0;
> +            char type[11] = "";
> +            char fname[512] = "";
> +            sscanf(p, "%10s %lld %10s \"%[^\"]512s\"",
> +                    access, &sectors, type, fname);
> +            if (!(strlen(access) && sectors && strlen(type)
> +                        && strlen(fname)))
> +                goto cont;
> +            if (strcmp(type, "FLAT") && strcmp(type, "SPARSE")) goto cont;
> +            if (strcmp(access, "RW")) goto cont;
> +
> +            ret++;
> +            if (!extents) goto cont;
> +
> +            /* save to extents array */
> +            if (!strcmp(type, "FLAT")) {
> +                /* FLAT extent */
> +                char extent_path[1024];
> +                path_combine(extent_path, sizeof(extent_path),
> +                        desc_file_path, fname);
> +                VmdkExtent *ext = &extents[ret - 1];
> +                ext->flat = 1;
> +                ext->sectors = sectors;
> +                ext->cluster_sectors = sectors;
> +                ext->file = bdrv_new("");
> +                if (!ext->file) return -1;
> +                r = bdrv_open(ext->file,
> +                    extent_path,
> +                    BDRV_O_RDWR | BDRV_O_NO_BACKING,
> +                    NULL);
> +                if (r) return -1;
> +            } else {
> +                /* SPARSE extent, should not be here */
> +                assert(0 && "not supported");
> +            }
> +
> +        }
> +cont:
> +        /* move to next line */
> +        while (*p && *p != '\n') p++;
> +        p++;
> +
> +    }
> +    return ret;
> +}

Please check the whole function for conformance with CODING_STYLE.

> +
>  static int vmdk_open(BlockDriverState *bs, int flags)
>  {
>      BDRVVmdkState *s = bs->opaque;
>      uint32_t magic;
>      int l1_size, i;
> -
> +    VmdkExtent *extent = NULL;
>      if (bdrv_pread(bs->file, 0, &magic, sizeof(magic)) != sizeof(magic))
>          goto fail;
> 
>      magic = be32_to_cpu(magic);
>      if (magic == VMDK3_MAGIC) {
>          VMDK3Header header;
> -
> -        if (bdrv_pread(bs->file, sizeof(magic), &header,
> sizeof(header)) != sizeof(header))
> +        s->extents = qemu_mallocz(sizeof(VmdkExtent));
> +        s->extent_size = 1;
> +        if (bdrv_pread(bs->file, sizeof(magic), &header, sizeof(header))
> +            != sizeof(header))
>              goto fail;
> -        s->cluster_sectors = le32_to_cpu(header.granularity);
> -        s->l2_size = 1 << 9;
> -        s->l1_size = 1 << 6;
> -        bs->total_sectors = le32_to_cpu(header.disk_sectors);
> -        s->l1_table_offset = le32_to_cpu(header.l1dir_offset) << 9;
> -        s->l1_backup_table_offset = 0;
> -        s->l1_entry_sectors = s->l2_size * s->cluster_sectors;
> +        extent = s->extents;
> +        extent->flat = 0;
> +        extent->file = bs->file;
> +        extent->cluster_sectors = le32_to_cpu(header.granularity);
> +        extent->l2_size = 1 << 9;
> +        extent->l1_size = 1 << 6;
> +        extent->sectors = le32_to_cpu(header.disk_sectors);
> +        extent->l1_table_offset = le32_to_cpu(header.l1dir_offset) << 9;
> +        extent->l1_backup_table_offset = 0;
> +        extent->l1_entry_sectors = extent->l2_size * extent->cluster_sectors;
>      } else if (magic == VMDK4_MAGIC) {
>          VMDK4Header header;
> -
> -        if (bdrv_pread(bs->file, sizeof(magic), &header,
> sizeof(header)) != sizeof(header))
> +        s->extents = qemu_mallocz(sizeof(VmdkExtent));
> +        s->extent_size = 1;
> +        if (bdrv_pread(bs->file, sizeof(magic), &header, sizeof(header))
> +            != sizeof(header))
>              goto fail;
> -        bs->total_sectors = le64_to_cpu(header.capacity);
> -        s->cluster_sectors = le64_to_cpu(header.granularity);
> -        s->l2_size = le32_to_cpu(header.num_gtes_per_gte);
> -        s->l1_entry_sectors = s->l2_size * s->cluster_sectors;
> -        if (s->l1_entry_sectors <= 0)
> +        extent = s->extents;
> +        extent->file = bs->file;
> +        extent->sectors = le64_to_cpu(header.capacity);
> +        extent->cluster_sectors = le64_to_cpu(header.granularity);
> +        extent->l2_size = le32_to_cpu(header.num_gtes_per_gte);
> +        extent->l1_entry_sectors = extent->l2_size * extent->cluster_sectors;
> +        if (extent->l1_entry_sectors <= 0)
>              goto fail;
> -        s->l1_size = (bs->total_sectors + s->l1_entry_sectors - 1)
> -            / s->l1_entry_sectors;
> -        s->l1_table_offset = le64_to_cpu(header.rgd_offset) << 9;
> -        s->l1_backup_table_offset = le64_to_cpu(header.gd_offset) << 9;
> +        extent->l1_size = (extent->sectors + extent->l1_entry_sectors - 1)
> +            / extent->l1_entry_sectors;
> +        extent->l1_table_offset = le64_to_cpu(header.rgd_offset) << 9;
> +        extent->l1_backup_table_offset = le64_to_cpu(header.gd_offset) << 9;
> 
>          // try to open parent images, if exist
>          if (vmdk_parent_open(bs) != 0)
>              goto fail;
>          // write the CID once after the image creation
> -        s->parent_cid = vmdk_read_cid(bs,1);
> +        extent->parent_cid = vmdk_read_cid(bs,1);
>      } else {
> +        char buf[2048];
> +        char ct[128];
> +        if (bdrv_pread(bs->file, 0, buf, sizeof(buf)) == 0)
> +            goto fail;
> +        if (0 != vmdk_parse_description(buf, "createType", ct, sizeof(ct)))
> +            goto fail;
> +        if (0 != strcmp(ct, "monolithicFlat"))
> +            goto fail;
> +        bs->disk_size += get_file_size(bs->file->filename);
> +        s->has_descriptor_file = 1;
> +        s->extent_size = vmdk_parse_extents(buf, NULL, NULL);
> +        if  (!s->extent_size)
> +            goto fail;
> +        s->extents = qemu_mallocz(s->extent_size * sizeof(VmdkExtent));
> +        extent = s->extents;
> +        vmdk_parse_extents(buf, s->extents, bs->file->filename);
> +        bs->total_sectors = 0;
> +
> +        // try to open parent images, if exist
> +        if (vmdk_parent_open(bs) != 0)
>          goto fail;
> +        // write the CID once after the image creation
> +        extent->parent_cid = vmdk_read_cid(bs,1);
> +
> +        for (i = 0; i < s->extent_size; i++)
> +        {
> +            bs->disk_size += get_file_size(s->extents[i].file->filename);
> +            bs->total_sectors += s->extents[i].sectors;
> +        }
> +        return 0;
> +    }

Would it make sense to split out opening the different image subformats
in separate functions?

> +
> +    bs->total_sectors = 0;
> +    for (i = 0; i < s->extent_size; i++)
> +    {
> +        bs->disk_size += get_file_size(s->extents[i].file->filename);
> +        bs->total_sectors += s->extents[i].sectors;
>      }
> 
>      /* read the L1 table */
> -    l1_size = s->l1_size * sizeof(uint32_t);
> -    s->l1_table = qemu_malloc(l1_size);
> -    if (bdrv_pread(bs->file, s->l1_table_offset, s->l1_table,
> l1_size) != l1_size)
> +    l1_size = extent->l1_size * sizeof(uint32_t);
> +    extent->l1_table = qemu_malloc(l1_size);
> +    if (bdrv_pread(bs->file,
> +            extent->l1_table_offset,
> +            extent->l1_table,
> +            l1_size)
> +        != l1_size)
>          goto fail;
> -    for(i = 0; i < s->l1_size; i++) {
> -        le32_to_cpus(&s->l1_table[i]);
> +    for(i = 0; i < extent->l1_size; i++) {
> +        le32_to_cpus(&extent->l1_table[i]);
>      }
> 
> -    if (s->l1_backup_table_offset) {
> -        s->l1_backup_table = qemu_malloc(l1_size);
> -        if (bdrv_pread(bs->file, s->l1_backup_table_offset,
> s->l1_backup_table, l1_size) != l1_size)
> +    if (extent->l1_backup_table_offset) {
> +        extent->l1_backup_table = qemu_malloc(l1_size);
> +        if (bdrv_pread(bs->file,
> +                extent->l1_backup_table_offset,
> +                extent->l1_backup_table,
> +                l1_size)
> +            != l1_size)
>              goto fail;
> -        for(i = 0; i < s->l1_size; i++) {
> -            le32_to_cpus(&s->l1_backup_table[i]);
> +        for(i = 0; i < extent->l1_size; i++) {
> +            le32_to_cpus(&extent->l1_backup_table[i]);
>          }
>      }
> 
> -    s->l2_cache = qemu_malloc(s->l2_size * L2_CACHE_SIZE * sizeof(uint32_t));
> +    extent->l2_cache =
> +        qemu_malloc(extent->l2_size * L2_CACHE_SIZE * sizeof(uint32_t));
>      return 0;
>   fail:
> -    qemu_free(s->l1_backup_table);
> -    qemu_free(s->l1_table);
> -    qemu_free(s->l2_cache);
> +    if (extent) {
> +        qemu_free(extent->l1_backup_table);
> +        qemu_free(extent->l1_table);
> +        qemu_free(extent->l2_cache);
> +    }
>      return -1;
>  }
> 
> -static uint64_t get_cluster_offset(BlockDriverState *bs, VmdkMetaData 
> *m_data,
> -                                   uint64_t offset, int allocate);
> -
> -static int get_whole_cluster(BlockDriverState *bs, uint64_t cluster_offset,
> -                             uint64_t offset, int allocate)
> +static uint64_t get_cluster_offset(BlockDriverState *bs,
> +                    VmdkExtent *extent,
> +                    VmdkMetaData *m_data,
> +                    uint64_t offset,
> +                    int allocate);
> +
> +static int get_whole_cluster(BlockDriverState *bs,
> +                VmdkExtent *extent,
> +                uint64_t cluster_offset,
> +                uint64_t offset,
> +                int allocate)
>  {
> -    BDRVVmdkState *s = bs->opaque;
> -    uint8_t  whole_grain[s->cluster_sectors*512];        // 128
> sectors * 512 bytes each = grain size 64KB
> +    // 128 sectors * 512 bytes each = grain size 64KB
> +    uint8_t  whole_grain[extent->cluster_sectors * 512];
> 
>      // we will be here if it's first write on non-exist grain(cluster).
>      // try to read from parent image, if exist
>      if (bs->backing_hd) {
>          int ret;
> 
> -        if (!vmdk_is_cid_valid(bs))
> +        if (!vmdk_is_cid_valid(bs, extent))
>              return -1;
> -
> +        // floor offset to cluster
> +        offset -= offset % (extent->cluster_sectors * 512);

Hm, is this really related to monolithic flat?

>          ret = bdrv_read(bs->backing_hd, offset >> 9, whole_grain,
> -            s->cluster_sectors);
> +                extent->cluster_sectors);
>          if (ret < 0) {
>              return -1;
>          }
> 
>          //Write grain only into the active image
> -        ret = bdrv_write(bs->file, cluster_offset, whole_grain,
> -            s->cluster_sectors);
> +        ret = bdrv_write(extent->file, cluster_offset, whole_grain,
> +                extent->cluster_sectors);
>          if (ret < 0) {
>              return -1;
>          }
> @@ -465,52 +657,62 @@ static int get_whole_cluster(BlockDriverState
> *bs, uint64_t cluster_offset,
>      return 0;
>  }
> 
> -static int vmdk_L2update(BlockDriverState *bs, VmdkMetaData *m_data)
> +static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data)
>  {
> -    BDRVVmdkState *s = bs->opaque;
> -
> +    if (extent->flat) return -1;
>      /* update L2 table */
> -    if (bdrv_pwrite_sync(bs->file, ((int64_t)m_data->l2_offset * 512)
> + (m_data->l2_index * sizeof(m_data->offset)),
> -                    &(m_data->offset), sizeof(m_data->offset)) < 0)
> +    if (bdrv_pwrite_sync(
> +            extent->file,
> +            ((int64_t)m_data->l2_offset * 512)
> +                + (m_data->l2_index * sizeof(m_data->offset)),
> +            &(m_data->offset),
> +            sizeof(m_data->offset)
> +        ) < 0)
>          return -1;
>      /* update backup L2 table */
> -    if (s->l1_backup_table_offset != 0) {
> -        m_data->l2_offset = s->l1_backup_table[m_data->l1_index];
> -        if (bdrv_pwrite_sync(bs->file, ((int64_t)m_data->l2_offset *
> 512) + (m_data->l2_index * sizeof(m_data->offset)),
> -                        &(m_data->offset), sizeof(m_data->offset)) < 0)
> +    if (extent->l1_backup_table_offset != 0) {
> +        m_data->l2_offset = extent->l1_backup_table[m_data->l1_index];
> +        if (bdrv_pwrite_sync(
> +                extent->file,
> +                ((int64_t)m_data->l2_offset * 512)
> +                    + (m_data->l2_index * sizeof(m_data->offset)),
> +                &(m_data->offset), sizeof(m_data->offset)
> +            ) < 0)
>              return -1;
>      }
> 
>      return 0;
>  }
> 
> -static uint64_t get_cluster_offset(BlockDriverState *bs, VmdkMetaData 
> *m_data,
> +static uint64_t get_cluster_offset(BlockDriverState *bs,
> +                                    VmdkExtent *extent,
> +                                    VmdkMetaData *m_data,
>                                     uint64_t offset, int allocate)
>  {
> -    BDRVVmdkState *s = bs->opaque;
>      unsigned int l1_index, l2_offset, l2_index;
>      int min_index, i, j;
>      uint32_t min_count, *l2_table, tmp = 0;
>      uint64_t cluster_offset;
> 
> +    if (extent->flat) return 0;
>      if (m_data)
>          m_data->valid = 0;
> 
> -    l1_index = (offset >> 9) / s->l1_entry_sectors;
> -    if (l1_index >= s->l1_size)
> +    l1_index = (offset >> 9) / extent->l1_entry_sectors;
> +    if (l1_index >= extent->l1_size)
>          return 0;
> -    l2_offset = s->l1_table[l1_index];
> +    l2_offset = extent->l1_table[l1_index];
>      if (!l2_offset)
>          return 0;
>      for(i = 0; i < L2_CACHE_SIZE; i++) {
> -        if (l2_offset == s->l2_cache_offsets[i]) {
> +        if (l2_offset == extent->l2_cache_offsets[i]) {
>              /* increment the hit count */
> -            if (++s->l2_cache_counts[i] == 0xffffffff) {
> +            if (++extent->l2_cache_counts[i] == 0xffffffff) {
>                  for(j = 0; j < L2_CACHE_SIZE; j++) {
> -                    s->l2_cache_counts[j] >>= 1;
> +                    extent->l2_cache_counts[j] >>= 1;
>                  }
>              }
> -            l2_table = s->l2_cache + (i * s->l2_size);
> +            l2_table = extent->l2_cache + (i * extent->l2_size);
>              goto found;
>          }
>      }
> @@ -518,20 +720,24 @@ static uint64_t
> get_cluster_offset(BlockDriverState *bs, VmdkMetaData *m_data,
>      min_index = 0;
>      min_count = 0xffffffff;
>      for(i = 0; i < L2_CACHE_SIZE; i++) {
> -        if (s->l2_cache_counts[i] < min_count) {
> -            min_count = s->l2_cache_counts[i];
> +        if (extent->l2_cache_counts[i] < min_count) {
> +            min_count = extent->l2_cache_counts[i];
>              min_index = i;
>          }
>      }
> -    l2_table = s->l2_cache + (min_index * s->l2_size);
> -    if (bdrv_pread(bs->file, (int64_t)l2_offset * 512, l2_table,
> s->l2_size * sizeof(uint32_t)) !=
> -
>   s->l2_size * sizeof(uint32_t))
> +    l2_table = extent->l2_cache + (min_index * extent->l2_size);
> +    if (bdrv_pread(
> +            extent->file,
> +            (int64_t)l2_offset * 512,
> +            l2_table,
> +            extent->l2_size * sizeof(uint32_t)
> +        ) != extent->l2_size * sizeof(uint32_t))
>          return 0;
> 
> -    s->l2_cache_offsets[min_index] = l2_offset;
> -    s->l2_cache_counts[min_index] = 1;
> +    extent->l2_cache_offsets[min_index] = l2_offset;
> +    extent->l2_cache_counts[min_index] = 1;
>   found:
> -    l2_index = ((offset >> 9) / s->cluster_sectors) % s->l2_size;
> +    l2_index = ((offset >> 9) / extent->cluster_sectors) % extent->l2_size;
>      cluster_offset = le32_to_cpu(l2_table[l2_index]);
> 
>      if (!cluster_offset) {
> @@ -539,8 +745,11 @@ static uint64_t
> get_cluster_offset(BlockDriverState *bs, VmdkMetaData *m_data,
>              return 0;
> 
>          // Avoid the L2 tables update for the images that have snapshots.
> -        cluster_offset = bdrv_getlength(bs->file);
> -        bdrv_truncate(bs->file, cluster_offset + (s->cluster_sectors << 9));
> +        cluster_offset = bdrv_getlength(extent->file);
> +        bdrv_truncate(
> +            extent->file,
> +            cluster_offset + (extent->cluster_sectors << 9)
> +        );
> 
>          cluster_offset >>= 9;
>          tmp = cpu_to_le32(cluster_offset);
> @@ -551,7 +760,8 @@ static uint64_t
> get_cluster_offset(BlockDriverState *bs, VmdkMetaData *m_data,
>           * This problem may occur because of insufficient space on host disk
>           * or inappropriate VM shutdown.
>           */
> -        if (get_whole_cluster(bs, cluster_offset, offset, allocate) == -1)
> +        if (get_whole_cluster(
> +                bs, extent, cluster_offset, offset, allocate) == -1)
>              return 0;
> 
>          if (m_data) {
> @@ -570,35 +780,68 @@ static int vmdk_is_allocated(BlockDriverState
> *bs, int64_t sector_num,
>                               int nb_sectors, int *pnum)
>  {
>      BDRVVmdkState *s = bs->opaque;
> -    int index_in_cluster, n;
> -    uint64_t cluster_offset;
> -
> -    cluster_offset = get_cluster_offset(bs, NULL, sector_num << 9, 0);
> -    index_in_cluster = sector_num % s->cluster_sectors;
> -    n = s->cluster_sectors - index_in_cluster;
> +    int ext_idx = 0;
> +    int index_in_cluster, n, ret;
> +    uint64_t offset;
> +    VmdkExtent *extent;

> +    while (ext_idx < s->extent_size
> +            && sector_num >= s->extents[ext_idx].sectors) {
> +        sector_num -= s->extents[ext_idx].sectors;
> +        ext_idx++;
> +    }

This looks like a pattern that you'll need in more places. Probably
makes sense to turn it into a function.

> +    if (ext_idx == s->extent_size) return 0;
> +    extent = &s->extents[ext_idx];
> +    if (s->extents[ext_idx].flat) {
> +        n = extent->sectors - sector_num;
> +        ret = 1;
> +    } else {
> +        offset = get_cluster_offset(bs, extent, NULL, sector_num * 512, 0);
> +        index_in_cluster = sector_num % extent->cluster_sectors;
> +        n = extent->cluster_sectors - index_in_cluster;
> +        ret = offset ? 1 : 0;
> +    }
>      if (n > nb_sectors)
>          n = nb_sectors;
>      *pnum = n;
> -    return (cluster_offset != 0);
> +    return ret;
>  }
> 
>  static int vmdk_read(BlockDriverState *bs, int64_t sector_num,
>                      uint8_t *buf, int nb_sectors)
>  {
>      BDRVVmdkState *s = bs->opaque;
> -    int index_in_cluster, n, ret;
> +    int ext_idx = 0;
> +    int ret;
> +    uint64_t n, index_in_cluster;
> +    VmdkExtent *extent;
>      uint64_t cluster_offset;
> 
>      while (nb_sectors > 0) {
> -        cluster_offset = get_cluster_offset(bs, NULL, sector_num << 9, 0);
> -        index_in_cluster = sector_num % s->cluster_sectors;
> -        n = s->cluster_sectors - index_in_cluster;
> +        while (ext_idx < s->extent_size
> +                && sector_num >= s->extents[ext_idx].sectors) {
> +            sector_num -= s->extents[ext_idx].sectors;
> +            ext_idx++;
> +        }
> +        if (ext_idx == s->extent_size) {
> +            return -1;
> +        }
> +        extent = &s->extents[ext_idx];
> +        if (extent->flat)
> +            cluster_offset = 0;
> +        else
> +            cluster_offset = get_cluster_offset(
> +                                bs, extent, NULL, sector_num << 9, 0);

I think you should move this if into get_cluster_offset.

> +
> +        cluster_offset =
> +            get_cluster_offset(bs, extent, NULL, sector_num << 9, 0);

Huh, so you call get_cluster_offset twice?

> +        index_in_cluster = sector_num % extent->cluster_sectors;
> +        n = extent->cluster_sectors - index_in_cluster;
>          if (n > nb_sectors)
>              n = nb_sectors;
> -        if (!cluster_offset) {
> +        if (!extent->flat && !cluster_offset) {

Maybe this shows that cluster_offset == 0 for an unallocated cluster
isn't a good convention any more as you add more subformats where a
cluster_offset of 0 is perfectly valid. We could consider changing this
in another patch.

>              // try to read from parent image, if exist
>              if (bs->backing_hd) {
> -                if (!vmdk_is_cid_valid(bs))
> +                if (!vmdk_is_cid_valid(bs, extent))
>                      return -1;
>                  ret = bdrv_read(bs->backing_hd, sector_num, buf, n);
>                  if (ret < 0)
> @@ -607,7 +850,12 @@ static int vmdk_read(BlockDriverState *bs,
> int64_t sector_num,
>                  memset(buf, 0, 512 * n);
>              }
>          } else {
> -            if(bdrv_pread(bs->file, cluster_offset + index_in_cluster
> * 512, buf, n * 512) != n * 512)
> +            if(bdrv_pread(
> +                    extent->file,
> +                    cluster_offset + index_in_cluster * 512,
> +                    buf,
> +                    n * 512
> +                ) != n * 512)
>                  return -1;
>          }
>          nb_sectors -= n;
> @@ -621,11 +869,11 @@ static int vmdk_write(BlockDriverState *bs,
> int64_t sector_num,
>                       const uint8_t *buf, int nb_sectors)
>  {
>      BDRVVmdkState *s = bs->opaque;
> -    VmdkMetaData m_data;
> -    int index_in_cluster, n;
> +    VmdkExtent *extent;
> +    int ext_idx = 0;
> +    int n, index_in_cluster;
>      uint64_t cluster_offset;
> -    static int cid_update = 0;
> -
> +    VmdkMetaData m_data;
>      if (sector_num > bs->total_sectors) {
>          fprintf(stderr,
>                  "(VMDK) Wrong offset: sector_num=0x%" PRIx64
> @@ -635,19 +883,40 @@ static int vmdk_write(BlockDriverState *bs,
> int64_t sector_num,
>      }
> 
>      while (nb_sectors > 0) {
> -        index_in_cluster = sector_num & (s->cluster_sectors - 1);
> -        n = s->cluster_sectors - index_in_cluster;
> +        while (ext_idx < s->extent_size
> +            && sector_num >= s->extents[ext_idx].sectors) {
> +            sector_num -= s->extents[ext_idx].sectors;
> +            ext_idx++;
> +        }
> +        if (ext_idx == s->extent_size) {
> +            return -1;
> +        }
> +        extent = &s->extents[ext_idx];
> +        if (extent->flat)
> +            cluster_offset = 0;
> +        else
> +            cluster_offset = get_cluster_offset(
> +                                bs,
> +                                extent,
> +                                &m_data,
> +                                sector_num << 9, 1);
> +
> +        if (!extent->flat && !cluster_offset)
> +            return -1;
> +        index_in_cluster = sector_num & (extent->cluster_sectors - 1);
> +        n = extent->cluster_sectors - index_in_cluster;
>          if (n > nb_sectors)
>              n = nb_sectors;
> -        cluster_offset = get_cluster_offset(bs, &m_data, sector_num << 9, 1);
> -        if (!cluster_offset)
> -            return -1;
> 
> -        if (bdrv_pwrite(bs->file, cluster_offset + index_in_cluster *
> 512, buf, n * 512) != n * 512)
> +        if (bdrv_pwrite(
> +                extent->file,
> +                cluster_offset + index_in_cluster * 512,
> +                buf, n * 512
> +            ) != n * 512)
>              return -1;
> -        if (m_data.valid) {
> +        if (!extent->flat && m_data.valid) {
>              /* update L2 tables */
> -            if (vmdk_L2update(bs, &m_data) == -1)
> +            if (vmdk_L2update(extent, &m_data) == -1)
>                  return -1;
>          }
>          nb_sectors -= n;
> @@ -655,9 +924,9 @@ static int vmdk_write(BlockDriverState *bs,
> int64_t sector_num,
>          buf += n * 512;
> 
>          // update CID on the first write every time the virtual disk is 
> opened
> -        if (!cid_update) {
> +        if (!s->cid_update) {
>              vmdk_write_cid(bs, time(NULL));
> -            cid_update++;
> +            s->cid_update++;
>          }
>      }
>      return 0;
> @@ -665,32 +934,17 @@ static int vmdk_write(BlockDriverState *bs,
> int64_t sector_num,
> 
>  static int vmdk_create(const char *filename, QEMUOptionParameter *options)
>  {
> -    int fd, i;
> +    int fd;
> +    int64_t i;
>      VMDK4Header header;
>      uint32_t tmp, magic, grains, gd_size, gt_size, gt_count;
> -    static const char desc_template[] =
> -        "# Disk DescriptorFile\n"
> -        "version=1\n"
> -        "CID=%x\n"
> -        "parentCID=ffffffff\n"
> -        "createType=\"monolithicSparse\"\n"
> -        "\n"
> -        "# Extent description\n"
> -        "RW %" PRId64 " SPARSE \"%s\"\n"
> -        "\n"
> -        "# The Disk Data Base \n"
> -        "#DDB\n"
> -        "\n"
> -        "ddb.virtualHWVersion = \"%d\"\n"
> -        "ddb.geometry.cylinders = \"%" PRId64 "\"\n"
> -        "ddb.geometry.heads = \"16\"\n"
> -        "ddb.geometry.sectors = \"63\"\n"
> -        "ddb.adapterType = \"ide\"\n";
> +
>      char desc[1024];
>      const char *real_filename, *temp_str;
>      int64_t total_size = 0;
>      const char *backing_file = NULL;
>      int flags = 0;
> +    int flat = 0;
>      int ret;
> 
>      // Read out options
> @@ -701,16 +955,125 @@ static int vmdk_create(const char *filename,
> QEMUOptionParameter *options)
>              backing_file = options->value.s;
>          } else if (!strcmp(options->name, BLOCK_OPT_COMPAT6)) {
>              flags |= options->value.n ? BLOCK_FLAG_COMPAT6: 0;
> +        } else if (!strcmp(options->name, BLOCK_OPT_FLAT)) {
> +            flat = options->value.n;
>          }
>          options++;
>      }
> 
> +    if (flat) {

Make the whole following block a separate function.

> +        const char desc_template[] =
> +        "# Disk DescriptorFile\n"
> +        "version=1\n"
> +        "CID=%x\n"
> +        "parentCID=ffffffff\n"
> +        "createType=\"monolithicFlat\"\n"
> +        "\n"
> +        "# Extent description\n"
> +        "RW %" PRId64 " FLAT \"%s\" 0\n"
> +        "\n"
> +        "# The Disk Data Base \n"
> +        "#DDB\n"
> +        "\n"
> +        "ddb.virtualHWVersion = \"%d\"\n"
> +        "ddb.geometry.cylinders = \"%" PRId64 "\"\n"
> +        "ddb.geometry.heads = \"16\"\n"
> +        "ddb.geometry.sectors = \"63\"\n"
> +        "ddb.adapterType = \"ide\"\n";
> +        char ext_filename[1024];
> +        strncpy(ext_filename, filename, 1024);
> +        ext_filename[1023] = '\0';
> +        if (!strcmp(&ext_filename[strlen(ext_filename) - 5], ".vmdk"))
> +            strcpy(&ext_filename[strlen(ext_filename) - 5], "-flat.vmdk");
> +        else
> +            strcat(ext_filename, "-flat.vmdk");
> +        /* create extent first */
> +        fd = open(
> +                ext_filename,
> +                O_WRONLY | O_CREAT | O_TRUNC | O_BINARY | O_LARGEFILE,
> +                0644);
> +        if (fd < 0)
> +            return -errno;
> +        ret = ftruncate(fd, total_size * 512);
> +        if (ret) goto exit;
> +#if VMDK_FLAT_BACKING
> +        if (backing_file) {
> +            uint8_t buf[512];
> +            BlockDriverState *bf = bdrv_new("");
> +            if (!bf) {
> +                ret = -1;
> +                goto exit;
> +            }
> +            ret = bdrv_open(bf, backing_file, BDRV_O_RDWR, NULL);
> +            if (ret) {
> +                bdrv_delete(bf);
> +                goto exit;
> +            }
> +            for (i = 0; i < total_size; i++) {
> +                int num;
> +                if (bdrv_is_allocated(bf, i, 1, &num)) {
> +                    ret = bdrv_read(bf, i, buf, 1);
> +                    if (ret)
> +                    {
> +                        bdrv_delete(bf);
> +                        goto exit;
> +                    }
> +                    ret = pwrite(fd, buf, 512, i * 512);
> +                    if (ret != 512)
> +                    {
> +                        bdrv_delete(bf);
> +                        goto exit;
> +                    }
> +                }
> +            }
> +        }
> +#endif
> +        close(fd);
> +
> +        /* generate descriptor file */
> +        snprintf(desc, sizeof(desc), desc_template, (unsigned int)time(NULL),
> +                 total_size, ext_filename,
> +                 (flags & BLOCK_FLAG_COMPAT6 ? 6 : 4),
> +                 total_size / (int64_t)(63 * 16));
> +        fd = open(
> +                filename,
> +                O_WRONLY | O_CREAT | O_TRUNC | O_BINARY | O_LARGEFILE,
> +                0644);
> +        if (fd < 0)
> +            return -errno;
> +        ret = qemu_write_full(fd, desc, strlen(desc));
> +        if (ret != strlen(desc)) {
> +            ret = -errno;
> +            goto exit;
> +        }
> +        ret = 0;
> +    } else {
> +        const char desc_template[] =
> +        "# Disk DescriptorFile\n"
> +        "version=1\n"
> +        "CID=%x\n"
> +        "parentCID=ffffffff\n"
> +        "createType=\"monolithicSparse\"\n"
> +        "\n"
> +        "# Extent description\n"
> +        "RW %" PRId64 " SPARSE \"%s\"\n"
> +        "\n"
> +        "# The Disk Data Base \n"
> +        "#DDB\n"
> +        "\n"
> +        "ddb.virtualHWVersion = \"%d\"\n"
> +        "ddb.geometry.cylinders = \"%" PRId64 "\"\n"
> +        "ddb.geometry.heads = \"16\"\n"
> +        "ddb.geometry.sectors = \"63\"\n"
> +        "ddb.adapterType = \"ide\"\n";
>      /* XXX: add support for backing file */
>      if (backing_file) {
>          return vmdk_snapshot_create(filename, backing_file);
>      }
> 
> -    fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY | 
> O_LARGEFILE,
> +        fd = open(
> +            filename,
> +            O_WRONLY | O_CREAT | O_TRUNC | O_BINARY | O_LARGEFILE,
>                0644);
>      if (fd < 0)
>          return -errno;
> @@ -724,7 +1087,8 @@ static int vmdk_create(const char *filename,
> QEMUOptionParameter *options)
> 
>      grains = (total_size + header.granularity - 1) / header.granularity;
>      gt_size = ((header.num_gtes_per_gte * sizeof(uint32_t)) + 511) >> 9;
> -    gt_count = (grains + header.num_gtes_per_gte - 1) /
> header.num_gtes_per_gte;
> +        gt_count =
> +            (grains + header.num_gtes_per_gte - 1) / header.num_gtes_per_gte;
>      gd_size = (gt_count * sizeof(uint32_t) + 511) >> 9;
> 
>      header.desc_offset = 1;
> @@ -807,8 +1171,8 @@ static int vmdk_create(const char *filename,
> QEMUOptionParameter *options)
>          ret = -errno;
>          goto exit;
>      }
> -
>      ret = 0;
> +    }
>  exit:
>      close(fd);
>      return ret;
> @@ -817,14 +1181,20 @@ exit:
>  static void vmdk_close(BlockDriverState *bs)
>  {
>      BDRVVmdkState *s = bs->opaque;
> -
> -    qemu_free(s->l1_table);
> -    qemu_free(s->l2_cache);
> +    if (s->extent_size)
> +        qemu_free(s->extents);

Can you even have a state without an allocated s->extents?

>  }
> 
>  static int vmdk_flush(BlockDriverState *bs)
>  {
> -    return bdrv_flush(bs->file);
> +    int ret = 0;
> +    int i;
> +    BDRVVmdkState *s = bs->opaque;
> +    ret = bdrv_flush(bs->file);
> +    for (i = 0; i <  s->extent_size; i++)
> +        ret |= bdrv_flush(s->extents[i].file);
> +    return ret;
> +
>  }
> 
> 
> @@ -844,6 +1214,11 @@ static QEMUOptionParameter vmdk_create_options[] = {
>          .type = OPT_FLAG,
>          .help = "VMDK version 6 image"
>      },
> +    {
> +        .name = BLOCK_OPT_FLAT,
> +        .type = OPT_FLAG,
> +        .help = "VMDK flat extent image"
> +    },

What are the other subformats that will be added? Do all or most of them
have something like both flat and sparse variants, so that this option
is universally applicable?

I'm asking to make sure that maybe three or four flags really make more
sense than just a subformat name.

Kevin



reply via email to

[Prev in Thread] Current Thread [Next in Thread]