qemu-devel
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Qemu-devel] [PATCH v2] vmdk: improve streamOptimized vmdk support


From: Milos Vyletel
Subject: [Qemu-devel] [PATCH v2] vmdk: improve streamOptimized vmdk support
Date: Wed, 6 Aug 2014 16:57:58 -0400

VMDK's streamOptimized format is different from the regular sparse format.
L1 (GD) and L2 (GT) tables are not predefined but rather generated and
written during image creation, mainly because there is no way to tell
how much space the data will occupy once it is compressed. Also, the
locations of the header, L1 and L2 tables differ.

- L2 tables (grain tables) are written after all grains they point to
- L1 tables are written after all grains and L2 tables
- footer at the end is used instead of header in first sector

Images generated by qemu-img could not be imported (as part of an OVA archive)
into either VMWare or OVM because of errors.

- VMWare during OVA import:
Not a supported disk format (sparse VMDK too old)

- OVM's vbox-img during conversion:
vbox-img: error: Error while copying the image: VERR_EOF

This patch fixes streamOptimized support in qemu, which was not fully
compatible with the VMDK specification as defined in the latest available
version at https://www.vmware.com/support/developer/vddk/vmdk_50_technote.pdf.

Qemu-generated images are identical to the ones generated by VMWare and
OVM (vbox-img), with the exception of the DescriptorFile, but that is expected
(CID and some additional DDB entries differ). They were also successfully
imported into VMWare vCloud, ESXi and Oracle OVM.

Signed-off-by: Milos Vyletel <address@hidden>
---
v2 changes:
- updated commit message description with errors received
- style/grammar fixes (clean checkpatch pass)
- removed l2_table pointer from VmdkExtent struct
- fixed memory leak in vmdk_write_footer()

v3 changes:
- removed footer from VmdkExtent structure
- split added vmdk_write_grain_directory function to separate GD and footer
  writes
- fix possible problems with opening of images created by older implementation
 block/vmdk.c |  355 +++++++++++++++++++++++++++++++++++++++++++++------------
 1 files changed, 280 insertions(+), 75 deletions(-)

diff --git a/block/vmdk.c b/block/vmdk.c
index 0517bba..3ea1c31 100644
--- a/block/vmdk.c
+++ b/block/vmdk.c
@@ -81,6 +81,21 @@ typedef struct {
     uint16_t compressAlgorithm;
 } QEMU_PACKED VMDK4Header;
 
+typedef struct {
+    uint64_t val;
+    uint32_t size;
+    uint32_t type;
+    uint8_t pad[BDRV_SECTOR_SIZE - sizeof(uint64_t) - 2*sizeof(uint32_t)];
+} QEMU_PACKED VMDK4MetaMarker;
+
+typedef struct {
+    VMDK4MetaMarker footer_marker;
+    uint32_t magic;
+    VMDK4Header header;
+    uint8_t pad[BDRV_SECTOR_SIZE - sizeof(uint32_t) - sizeof(VMDK4Header)];
+    VMDK4MetaMarker eos_marker;
+} QEMU_PACKED VMDK4Footer;
+
 #define L2_CACHE_SIZE 16
 
 typedef struct VmdkExtent {
@@ -89,12 +104,14 @@ typedef struct VmdkExtent {
     bool compressed;
     bool has_marker;
     bool has_zero_grain;
+    bool has_footer;
     int version;
     int64_t sectors;
     int64_t end_sector;
     int64_t flat_start_offset;
     int64_t l1_table_offset;
     int64_t l1_backup_table_offset;
+    uint32_t l1_index;
     uint32_t *l1_table;
     uint32_t *l1_backup_table;
     unsigned int l1_size;
@@ -125,7 +142,6 @@ typedef struct BDRVVmdkState {
 
 typedef struct VmdkMetaData {
     uint32_t offset;
-    unsigned int l1_index;
     unsigned int l2_index;
     unsigned int l2_offset;
     int valid;
@@ -555,14 +571,50 @@ static char *vmdk_read_desc(BlockDriverState *file, 
uint64_t desc_offset,
     return buf;
 }
 
+static int vmdk_read_footer(BlockDriverState *bs,
+                            VMDK4Footer *footer)
+{
+    int ret;
+
+    /*
+    * footer starts 3 sectors from end
+    * - footer marker
+    * - footer
+    * - end-of-stream marker
+    */
+    ret = bdrv_pread(bs->file,
+        (bs->file->total_sectors - 3) * BDRV_SECTOR_SIZE,
+        footer, sizeof(*footer));
+    if (ret < 0) {
+        goto out;
+    }
+
+    /* Some sanity checks for the footer */
+    if (be32_to_cpu(footer->magic) != VMDK4_MAGIC ||
+        le32_to_cpu(footer->footer_marker.size) != 0  ||
+        le32_to_cpu(footer->footer_marker.type) != MARKER_FOOTER ||
+        le64_to_cpu(footer->eos_marker.val) != 0  ||
+        le32_to_cpu(footer->eos_marker.size) != 0  ||
+        le32_to_cpu(footer->eos_marker.type) != MARKER_END_OF_STREAM) {
+        ret = -EINVAL;
+        goto out;
+    }
+
+    ret = VMDK_OK;
+ out:
+    return ret;
+}
+
 static int vmdk_open_vmdk4(BlockDriverState *bs,
                            BlockDriverState *file,
                            int flags, Error **errp)
 {
     int ret;
+    bool has_footer = false;
     uint32_t magic;
     uint32_t l1_size, l1_entry_sectors;
     VMDK4Header header;
+    VMDK4Footer footer;
     VmdkExtent *extent;
     BDRVVmdkState *s = bs->opaque;
     int64_t l1_backup_offset = 0;
@@ -593,48 +645,13 @@ static int vmdk_open_vmdk4(BlockDriverState *bs,
 
     if (le64_to_cpu(header.gd_offset) == VMDK4_GD_AT_END) {
         /*
-         * The footer takes precedence over the header, so read it in. The
-         * footer starts at offset -1024 from the end: One sector for the
-         * footer, and another one for the end-of-stream marker.
+         * The footer takes precedence over the header, so read it in.
          */
-        struct {
-            struct {
-                uint64_t val;
-                uint32_t size;
-                uint32_t type;
-                uint8_t pad[512 - 16];
-            } QEMU_PACKED footer_marker;
-
-            uint32_t magic;
-            VMDK4Header header;
-            uint8_t pad[512 - 4 - sizeof(VMDK4Header)];
-
-            struct {
-                uint64_t val;
-                uint32_t size;
-                uint32_t type;
-                uint8_t pad[512 - 16];
-            } QEMU_PACKED eos_marker;
-        } QEMU_PACKED footer;
-
-        ret = bdrv_pread(file,
-            bs->file->total_sectors * 512 - 1536,
-            &footer, sizeof(footer));
+        ret = vmdk_read_footer(bs, &footer);
         if (ret < 0) {
             return ret;
         }
-
-        /* Some sanity checks for the footer */
-        if (be32_to_cpu(footer.magic) != VMDK4_MAGIC ||
-            le32_to_cpu(footer.footer_marker.size) != 0  ||
-            le32_to_cpu(footer.footer_marker.type) != MARKER_FOOTER ||
-            le64_to_cpu(footer.eos_marker.val) != 0  ||
-            le32_to_cpu(footer.eos_marker.size) != 0  ||
-            le32_to_cpu(footer.eos_marker.type) != MARKER_END_OF_STREAM)
-        {
-            return -EINVAL;
-        }
-
+        has_footer = true;
         header = footer.header;
     }
 
@@ -645,11 +662,15 @@ static int vmdk_open_vmdk4(BlockDriverState *bs,
         error_set(errp, QERR_UNKNOWN_BLOCK_FORMAT_FEATURE,
                   bs->device_name, "vmdk", buf);
         return -ENOTSUP;
-    } else if (le32_to_cpu(header.version) == 3 && (flags & BDRV_O_RDWR)) {
+    } else if (le32_to_cpu(header.version) == 3 &&
+               (flags & BDRV_O_RDWR) &&
+               !(le64_to_cpu(header.flags) & VMDK4_FLAG_COMPRESS)) {
         /* VMware KB 2064959 explains that version 3 added support for
          * persistent changed block tracking (CBT), and backup software can
          * read it as version=1 if it doesn't care about the changed area
-         * information. So we are safe to enable read only. */
+         * information. So we are safe to enable read only.
+         * Note that this does not apply to streamOptimized images which
+         * are written only once and are used as transport format */
         error_setg(errp, "VMDK version 3 must be read only");
         return -EINVAL;
     }
@@ -689,11 +710,20 @@ static int vmdk_open_vmdk4(BlockDriverState *bs,
     if (ret < 0) {
         return ret;
     }
+    if (has_footer) {
+        extent->has_footer = has_footer;
+    }
+
     extent->compressed =
         le16_to_cpu(header.compressAlgorithm) == VMDK4_COMPRESSION_DEFLATE;
     if (extent->compressed) {
         g_free(s->create_type);
         s->create_type = g_strdup("streamOptimized");
+
+        if (extent->has_footer && (flags & BDRV_O_RDWR)) {
+            bdrv_truncate(file,
+                    le64_to_cpu(header.grain_offset) * BDRV_SECTOR_SIZE);
+        }
     }
     extent->has_marker = le32_to_cpu(header.flags) & VMDK4_FLAG_MARKER;
     extent->version = le32_to_cpu(header.version);
@@ -998,6 +1028,12 @@ static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData 
*m_data)
     uint32_t offset;
     QEMU_BUILD_BUG_ON(sizeof(offset) != sizeof(m_data->offset));
     offset = cpu_to_le32(m_data->offset);
+
+    /* nothing to update on streamOptimized */
+    if (extent->compressed) {
+        return VMDK_OK;
+    }
+
     /* update L2 table */
     if (bdrv_pwrite_sync(
                 extent->file,
@@ -1008,7 +1044,7 @@ static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData 
*m_data)
     }
     /* update backup L2 table */
     if (extent->l1_backup_table_offset != 0) {
-        m_data->l2_offset = extent->l1_backup_table[m_data->l1_index];
+        m_data->l2_offset = extent->l1_backup_table[extent->l1_index];
         if (bdrv_pwrite_sync(
                     extent->file,
                     ((int64_t)m_data->l2_offset * 512)
@@ -1024,6 +1060,108 @@ static int vmdk_L2update(VmdkExtent *extent, 
VmdkMetaData *m_data)
     return VMDK_OK;
 }
 
+static int vmdk_write_footer(BlockDriverState *bs, VMDK4Footer *footer)
+{
+    int ret;
+    uint64_t offset;
+    uint32_t grains, gt_count, gd_sectors;
+
+    if (!footer) {
+        return -EINVAL;
+    }
+
+    grains = DIV_ROUND_UP(le64_to_cpu(footer->header.capacity),
+                          le64_to_cpu(footer->header.granularity));
+    gt_count = DIV_ROUND_UP(grains,
+                            le32_to_cpu(footer->header.num_gtes_per_gt));
+    gd_sectors = DIV_ROUND_UP(gt_count * sizeof(uint32_t), BDRV_SECTOR_SIZE);
+
+    offset = le64_to_cpu(footer->header.gd_offset) + gd_sectors;
+    footer->footer_marker.val = cpu_to_le64(1);
+    footer->footer_marker.type = cpu_to_le32(MARKER_FOOTER);
+    footer->magic = cpu_to_be32(VMDK4_MAGIC);
+    footer->eos_marker.type = cpu_to_le32(MARKER_END_OF_STREAM);
+
+    ret = bdrv_pwrite(bs, offset * BDRV_SECTOR_SIZE, footer, sizeof(*footer));
+    if (ret < 0) {
+        return ret;
+    }
+
+    return VMDK_OK;
+}
+
+static int vmdk_write_grain_directory(VmdkExtent *extent)
+{
+    int i, ret, gd_buf_size;
+    uint32_t *gd_buf = NULL, gd_sectors;
+    VMDK4MetaMarker gd_marker;
+
+    /* write grain directory marker */
+    memset(&gd_marker, 0, sizeof(gd_marker));
+    gd_sectors = DIV_ROUND_UP(extent->l1_size * sizeof(uint32_t),
+                              BDRV_SECTOR_SIZE);
+    gd_marker.val = cpu_to_le64(gd_sectors);
+    gd_marker.type = cpu_to_le32(MARKER_GRAIN_DIRECTORY);
+    ret = bdrv_pwrite(extent->file, extent->l1_table_offset,
+                      &gd_marker, sizeof(gd_marker));
+    if (ret < 0) {
+        return ret;
+    }
+    extent->l1_table_offset += sizeof(gd_marker);
+
+    /* write grain directory */
+    gd_buf_size = extent->l1_size * sizeof(uint32_t);
+    gd_buf = g_malloc0(gd_buf_size);
+    for (i = 0; i < extent->l1_size; i++) {
+        gd_buf[i] = cpu_to_le32(extent->l1_table[i]);
+    }
+    ret = bdrv_pwrite(extent->file, extent->l1_table_offset,
+                      gd_buf, gd_buf_size);
+    if (ret < 0) {
+        goto exit;
+    }
+
+    ret = VMDK_OK;
+ exit:
+    g_free(gd_buf);
+    return ret;
+}
+
+static int vmdk_write_grain_table(VmdkExtent *extent)
+{
+    int i;
+    uint32_t *l2_table = NULL;
+    VMDK4MetaMarker gtm;
+
+    for (i = 0; i < L2_CACHE_SIZE; i++) {
+        if (extent->l1_table[extent->l1_index] == extent->l2_cache_offsets[i]) 
{
+            l2_table = extent->l2_cache + (i * extent->l2_size);
+        }
+    }
+    if (!l2_table) {
+        return -EINVAL;
+    }
+
+    memset(&gtm, 0, sizeof(gtm));
+    gtm.val = cpu_to_le32((extent->l2_size * sizeof(uint32_t)) >> 9);
+    gtm.type = cpu_to_le32(MARKER_GRAIN_TABLE);
+    if (bdrv_pwrite(extent->file, extent->l1_table_offset,
+                    &gtm, sizeof(gtm)) != sizeof(gtm)) {
+        return -EIO;
+    }
+    extent->l1_table_offset += sizeof(gtm);
+
+    extent->l1_table[extent->l1_index] = extent->l1_table_offset >> 9;
+    if (bdrv_pwrite(extent->file, extent->l1_table_offset,
+                    l2_table, extent->l2_size * sizeof(uint32_t)
+                   ) != extent->l2_size * sizeof(uint32_t)) {
+        return -EIO;
+    }
+    extent->l1_table_offset += extent->l2_size * sizeof(uint32_t);
+
+    return VMDK_OK;
+}
+
 static int get_cluster_offset(BlockDriverState *bs,
                                     VmdkExtent *extent,
                                     VmdkMetaData *m_data,
@@ -1032,7 +1170,7 @@ static int get_cluster_offset(BlockDriverState *bs,
                                     uint64_t *cluster_offset)
 {
     unsigned int l1_index, l2_offset, l2_index;
-    int min_index, i, j;
+    int min_index, i, j, ret;
     uint32_t min_count, *l2_table;
     bool zeroed = false;
 
@@ -1046,6 +1184,22 @@ static int get_cluster_offset(BlockDriverState *bs,
 
     offset -= (extent->end_sector - extent->sectors) * SECTOR_SIZE;
     l1_index = (offset >> 9) / extent->l1_entry_sectors;
+    if (extent->compressed && !extent->l1_table[l1_index]) {
+        if (l1_index) {
+            /* grain (L2) tables follow compressed data so first L2 table will
+             * be written when we move to next index or when we close image.
+             * that is why we need to save l1_index in extent itself for easy
+             * access from both here and vmdk_close */
+            ret = vmdk_write_grain_table(extent);
+            if (ret < 0) {
+                return ret;
+            }
+        }
+        /* allocate new L2; set it to GD offset for now */
+        extent->l1_table[l1_index] = extent->l1_table_offset;
+    }
+    extent->l1_index = l1_index;
+
     if (l1_index >= extent->l1_size) {
         return VMDK_ERROR;
     }
@@ -1092,7 +1246,6 @@ static int get_cluster_offset(BlockDriverState *bs,
 
     if (m_data) {
         m_data->valid = 1;
-        m_data->l1_index = l1_index;
         m_data->l2_index = l2_index;
         m_data->offset = *cluster_offset;
         m_data->l2_offset = l2_offset;
@@ -1234,6 +1387,10 @@ static int vmdk_write_extent(VmdkExtent *extent, int64_t 
cluster_offset,
         ret = ret < 0 ? ret : -EIO;
         goto out;
     }
+    if (extent->compressed) {
+        /* update GD offset after each write */
+        extent->l1_table_offset = bdrv_getlength(extent->file);
+    }
     ret = 0;
  out:
     g_free(data);
@@ -1532,10 +1689,12 @@ static int vmdk_create_extent(const char *filename, 
int64_t filesize,
     int ret, i;
     BlockDriverState *bs = NULL;
     VMDK4Header header;
+    VMDK4Footer footer;
     Error *local_err = NULL;
     uint32_t tmp, magic, grains, gd_sectors, gt_size, gt_count;
     uint32_t *gd_buf = NULL;
     int gd_buf_size;
+    uint64_t grain_offset, rgd_offset, gd_offset;
 
     ret = bdrv_create_file(filename, opts, &local_err);
     if (ret < 0) {
@@ -1560,28 +1719,38 @@ static int vmdk_create_extent(const char *filename, 
int64_t filesize,
     }
     magic = cpu_to_be32(VMDK4_MAGIC);
     memset(&header, 0, sizeof(header));
-    header.version = zeroed_grain ? 2 : 1;
-    header.flags = VMDK4_FLAG_RGD | VMDK4_FLAG_NL_DETECT
-                   | (compress ? VMDK4_FLAG_COMPRESS | VMDK4_FLAG_MARKER : 0)
+    memset(&footer, 0, sizeof(footer));
+
+    header.version = (compress ? 3 : zeroed_grain ? 2 : 1);
+    header.flags = VMDK4_FLAG_NL_DETECT
+                   | (compress ? VMDK4_FLAG_COMPRESS | VMDK4_FLAG_MARKER
+                               : VMDK4_FLAG_RGD)
                    | (zeroed_grain ? VMDK4_FLAG_ZERO_GRAIN : 0);
     header.compressAlgorithm = compress ? VMDK4_COMPRESSION_DEFLATE : 0;
     header.capacity = filesize / BDRV_SECTOR_SIZE;
     header.granularity = 128;
     header.num_gtes_per_gt = BDRV_SECTOR_SIZE;
 
-    grains = DIV_ROUND_UP(filesize / BDRV_SECTOR_SIZE, header.granularity);
+    grains = DIV_ROUND_UP(header.capacity, header.granularity);
     gt_size = DIV_ROUND_UP(header.num_gtes_per_gt * sizeof(uint32_t),
                            BDRV_SECTOR_SIZE);
     gt_count = DIV_ROUND_UP(grains, header.num_gtes_per_gt);
     gd_sectors = DIV_ROUND_UP(gt_count * sizeof(uint32_t), BDRV_SECTOR_SIZE);
 
     header.desc_offset = 1;
-    header.desc_size = 20;
-    header.rgd_offset = header.desc_offset + header.desc_size;
-    header.gd_offset = header.rgd_offset + gd_sectors + (gt_size * gt_count);
-    header.grain_offset =
+    header.desc_size = (compress ? 2 : 20);
+    rgd_offset = header.desc_offset + header.desc_size;
+    header.rgd_offset = ((header.flags & VMDK4_FLAG_RGD) ? rgd_offset : 0);
+    gd_offset = rgd_offset + gd_sectors + (gt_size * gt_count);
+    header.gd_offset = (compress ? VMDK4_GD_AT_END : gd_offset);
+    grain_offset =
         ROUND_UP(header.gd_offset + gd_sectors + (gt_size * gt_count),
                  header.granularity);
+    /* streamOptimized reserves first 128 sectors */
+    header.grain_offset = (compress ? header.granularity : grain_offset);
+    /* streamOptimized's grain directory is at the end */
+    gd_offset = header.grain_offset + 1;
+
     /* swap endianness for all header fields */
     header.version = cpu_to_le32(header.version);
     header.flags = cpu_to_le32(header.flags);
@@ -1618,30 +1787,44 @@ static int vmdk_create_extent(const char *filename, 
int64_t filesize,
         goto exit;
     }
 
-    /* write grain directory */
-    gd_buf_size = gd_sectors * BDRV_SECTOR_SIZE;
-    gd_buf = g_malloc0(gd_buf_size);
-    for (i = 0, tmp = le64_to_cpu(header.rgd_offset) + gd_sectors;
-         i < gt_count; i++, tmp += gt_size) {
-        gd_buf[i] = cpu_to_le32(tmp);
-    }
-    ret = bdrv_pwrite(bs, le64_to_cpu(header.rgd_offset) * BDRV_SECTOR_SIZE,
-                      gd_buf, gd_buf_size);
-    if (ret < 0) {
-        error_set(errp, QERR_IO_ERROR);
-        goto exit;
-    }
+    if (compress) {
+        footer.header = header;
+        footer.header.gd_offset = cpu_to_le64(gd_offset);
 
-    /* write backup grain directory */
-    for (i = 0, tmp = le64_to_cpu(header.gd_offset) + gd_sectors;
-         i < gt_count; i++, tmp += gt_size) {
-        gd_buf[i] = cpu_to_le32(tmp);
-    }
-    ret = bdrv_pwrite(bs, le64_to_cpu(header.gd_offset) * BDRV_SECTOR_SIZE,
-                      gd_buf, gd_buf_size);
-    if (ret < 0) {
-        error_set(errp, QERR_IO_ERROR);
-        goto exit;
+        ret = vmdk_write_footer(bs, &footer);
+        if (ret < 0) {
+            error_set(errp, QERR_IO_ERROR);
+            goto exit;
+        }
+    } else {
+        /* write redundant grain directory (if applicable) */
+        if (le64_to_cpu(header.rgd_offset)) {
+            gd_buf_size = gd_sectors * BDRV_SECTOR_SIZE;
+            gd_buf = g_malloc0(gd_buf_size);
+            for (i = 0, tmp = le64_to_cpu(header.rgd_offset) + gd_sectors;
+                 i < gt_count; i++, tmp += gt_size) {
+                gd_buf[i] = cpu_to_le32(tmp);
+            }
+            ret = bdrv_pwrite(bs, le64_to_cpu(header.rgd_offset) *
+                                  BDRV_SECTOR_SIZE,
+                              gd_buf, gd_buf_size);
+            if (ret < 0) {
+                error_set(errp, QERR_IO_ERROR);
+                goto exit;
+            }
+        }
+
+        /* write grain directory */
+        for (i = 0, tmp = le64_to_cpu(header.gd_offset) + gd_sectors;
+             i < gt_count; i++, tmp += gt_size) {
+            gd_buf[i] = cpu_to_le32(tmp);
+        }
+        ret = bdrv_pwrite(bs, le64_to_cpu(header.gd_offset) * BDRV_SECTOR_SIZE,
+                          gd_buf, gd_buf_size);
+        if (ret < 0) {
+            error_set(errp, QERR_IO_ERROR);
+            goto exit;
+        }
     }
 
     ret = 0;
@@ -1911,7 +2094,29 @@ exit:
 
 static void vmdk_close(BlockDriverState *bs)
 {
+    int ret;
     BDRVVmdkState *s = bs->opaque;
+    VmdkExtent *extent = &s->extents[0];
+    VMDK4Footer footer;
+
+    if (extent->compressed) {
+        while (extent < &s->extents[s->num_extents]) {
+            vmdk_write_grain_table(extent);
+            vmdk_write_grain_directory(extent);
+            if (extent->has_footer) {
+                memset(&footer, 0, sizeof(footer));
+                ret = bdrv_pread(extent->file, sizeof(uint32_t),
+                                 &footer.header, sizeof(footer.header));
+                if (ret < 0) {
+                    continue;
+                }
+                footer.header.gd_offset =
+                    cpu_to_le64(extent->l1_table_offset >> 9);
+                vmdk_write_footer(extent->file, &footer);
+            }
+            extent++;
+        }
+    }
 
     vmdk_free_extents(bs);
     g_free(s->create_type);
-- 
1.7.1




reply via email to

[Prev in Thread] Current Thread [Next in Thread]