Initial image duplicate removal. Same worksheet only.

This commit is contained in:
John McNamara 2019-12-24 19:16:13 +00:00
parent b0738e2ba9
commit a4f9e5bad0
13 changed files with 178 additions and 2 deletions

1
.indent.pro vendored
View File

@ -104,6 +104,7 @@
-T lxw_hash_table
-T lxw_header_footer_options
-T lxw_heading_pair
-T lxw_image_md5
-T lxw_image_options
-T lxw_merged_range
-T lxw_object_properties

View File

@ -11,6 +11,7 @@ env:
- NO_VALGRIND=1 CFLAGS='-Werror -m32'
- NO_VALGRIND=1 USE_SYSTEM_MINIZIP=1 CFLAGS='-Werror'
- NO_VALGRIND=1 USE_DOUBLE_FUNCTION=1 CFLAGS='-Werror'
- NO_VALGRIND=1 USE_NO_MD5=1 CFLAGS='-Werror'
install:
- sudo pip install pytest

View File

@ -175,6 +175,9 @@ enum lxw_custom_property_types {
LXW_CUSTOM_DATETIME
};
/* Size of MD5 byte arrays. */
#define LXW_MD5_SIZE 16
/* Excel sheetname max of 31 chars. */
#define LXW_SHEETNAME_MAX 31

View File

@ -57,6 +57,7 @@
/* Define the tree.h RB structs for the red-black head types. */
RB_HEAD(lxw_worksheet_names, lxw_worksheet_name);
RB_HEAD(lxw_chartsheet_names, lxw_chartsheet_name);
RB_HEAD(lxw_image_md5s, lxw_image_md5);
/* Define the queue.h structs for the workbook lists. */
STAILQ_HEAD(lxw_sheets, lxw_sheet);
@ -93,6 +94,14 @@ typedef struct lxw_chartsheet_name {
RB_ENTRY (lxw_chartsheet_name) tree_pointers;
} lxw_chartsheet_name;
/*
 * Struct to represent an image MD5/ID pair.
 *
 * Instances are nodes of the lxw_image_md5s red-black tree declared with
 * RB_HEAD() in this header.
 */
typedef struct lxw_image_md5 {
    /* Numeric id paired with the digest below. */
    uint32_t id;
    /* Raw MD5 digest, LXW_MD5_SIZE bytes long. */
    unsigned char md5[LXW_MD5_SIZE];
    /* Linkage fields used by the tree.h red-black tree macros. */
    RB_ENTRY (lxw_image_md5) tree_pointers;
} lxw_image_md5;
/* Wrapper around RB_GENERATE_STATIC from tree.h to avoid unused function
* warnings and to avoid portability issues with the _unused attribute. */
#define LXW_RB_GENERATE_WORKSHEET_NAMES(name, type, field, cmp) \
@ -117,6 +126,17 @@ typedef struct lxw_chartsheet_name {
/* Add unused struct to allow adding a semicolon */ \
struct lxw_rb_generate_charsheet_names{int unused;}
/* Generate the static red-black tree functions for the image MD5 tree by
 * invoking the individual tree.h RB_GENERATE_* macros (insert, remove, find,
 * next and min/max) instead of a single combined generator.
 *
 * name:  tree head type, type: node type, field: RB_ENTRY member of the
 * node, cmp: node comparison function. */
#define LXW_RB_GENERATE_IMAGE_MD5S(name, type, field, cmp) \
    RB_GENERATE_INSERT_COLOR(name, type, field, static) \
    RB_GENERATE_REMOVE_COLOR(name, type, field, static) \
    RB_GENERATE_INSERT(name, type, field, cmp, static) \
    RB_GENERATE_REMOVE(name, type, field, static) \
    RB_GENERATE_FIND(name, type, field, cmp, static) \
    RB_GENERATE_NEXT(name, type, field, static) \
    RB_GENERATE_MINMAX(name, type, field, static) \
    /* Add unused struct to allow adding a semicolon */ \
    struct lxw_rb_generate_image_md5s{int unused;}
/**
* @brief Macro to loop over all the worksheets in a workbook.
*
@ -258,6 +278,7 @@ typedef struct lxw_workbook {
struct lxw_chartsheets *chartsheets;
struct lxw_worksheet_names *worksheet_names;
struct lxw_chartsheet_names *chartsheet_names;
struct lxw_image_md5s *image_md5s;
struct lxw_charts *charts;
struct lxw_charts *ordered_charts;
struct lxw_formats *formats;

View File

@ -62,6 +62,7 @@
#define LXW_HEADER_FOOTER_MAX 255
#define LXW_MAX_NUMBER_URLS 65530
#define LXW_PANE_NAME_LENGTH 12 /* bottomRight + 1 */
/* Size, in bytes, of the stack buffer used to read an image stream in
 * chunks. Kept small enough for the stack but large enough that hashing an
 * image does not need one read call per handful of bytes. */
#define LXW_IMAGE_BUFFER_SIZE 1024
/* The Excel 2007 specification says that the maximum number of page
* breaks is 1026. However, in practice it is actually 1023. */
@ -639,6 +640,8 @@ typedef struct lxw_object_properties {
double x_dpi;
double y_dpi;
lxw_chart *chart;
uint8_t is_duplicate;
unsigned char md5[LXW_MD5_SIZE];
STAILQ_ENTRY (lxw_object_properties) list_pointers;
} lxw_object_properties;

View File

@ -265,6 +265,9 @@ _write_image_files(lxw_packager *self)
STAILQ_FOREACH(object_props, worksheet->image_props, list_pointers) {
if (object_props->is_duplicate)
continue;
lxw_snprintf(filename, LXW_FILENAME_LENGTH,
"xl/media/image%d.%s", index++,
object_props->extension);

View File

@ -17,11 +17,15 @@ STATIC int _worksheet_name_cmp(lxw_worksheet_name *name1,
lxw_worksheet_name *name2);
STATIC int _chartsheet_name_cmp(lxw_chartsheet_name *name1,
lxw_chartsheet_name *name2);
STATIC int _image_md5_cmp(lxw_image_md5 *tuple1, lxw_image_md5 *tuple2);
#ifndef __clang_analyzer__
LXW_RB_GENERATE_WORKSHEET_NAMES(lxw_worksheet_names, lxw_worksheet_name,
tree_pointers, _worksheet_name_cmp);
LXW_RB_GENERATE_CHARTSHEET_NAMES(lxw_chartsheet_names, lxw_chartsheet_name,
tree_pointers, _chartsheet_name_cmp);
LXW_RB_GENERATE_IMAGE_MD5S(lxw_image_md5s, lxw_image_md5,
tree_pointers, _image_md5_cmp);
#endif
/*
@ -49,6 +53,12 @@ _chartsheet_name_cmp(lxw_chartsheet_name *name1, lxw_chartsheet_name *name2)
return lxw_strcasecmp(name1->name, name2->name);
}
/*
 * Comparison function for the image MD5 red-black tree. Orders two nodes by
 * a bytewise comparison of their LXW_MD5_SIZE byte digests; the id field is
 * not part of the key.
 */
STATIC int
_image_md5_cmp(lxw_image_md5 *tuple1, lxw_image_md5 *tuple2)
{
    const unsigned char *lhs_digest = tuple1->md5;
    const unsigned char *rhs_digest = tuple2->md5;
    int order = memcmp(lhs_digest, rhs_digest, LXW_MD5_SIZE);

    return order;
}
/*
* Free workbook properties.
*/
@ -97,6 +107,8 @@ lxw_workbook_free(lxw_workbook *workbook)
struct lxw_worksheet_name *next_worksheet_name;
struct lxw_chartsheet_name *chartsheet_name;
struct lxw_chartsheet_name *next_chartsheet_name;
struct lxw_image_md5 *image_md5;
struct lxw_image_md5 *next_image_md5;
lxw_chart *chart;
lxw_format *format;
lxw_defined_name *defined_name;
@ -204,6 +216,19 @@ lxw_workbook_free(lxw_workbook *workbook)
free(workbook->chartsheet_names);
}
/* Free the image MD5 tree: release every node, then the tree head. */
if (workbook->image_md5s) {
    for (image_md5 = RB_MIN(lxw_image_md5s, workbook->image_md5s);
         image_md5; image_md5 = next_image_md5) {

        /* Fetch the successor before the current node is unlinked and
         * freed; it cannot be reached through the node afterwards. */
        next_image_md5 =
            RB_NEXT(lxw_image_md5s, workbook->image_md5s, image_md5);
        RB_REMOVE(lxw_image_md5s, workbook->image_md5s, image_md5);
        free(image_md5);
    }

    free(workbook->image_md5s);
}
lxw_hash_free(workbook->used_xf_formats);
lxw_sst_free(workbook->sst);
free(workbook->options.tmpdir);
@ -895,8 +920,12 @@ _prepare_drawings(lxw_workbook *self)
lxw_object_properties *object_props;
uint32_t chart_ref_id = 0;
uint32_t image_ref_id = 0;
uint32_t ref_id = 0;
uint32_t drawing_id = 0;
uint8_t is_chartsheet;
lxw_image_md5 tmp_image_md5;
lxw_image_md5 *new_image_md5 = NULL;
lxw_image_md5 *found;
STAILQ_FOREACH(sheet, self->sheets, list_pointers) {
if (sheet->is_chartsheet) {
@ -925,9 +954,32 @@ _prepare_drawings(lxw_workbook *self)
if (object_props->image_type == LXW_IMAGE_BMP)
self->has_bmp = LXW_TRUE;
image_ref_id++;
memcpy(tmp_image_md5.md5, object_props->md5, LXW_MD5_SIZE);
lxw_worksheet_prepare_image(worksheet, image_ref_id, drawing_id,
found = RB_FIND(lxw_image_md5s, self->image_md5s, &tmp_image_md5);
if (found) {
ref_id = found->id;
object_props->is_duplicate = LXW_TRUE;
}
else {
image_ref_id++;
ref_id = image_ref_id;
#ifndef USE_NO_MD5
new_image_md5 = calloc(1, sizeof(lxw_image_md5));
#endif
if (new_image_md5) {
new_image_md5->id = ref_id;
memcpy(new_image_md5->md5, object_props->md5,
LXW_MD5_SIZE);
RB_INSERT(lxw_image_md5s, self->image_md5s,
new_image_md5);
}
}
lxw_worksheet_prepare_image(worksheet, ref_id, drawing_id,
object_props);
}
@ -1438,6 +1490,11 @@ workbook_new_opt(const char *filename, lxw_workbook_options *options)
GOTO_LABEL_ON_MEM_ERROR(workbook->chartsheet_names, mem_error);
RB_INIT(workbook->chartsheet_names);
/* Add the image MD5 tree. */
workbook->image_md5s = calloc(1, sizeof(struct lxw_image_md5s));
GOTO_LABEL_ON_MEM_ERROR(workbook->image_md5s, mem_error);
RB_INIT(workbook->image_md5s);
/* Add the charts list. */
workbook->charts = calloc(1, sizeof(struct lxw_charts));
GOTO_LABEL_ON_MEM_ERROR(workbook->charts, mem_error);

View File

@ -12,6 +12,7 @@
#include "xlsxwriter/format.h"
#include "xlsxwriter/utility.h"
#include "xlsxwriter/relationships.h"
#include "xlsxwriter/third_party/md5.h"
#define LXW_STR_MAX 32767
#define LXW_BUFFER_SIZE 4096
@ -2541,6 +2542,11 @@ STATIC lxw_error
_get_image_properties(lxw_object_properties *image_props)
{
unsigned char signature[4];
#ifndef USE_NO_MD5
MD5_CTX context;
size_t size_read;
char buffer[LXW_IMAGE_BUFFER_SIZE];
#endif
/* Read 4 bytes to look for the file header/signature. */
if (fread(signature, 1, 4, image_props->stream) < 4) {
@ -2569,6 +2575,20 @@ _get_image_properties(lxw_object_properties *image_props)
return LXW_ERROR_IMAGE_DIMENSIONS;
}
#ifndef USE_NO_MD5
rewind(image_props->stream);
MD5_Init(&context);
size_read = fread(buffer, 1, LXW_IMAGE_BUFFER_SIZE, image_props->stream);
while (size_read) {
MD5_Update(&context, buffer, size_read);
size_read =
fread(buffer, 1, LXW_IMAGE_BUFFER_SIZE, image_props->stream);
}
MD5_Final(image_props->md5, &context);
#endif
return LXW_NO_ERROR;
}

View File

@ -0,0 +1,22 @@
/*****************************************************************************
* Test cases for libxlsxwriter.
*
* Test to compare output against Excel files.
*
* Copyright 2014-2019, John McNamara, jmcnamara@cpan.org
*
*/
#include "xlsxwriter.h"
/*
 * Insert the same PNG file at the same cell of two different worksheets and
 * return the result of closing the workbook as the exit status.
 */
int main() {

    lxw_workbook *wb;
    lxw_worksheet *first_sheet;
    lxw_worksheet *second_sheet;
    int status;

    wb = workbook_new("test_image48.xlsx");

    /* Both worksheets are created before any image is inserted. */
    first_sheet = workbook_add_worksheet(wb, NULL);
    second_sheet = workbook_add_worksheet(wb, NULL);

    worksheet_insert_image(first_sheet, CELL("E9"), "images/red.png");
    worksheet_insert_image(second_sheet, CELL("E9"), "images/red.png");

    status = workbook_close(wb);

    return status;
}

View File

@ -0,0 +1,35 @@
/*****************************************************************************
* Test cases for libxlsxwriter.
*
* Test to compare output against Excel files.
*
* Copyright 2014-2019, John McNamara, jmcnamara@cpan.org
*
*/
#include "xlsxwriter.h"
/*
 * Insert the same four images, at the same four cells, into each of three
 * worksheets and return the result of closing the workbook as the exit
 * status.
 */
int main() {

    enum { SHEET_COUNT = 3, IMAGE_COUNT = 4 };

    /* Image files and the cells they are anchored to, in insertion order. */
    const char *image_files[IMAGE_COUNT] = {
        "images/blue.png",
        "images/red.jpg",
        "images/yellow.jpg",
        "images/grey.png"
    };
    const char *anchor_cells[IMAGE_COUNT] = { "A1", "B3", "D5", "F9" };

    lxw_worksheet *sheets[SHEET_COUNT];
    lxw_workbook *wb;
    int sheet_idx;
    int image_idx;

    wb = workbook_new("test_image49.xlsx");

    /* All worksheets are created before any image is inserted. */
    for (sheet_idx = 0; sheet_idx < SHEET_COUNT; sheet_idx++) {
        sheets[sheet_idx] = workbook_add_worksheet(wb, NULL);
    }

    /* Worksheet by worksheet, insert the images in table order. */
    for (sheet_idx = 0; sheet_idx < SHEET_COUNT; sheet_idx++) {
        for (image_idx = 0; image_idx < IMAGE_COUNT; image_idx++) {
            worksheet_insert_image(sheets[sheet_idx],
                                   CELL(anchor_cells[image_idx]),
                                   image_files[image_idx]);
        }
    }

    return workbook_close(wb);
}

View File

@ -5,6 +5,8 @@
# Copyright 2014-2019, John McNamara, jmcnamara@cpan.org
#
import os
import pytest
import base_test_class
class TestCompareXLSXFiles(base_test_class.XLSXBaseTest):
@ -125,6 +127,14 @@ class TestCompareXLSXFiles(base_test_class.XLSXBaseTest):
def test_image47(self):
    # Hands the 'test_image47' case name to the run_exe_test helper.
    self.run_exe_test('test_image47')
# Skipped when USE_NO_MD5 is set to a non-empty value in the environment.
# The value is converted with bool() because pytest treats a string condition
# as a Python expression to evaluate, so passing the raw environment string
# would make the outcome depend on how that string happens to evaluate.
@pytest.mark.skipif(bool(os.environ.get('USE_NO_MD5')), reason="compiled without MD5 support")
def test_image48(self):
    # Hands the 'test_image48' case name to the run_exe_test helper.
    self.run_exe_test('test_image48')
# Skipped when USE_NO_MD5 is set to a non-empty value in the environment.
# The value is converted with bool() because pytest treats a string condition
# as a Python expression to evaluate, so passing the raw environment string
# would make the outcome depend on how that string happens to evaluate.
@pytest.mark.skipif(bool(os.environ.get('USE_NO_MD5')), reason="compiled without MD5 support")
def test_image49(self):
    # Hands the 'test_image49' case name to the run_exe_test helper.
    self.run_exe_test('test_image49')
# Test in-memory image handling.
def test_image81(self):
    # NOTE(review): the second argument presumably names the expected
    # workbook to compare against - confirm in base_test_class.
    self.run_exe_test('test_image81', 'image01.xlsx')

Binary file not shown.

Binary file not shown.