Vanilla.PDF  2.0.0
Cross-platform toolkit for creating and modifying PDF documents
extract.c

File image extraction full example.

#include "tools.h"
/* Prints the command-line usage of the extract tool to standard output. */
void print_extract_help() {
    static const char usage_text[] = "Usage: extract -s [source file]";

    printf("%s", usage_text);
}
/**
 * Writes the body of an image XObject stream to a file.
 *
 * The stream is processed only when its dictionary contains both a Type and a
 * Subtype entry, both are name objects, and they equal NameConstant_XObject
 * and NameConstant_Image respectively. Any other stream is skipped and the
 * function reports success.
 *
 * The output file is named "<object_number>.<generation_number>.jpeg".
 * When the dictionary carries Width (integer), Height (integer) and ColorSpace
 * (name), the body from StreamObject_GetBody is passed through
 * DCTDecodeFilter_EncodeParams together with those three entries and the
 * result is written. Otherwise the buffer from StreamObject_GetBodyRaw is
 * written unchanged.
 *
 * @param stream            stream object to inspect
 * @param object_number     object number used in the output file name
 * @param generation_number generation number used in the output file name
 * @return VANILLAPDF_TOOLS_ERROR_SUCCESS when the stream was skipped or written,
 *         VANILLAPDF_TOOLS_ERROR_FAILURE when the file name could not be built,
 *         or whatever RETURN_ERROR_IF_NOT_SUCCESS returns for a failed API call.
 *
 * NOTE(review): RETURN_ERROR_IF_NOT_SUCCESS presumably returns immediately on
 * failure, so handles acquired before a failing call are not released on those
 * paths - confirm against tools.h.
 */
error_type process_stream(StreamObjectHandle* stream, biguint_type object_number, ushort_type generation_number) {
    error_type result = VANILLAPDF_TOOLS_ERROR_SUCCESS;

    DictionaryObjectHandle* stream_dictionary = NULL;

    ObjectType type_object_type;
    ObjectType subtype_object_type;

    ObjectHandle* type_object = NULL;
    ObjectHandle* subtype_object = NULL;
    NameObjectHandle* type_name = NULL;
    NameObjectHandle* subtype_name = NULL;

    /* contains_type and contains_width were used without a declaration. */
    boolean_type contains_type = VANILLAPDF_RV_FALSE;
    boolean_type contains_subtype = VANILLAPDF_RV_FALSE;
    boolean_type contains_width = VANILLAPDF_RV_FALSE;
    boolean_type contains_height = VANILLAPDF_RV_FALSE;
    boolean_type contains_colorspace = VANILLAPDF_RV_FALSE;

    boolean_type is_type_xobject = VANILLAPDF_RV_FALSE;
    boolean_type is_subtype_image = VANILLAPDF_RV_FALSE;
    boolean_type processed_with_params = VANILLAPDF_RV_FALSE;

    /* Widened copies so the arguments match the %llu / %u conversions exactly. */
    unsigned long long object_number_converted = 0;
    unsigned int generation_number_converted = 0;

    int return_value = 0;
    char output_filename[256] = { 0 };

    RETURN_ERROR_IF_NOT_SUCCESS(StreamObject_GetHeader(stream, &stream_dictionary));

    RETURN_ERROR_IF_NOT_SUCCESS(DictionaryObject_Contains(stream_dictionary, NameConstant_Type, &contains_type));
    RETURN_ERROR_IF_NOT_SUCCESS(DictionaryObject_Contains(stream_dictionary, NameConstant_Subtype, &contains_subtype));

    /* Streams without Type and Subtype cannot be image XObjects: skip them. */
    if (!contains_type || !contains_subtype) {
        goto cleanup;
    }

    RETURN_ERROR_IF_NOT_SUCCESS(DictionaryObject_Find(stream_dictionary, NameConstant_Type, &type_object));
    RETURN_ERROR_IF_NOT_SUCCESS(DictionaryObject_Find(stream_dictionary, NameConstant_Subtype, &subtype_object));

    RETURN_ERROR_IF_NOT_SUCCESS(Object_GetObjectType(type_object, &type_object_type));
    RETURN_ERROR_IF_NOT_SUCCESS(Object_GetObjectType(subtype_object, &subtype_object_type));

    if (type_object_type != ObjectType_Name || subtype_object_type != ObjectType_Name) {
        goto cleanup;
    }

    RETURN_ERROR_IF_NOT_SUCCESS(NameObject_FromObject(type_object, &type_name));
    RETURN_ERROR_IF_NOT_SUCCESS(NameObject_FromObject(subtype_object, &subtype_name));

    RETURN_ERROR_IF_NOT_SUCCESS(NameObject_Equals(type_name, NameConstant_XObject, &is_type_xobject));
    RETURN_ERROR_IF_NOT_SUCCESS(NameObject_Equals(subtype_name, NameConstant_Image, &is_subtype_image));

    if (!is_type_xobject || !is_subtype_image) {
        goto cleanup;
    }

    object_number_converted = object_number;
    generation_number_converted = generation_number;

    return_value = snprintf(output_filename, sizeof(output_filename), "%llu.%u.jpeg", object_number_converted, generation_number_converted);
    if (return_value < 0 || return_value >= (int) sizeof(output_filename)) {
        /* Encoding error or truncated name: release the handles before failing. */
        printf("Could not create destination filename");
        result = VANILLAPDF_TOOLS_ERROR_FAILURE;
        goto cleanup;
    }

    RETURN_ERROR_IF_NOT_SUCCESS(DictionaryObject_Contains(stream_dictionary, NameConstant_Width, &contains_width));
    RETURN_ERROR_IF_NOT_SUCCESS(DictionaryObject_Contains(stream_dictionary, NameConstant_Height, &contains_height));
    RETURN_ERROR_IF_NOT_SUCCESS(DictionaryObject_Contains(stream_dictionary, NameConstant_ColorSpace, &contains_colorspace));

    if (contains_width && contains_height && contains_colorspace) {
        ObjectType width_object_type;
        ObjectType height_object_type;
        ObjectType colorspace_object_type;

        ObjectHandle* width_object = NULL;
        ObjectHandle* height_object = NULL;
        ObjectHandle* colorspace_object = NULL;

        RETURN_ERROR_IF_NOT_SUCCESS(DictionaryObject_Find(stream_dictionary, NameConstant_Width, &width_object));
        RETURN_ERROR_IF_NOT_SUCCESS(DictionaryObject_Find(stream_dictionary, NameConstant_Height, &height_object));
        RETURN_ERROR_IF_NOT_SUCCESS(DictionaryObject_Find(stream_dictionary, NameConstant_ColorSpace, &colorspace_object));

        RETURN_ERROR_IF_NOT_SUCCESS(Object_GetObjectType(width_object, &width_object_type));
        RETURN_ERROR_IF_NOT_SUCCESS(Object_GetObjectType(height_object, &height_object_type));
        RETURN_ERROR_IF_NOT_SUCCESS(Object_GetObjectType(colorspace_object, &colorspace_object_type));

        /* Only this combination of entry types is handed to the DCT filter. */
        if (width_object_type == ObjectType_Integer && height_object_type == ObjectType_Integer && colorspace_object_type == ObjectType_Name) {
            OutputStreamHandle* output_stream = NULL;
            DCTDecodeFilterHandle* encoding_filter = NULL;
            DictionaryObjectHandle* encoding_dictionary = NULL;
            BufferHandle* decoded_body = NULL;
            BufferHandle* encoded_body = NULL;

            /* Build the parameter dictionary from the stream's own entries. */
            RETURN_ERROR_IF_NOT_SUCCESS(DictionaryObject_Create(&encoding_dictionary));
            RETURN_ERROR_IF_NOT_SUCCESS(DictionaryObject_InsertConst(encoding_dictionary, NameConstant_Width, width_object, VANILLAPDF_RV_FALSE));
            RETURN_ERROR_IF_NOT_SUCCESS(DictionaryObject_InsertConst(encoding_dictionary, NameConstant_Height, height_object, VANILLAPDF_RV_FALSE));
            RETURN_ERROR_IF_NOT_SUCCESS(DictionaryObject_InsertConst(encoding_dictionary, NameConstant_ColorSpace, colorspace_object, VANILLAPDF_RV_FALSE));

            RETURN_ERROR_IF_NOT_SUCCESS(StreamObject_GetBody(stream, &decoded_body));

            RETURN_ERROR_IF_NOT_SUCCESS(DCTDecodeFilter_Create(&encoding_filter));
            RETURN_ERROR_IF_NOT_SUCCESS(DCTDecodeFilter_EncodeParams(encoding_filter, decoded_body, encoding_dictionary, &encoded_body));
            RETURN_ERROR_IF_NOT_SUCCESS(DCTDecodeFilter_Release(encoding_filter));

            RETURN_ERROR_IF_NOT_SUCCESS(OutputStream_CreateFromFile(output_filename, &output_stream));
            RETURN_ERROR_IF_NOT_SUCCESS(OutputStream_WriteBuffer(output_stream, encoded_body));
            RETURN_ERROR_IF_NOT_SUCCESS(OutputStream_Flush(output_stream));
            RETURN_ERROR_IF_NOT_SUCCESS(OutputStream_Release(output_stream));

            RETURN_ERROR_IF_NOT_SUCCESS(Buffer_Release(encoded_body));
            RETURN_ERROR_IF_NOT_SUCCESS(Buffer_Release(decoded_body));
            RETURN_ERROR_IF_NOT_SUCCESS(DictionaryObject_Release(encoding_dictionary));

            processed_with_params = VANILLAPDF_RV_TRUE;
        }

        RETURN_ERROR_IF_NOT_SUCCESS(Object_Release(width_object));
        RETURN_ERROR_IF_NOT_SUCCESS(Object_Release(height_object));
        RETURN_ERROR_IF_NOT_SUCCESS(Object_Release(colorspace_object));
    }

    /* Fallback: write the raw stream body as it is. */
    if (processed_with_params != VANILLAPDF_RV_TRUE) {
        OutputStreamHandle* output_stream = NULL;
        BufferHandle* encoded_body = NULL;

        RETURN_ERROR_IF_NOT_SUCCESS(StreamObject_GetBodyRaw(stream, &encoded_body));

        RETURN_ERROR_IF_NOT_SUCCESS(OutputStream_CreateFromFile(output_filename, &output_stream));
        RETURN_ERROR_IF_NOT_SUCCESS(OutputStream_WriteBuffer(output_stream, encoded_body));
        RETURN_ERROR_IF_NOT_SUCCESS(OutputStream_Flush(output_stream));
        RETURN_ERROR_IF_NOT_SUCCESS(OutputStream_Release(output_stream));

        RETURN_ERROR_IF_NOT_SUCCESS(Buffer_Release(encoded_body));
    }

cleanup:
    /* Reached both after a successful write and when the stream is skipped. */
    if (type_object != NULL) {
        RETURN_ERROR_IF_NOT_SUCCESS(Object_Release(type_object));
        type_object = NULL;
    }

    if (subtype_object != NULL) {
        RETURN_ERROR_IF_NOT_SUCCESS(Object_Release(subtype_object));
        subtype_object = NULL;
    }

    if (type_name != NULL) {
        RETURN_ERROR_IF_NOT_SUCCESS(NameObject_Release(type_name));
        type_name = NULL;
    }

    if (subtype_name != NULL) {
        RETURN_ERROR_IF_NOT_SUCCESS(NameObject_Release(subtype_name));
        subtype_name = NULL;
    }

    if (stream_dictionary != NULL) {
        RETURN_ERROR_IF_NOT_SUCCESS(DictionaryObject_Release(stream_dictionary));
        stream_dictionary = NULL;
    }

    return result;
}
/**
 * Forwards stream objects to process_stream; every other object type is ignored.
 *
 * @param obj               object to inspect
 * @param object_number     object number passed on to process_stream
 * @param generation_number generation number passed on to process_stream
 * @return VANILLAPDF_TOOLS_ERROR_SUCCESS, or whatever RETURN_ERROR_IF_NOT_SUCCESS
 *         returns when one of the calls fails.
 */
error_type process_object(ObjectHandle* obj, biguint_type object_number, ushort_type generation_number) {
    /* The variable was used without a declaration. */
    ObjectType type = ObjectType_Undefined;

    RETURN_ERROR_IF_NOT_SUCCESS(Object_GetObjectType(obj, &type));

    if (type == ObjectType_Stream) {
        StreamObjectHandle* stream = NULL;

        RETURN_ERROR_IF_NOT_SUCCESS(StreamObject_FromObject(obj, &stream));
        RETURN_ERROR_IF_NOT_SUCCESS(process_stream(stream, object_number, generation_number));
        RETURN_ERROR_IF_NOT_SUCCESS(StreamObject_Release(stream));
    }

    return VANILLAPDF_TOOLS_ERROR_SUCCESS;
}
/**
 * Walks every entry of one cross-reference section and hands the object
 * referenced by each used or compressed entry to process_object.
 *
 * Entries of any other type are only released. The loop ends as soon as
 * XrefIterator_IsValid fails or reports that the iterator is not valid.
 *
 * @param xref cross-reference section to iterate
 * @return VANILLAPDF_TOOLS_ERROR_SUCCESS, or whatever RETURN_ERROR_IF_NOT_SUCCESS
 *         returns when one of the calls fails.
 */
error_type process_xref(XrefHandle* xref) {
    XrefIteratorHandle* xref_iterator = NULL;

    /* The variable was used without a declaration. */
    boolean_type valid = VANILLAPDF_RV_FALSE;

    RETURN_ERROR_IF_NOT_SUCCESS(Xref_GetIterator(xref, &xref_iterator));

    while (VANILLAPDF_ERROR_SUCCESS == XrefIterator_IsValid(xref_iterator, &valid)
        && VANILLAPDF_RV_TRUE == valid) {
        /* The variable was used without a declaration. */
        XrefEntryType type = XrefEntryType_Null;

        XrefEntryHandle* entry = NULL;
        XrefCompressedEntryHandle* compressed_entry = NULL;
        XrefUsedEntryHandle* used_entry = NULL;

        biguint_type object_number = 0;
        ushort_type generation_number = 0;

        RETURN_ERROR_IF_NOT_SUCCESS(XrefIterator_GetValue(xref_iterator, &entry));
        RETURN_ERROR_IF_NOT_SUCCESS(XrefEntry_GetType(entry, &type));
        RETURN_ERROR_IF_NOT_SUCCESS(XrefEntry_GetObjectNumber(entry, &object_number));
        RETURN_ERROR_IF_NOT_SUCCESS(XrefEntry_GetGenerationNumber(entry, &generation_number));

        if (type == XrefEntryType_Used) {
            ObjectHandle* obj = NULL;

            /* The base entry must be converted before the reference can be read. */
            RETURN_ERROR_IF_NOT_SUCCESS(XrefUsedEntry_FromEntry(entry, &used_entry));
            RETURN_ERROR_IF_NOT_SUCCESS(XrefUsedEntry_GetReference(used_entry, &obj));
            RETURN_ERROR_IF_NOT_SUCCESS(process_object(obj, object_number, generation_number));
            RETURN_ERROR_IF_NOT_SUCCESS(Object_Release(obj));
            RETURN_ERROR_IF_NOT_SUCCESS(XrefUsedEntry_Release(used_entry));
        }

        if (type == XrefEntryType_Compressed) {
            ObjectHandle* obj = NULL;

            RETURN_ERROR_IF_NOT_SUCCESS(XrefCompressedEntry_FromEntry(entry, &compressed_entry));
            RETURN_ERROR_IF_NOT_SUCCESS(XrefCompressedEntry_GetReference(compressed_entry, &obj));
            RETURN_ERROR_IF_NOT_SUCCESS(process_object(obj, object_number, generation_number));
            RETURN_ERROR_IF_NOT_SUCCESS(Object_Release(obj));
            RETURN_ERROR_IF_NOT_SUCCESS(XrefCompressedEntry_Release(compressed_entry));
        }

        RETURN_ERROR_IF_NOT_SUCCESS(XrefEntry_Release(entry));
        RETURN_ERROR_IF_NOT_SUCCESS(XrefIterator_Next(xref_iterator));
    }

    RETURN_ERROR_IF_NOT_SUCCESS(XrefIterator_Release(xref_iterator));

    return VANILLAPDF_TOOLS_ERROR_SUCCESS;
}
/**
 * Runs process_xref on every cross-reference section in the file's xref chain.
 *
 * The loop ends as soon as XrefChainIterator_IsValid fails or reports that the
 * iterator is not valid.
 *
 * @param file file handle whose xref chain is iterated
 * @return VANILLAPDF_TOOLS_ERROR_SUCCESS, or whatever RETURN_ERROR_IF_NOT_SUCCESS
 *         returns when one of the calls fails.
 */
error_type process_file(FileHandle* file) {
    XrefChainHandle* chain = NULL;
    XrefChainIteratorHandle* chain_iterator = NULL;

    /* The variable was used without a declaration. */
    boolean_type valid = VANILLAPDF_RV_FALSE;

    RETURN_ERROR_IF_NOT_SUCCESS(File_XrefChain(file, &chain));
    RETURN_ERROR_IF_NOT_SUCCESS(XrefChain_GetIterator(chain, &chain_iterator));

    while (VANILLAPDF_ERROR_SUCCESS == XrefChainIterator_IsValid(chain_iterator, &valid)
        && VANILLAPDF_RV_TRUE == valid) {
        XrefHandle* xref = NULL;

        RETURN_ERROR_IF_NOT_SUCCESS(XrefChainIterator_GetValue(chain_iterator, &xref));
        RETURN_ERROR_IF_NOT_SUCCESS(process_xref(xref));
        RETURN_ERROR_IF_NOT_SUCCESS(Xref_Release(xref));

        RETURN_ERROR_IF_NOT_SUCCESS(XrefChainIterator_Next(chain_iterator));
    }

    RETURN_ERROR_IF_NOT_SUCCESS(XrefChainIterator_Release(chain_iterator));
    RETURN_ERROR_IF_NOT_SUCCESS(XrefChain_Release(chain));

    return VANILLAPDF_TOOLS_ERROR_SUCCESS;
}
/**
 * Saves the data of every inline image found in a page's content instructions.
 *
 * Each instruction in the collection is inspected in order. Instructions that
 * are not content objects, and content objects that are not inline images, are
 * released and skipped. For an inline image the data buffer is written to a
 * file named "<page_number>.<instruction index>".
 *
 * @param page_contents page contents whose instruction collection is scanned
 * @param page_number   number used as the first part of the output file name
 * @return VANILLAPDF_TOOLS_ERROR_SUCCESS, VANILLAPDF_TOOLS_ERROR_FAILURE when
 *         snprintf reports an error, or whatever RETURN_ERROR_IF_NOT_SUCCESS
 *         returns when one of the calls fails.
 */
error_type process_page_contents(PageContentsHandle* page_contents, size_type page_number) {
    ContentInstructionCollectionHandle* instructions = NULL;
    size_type instruction_count = 0;
    size_type index = 0;

    RETURN_ERROR_IF_NOT_SUCCESS(PageContents_GetInstructionCollection(page_contents, &instructions));
    RETURN_ERROR_IF_NOT_SUCCESS(ContentInstructionCollection_GetSize(instructions, &instruction_count));

    for (index = 0; index < instruction_count; ++index) {
        ContentInstructionHandle* instruction = NULL;
        ContentInstructionType kind = ContentInstructionType_Undefined;

        ContentObjectHandle* object = NULL;
        ContentObjectType object_kind = ContentObjectType_Undefined;

        ContentObjectInlineImageHandle* image = NULL;
        DictionaryObjectHandle* image_dictionary = NULL;
        BufferHandle* image_data = NULL;

        OutputStreamHandle* destination = NULL;
        char destination_name[256] = {0};
        int written = 0;

        RETURN_ERROR_IF_NOT_SUCCESS(ContentInstructionCollection_At(instructions, index, &instruction));
        RETURN_ERROR_IF_NOT_SUCCESS(ContentInstruction_GetInstructionType(instruction, &kind));

        /* Only object instructions can hold an inline image. */
        if (kind != ContentInstructionType_Object) {
            RETURN_ERROR_IF_NOT_SUCCESS(ContentInstruction_Release(instruction));
            continue;
        }

        RETURN_ERROR_IF_NOT_SUCCESS(ContentObject_FromInstruction(instruction, &object));
        RETURN_ERROR_IF_NOT_SUCCESS(ContentObject_GetObjectType(object, &object_kind));

        if (object_kind != ContentObjectType_InlineImage) {
            RETURN_ERROR_IF_NOT_SUCCESS(ContentObject_Release(object));
            RETURN_ERROR_IF_NOT_SUCCESS(ContentInstruction_Release(instruction));
            continue;
        }

        RETURN_ERROR_IF_NOT_SUCCESS(ContentObject_ToInlineImage(object, &image));
        RETURN_ERROR_IF_NOT_SUCCESS(ContentObjectInlineImage_GetDictionary(image, &image_dictionary));
        RETURN_ERROR_IF_NOT_SUCCESS(ContentObjectInlineImage_GetData(image, &image_data));

        /* Casts make the arguments match the %llu conversions exactly. */
        written = snprintf(destination_name, sizeof(destination_name), "%llu.%llu",
            (unsigned long long) page_number, (unsigned long long) index);
        if (written < 0) {
            printf("Could not create destination filename");
            return VANILLAPDF_TOOLS_ERROR_FAILURE;
        }

        RETURN_ERROR_IF_NOT_SUCCESS(OutputStream_CreateFromFile(destination_name, &destination));
        RETURN_ERROR_IF_NOT_SUCCESS(OutputStream_WriteBuffer(destination, image_data));
        RETURN_ERROR_IF_NOT_SUCCESS(OutputStream_Flush(destination));
        RETURN_ERROR_IF_NOT_SUCCESS(OutputStream_Release(destination));

        /* Release the handles in the reverse order of their acquisition. */
        RETURN_ERROR_IF_NOT_SUCCESS(Buffer_Release(image_data));
        RETURN_ERROR_IF_NOT_SUCCESS(DictionaryObject_Release(image_dictionary));
        RETURN_ERROR_IF_NOT_SUCCESS(ContentObjectInlineImage_Release(image));
        RETURN_ERROR_IF_NOT_SUCCESS(ContentObject_Release(object));
        RETURN_ERROR_IF_NOT_SUCCESS(ContentInstruction_Release(instruction));
    }

    RETURN_ERROR_IF_NOT_SUCCESS(ContentInstructionCollection_Release(instructions));

    return VANILLAPDF_TOOLS_ERROR_SUCCESS;
}
/**
 * Entry point of the extract command.
 *
 * Every argument must be the "-s" switch followed by a source file name; any
 * other argument, or a trailing "-s" without a value, prints the usage text
 * and fails. When "-s" is given more than once, the last file name is used.
 *
 * The file is opened and initialized, process_file is run on it, and then
 * process_page_contents is run on the contents of every page, with page
 * numbers starting at 1.
 *
 * @param argc number of entries in argv
 * @param argv command arguments; argv[0] is already parsed as an option
 * @return VANILLAPDF_TOOLS_ERROR_SUCCESS,
 *         VANILLAPDF_TOOLS_ERROR_INVALID_PARAMETERS for bad arguments, or
 *         whatever RETURN_ERROR_IF_NOT_SUCCESS returns when a call fails.
 */
int process_extract(int argc, char *argv[]) {
    const char *source_path = NULL;
    int arg_index = 0;

    size_type page_index = 0;
    size_type total_pages = 0;

    FileHandle* file = NULL;
    DocumentHandle* document = NULL;
    CatalogHandle* catalog = NULL;
    PageTreeHandle* page_tree = NULL;

    while (arg_index < argc) {
        int is_source_switch = (strcmp(argv[arg_index], "-s") == 0);

        /* Reject unknown arguments and a "-s" that has no value after it. */
        if (!is_source_switch || arg_index + 1 >= argc) {
            print_extract_help();
            return VANILLAPDF_TOOLS_ERROR_INVALID_PARAMETERS;
        }

        source_path = argv[arg_index + 1];
        arg_index += 2;
    }

    if (source_path == NULL) {
        print_extract_help();
        return VANILLAPDF_TOOLS_ERROR_INVALID_PARAMETERS;
    }

    RETURN_ERROR_IF_NOT_SUCCESS(File_Open(source_path, &file));
    RETURN_ERROR_IF_NOT_SUCCESS(File_Initialize(file));

    RETURN_ERROR_IF_NOT_SUCCESS(process_file(file));

    RETURN_ERROR_IF_NOT_SUCCESS(Document_OpenFile(file, &document));
    RETURN_ERROR_IF_NOT_SUCCESS(Document_GetCatalog(document, &catalog));
    RETURN_ERROR_IF_NOT_SUCCESS(Catalog_GetPages(catalog, &page_tree));
    RETURN_ERROR_IF_NOT_SUCCESS(PageTree_GetPageCount(page_tree, &total_pages));

    for (page_index = 0; page_index < total_pages; ++page_index) {
        /* The loop counts from zero, the page tree is queried from one. */
        size_type page_number = page_index + 1;

        PageObjectHandle* page = NULL;
        PageContentsHandle* contents = NULL;

        RETURN_ERROR_IF_NOT_SUCCESS(PageTree_GetPage(page_tree, page_number, &page));
        RETURN_ERROR_IF_NOT_SUCCESS(PageObject_GetContents(page, &contents));

        RETURN_ERROR_IF_NOT_SUCCESS(process_page_contents(contents, page_number));

        RETURN_ERROR_IF_NOT_SUCCESS(PageContents_Release(contents));
        RETURN_ERROR_IF_NOT_SUCCESS(PageObject_Release(page));
    }

    RETURN_ERROR_IF_NOT_SUCCESS(PageTree_Release(page_tree));
    RETURN_ERROR_IF_NOT_SUCCESS(Catalog_Release(catalog));
    RETURN_ERROR_IF_NOT_SUCCESS(Document_Release(document));
    RETURN_ERROR_IF_NOT_SUCCESS(File_Release(file));

    return VANILLAPDF_TOOLS_ERROR_SUCCESS;
}
XrefEntryType
Required for conversion to derived types.
Definition c_xref.h:94
@ XrefEntryType_Null
This type is not actually used.
Definition c_xref.h:107
@ XrefEntryType_Compressed
Represents compressed entry within cross-reference section.
Definition c_xref.h:125
@ XrefEntryType_Used
Represents used entry within cross-reference section.
Definition c_xref.h:119
Represents memory stored data.
The root of a document's object hierarchy.
Collection of content instructions.
Base class for all content objects and operations.
A sequence of content instructions grouped within a single object.
As an alternative to the image XObjects described in section 8.9.5 - Image Dictionaries,...
The DCTDecode filter decodes grayscale or colour image data that has been encoded in the JPEG baselin...
A dictionary object is an associative table containing pairs of objects.
Represents high-level file access handle.
Represents low-level file access handle.
A name object is an atomic symbol uniquely defined by a sequence of characters.
Base class for syntactic tokens.
Output stream can write sequences of characters and represent other kinds of data.
A content stream is a PDF stream object whose data consists of a sequence of instructions describing ...
The leaves of the page tree are page objects, each of which is a dictionary specifying the attributes...
The pages of a document are accessed through a structure known as the page tree, which defines the or...
Stream object represents compressed data inside document.
An ordered collection of all XrefHandle within the PDF file.
A pointer to XrefHandle within XrefChainHandle collection.
Represents compressed entry within cross-reference section.
Cross-reference entry represents item within XrefHandle.
The cross-reference table contains information that permits random access to indirect objects within ...
A pointer to XrefEntryHandle within XrefHandle collection.
Represents used entry within cross-reference section.
const boolean_type VANILLAPDF_RV_TRUE
Represents the boolean true value.
const boolean_type VANILLAPDF_RV_FALSE
Represents the boolean false value.
ContentObjectType
Derived types of ContentObjectHandle.
Definition c_content_object.h:49
ContentInstructionType
Available content instruction types.
Definition c_content_instruction.h:43
@ ContentObjectType_InlineImage
As an alternative to the image XObjects described in section 8.9.5 - Image Dictionaries,...
Definition c_content_object.h:61
@ ContentInstructionType_Object
A sequence of content instructions grouped within a single object.
Definition c_content_instruction.h:56
const error_type VANILLAPDF_ERROR_SUCCESS
Indicates that the operation completed successfully.
const NameObjectHandle * NameConstant_Type
Usually represents a dictionary type entry.
ObjectType
Derived types of ObjectHandle.
Definition c_object.h:29
@ ObjectType_Undefined
Undefined uninitialized default value, triggers error when used.
Definition c_object.h:34
@ ObjectType_Stream
Stream object represents compressed data inside document.
Definition c_object.h:82
@ ObjectType_Integer
Integer objects represent mathematical integers.
Definition c_object.h:64
@ ObjectType_Name
A name object is an atomic symbol uniquely defined by a sequence of characters.
Definition c_object.h:70
uint32_t error_type
This is return value type of all API functions.
Definition c_types.h:25
uint64_t biguint_type
64-bit unsigned integer type
Definition c_types.h:77
uint16_t ushort_type
16-bit unsigned integer
Definition c_types.h:46
int8_t boolean_type
Boolean type supported in C.
Definition c_types.h:31
uint32_t size_type
Size type defined in standard library.
Definition c_types.h:62