#!/usr/bin/python
#
# See the Usage-string below. 
# This small script is licensed under the modified (OSI-compliant) BSD license
#
# Version 0.1 (06-05-2008)
#

"""
Copyright (c) 2008, Alex de Landgraaf, Aperte
All rights reserved.

Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:

    * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
    * Neither the name of Alex de Landgraaf, Aperte nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"""

import sys, re

# The script takes exactly two arguments: the PDF to read and the PDF to
# write. Anything else prints the usage text and exits with status 1.
if len(sys.argv) != 3:
    print(
        "Usage: " + sys.argv[0] + " input.pdf output.pdf\n\n"
        "pdf_fontfilter is an attempt to filter out multiple instances of the same font\n"
        "from a PDF document, which in certain cases will reduce the size of your\n"
        "(generated) PDF files considerably. It doesn't modify the input.pdf document.\n\n"
        "Use of this tool is at your own risk.\n"
        "It has been hacked together by Alex de Landgraaf, who can be reached via\n"
        "http://alextreme.org"
    )
    sys.exit(1)

# Path of the input document (first command-line argument).
file = sys.argv[1]

# The handle stays open here; it is read and closed further down the script.
f = open(file)

def remove_obj(data, object_id, generation_nr):
    """
    Return a copy of data with one PDF object stripped out.

    data is the PDF document as a list of lines. The object is located by
    a line starting with '<object_id> <generation_nr> obj'; that line,
    every line after it, and the first following line starting with
    'endobj' are all left out of the result. data itself is not modified.

    If no line starts the requested object, the result is simply a copy
    of data.
    """
    start_pat = re.compile(r'^' + str(object_id) + ' ' + str(generation_nr) + ' obj')
    end_pat = re.compile(r'^endobj')

    kept = []
    inside = False  # True while we are between the 'obj' and 'endobj' lines
    for text in data:
        if start_pat.match(text) is not None:
            # Header line of the object we are dropping
            inside = True
            continue
        if inside:
            # Body line, or the closing 'endobj' line: dropped either way
            if end_pat.match(text) is not None:
                inside = False
            continue
        kept.append(text)
    return kept

def get_font(font_refs, font_ref):
    """
    Look up a previously recorded font by name.

    Both font_ref and the entries of font_refs are sequences whose first
    element is the font name. The first entry of font_refs carrying the
    same name as font_ref is returned; False is returned when there is none.
    """
    same_name = (known for known in font_refs if known[0] == font_ref[0])
    return next(same_name, False)

def get_object_byte_offset(data, object_nr, generation_nr, xref=False):
    """
    Return the offset of an object, counted from the start of the document.

    data is the PDF document as a list of lines. The offset is the summed
    length of every line that precedes the first line starting with
    '<object_nr> <generation_nr> obj'.

    When xref is true, object_nr and generation_nr are ignored and the
    line starting with 'endobj xref' is searched for instead; the result
    then points at the 'xref' keyword inside that line rather than at the
    start of the line.

    If no line matches, the summed length of all lines is used.
    """
    if xref:
        # Look for the start of the cross-reference section instead
        wanted = re.compile(r'^endobj xref')
    else:
        wanted = re.compile(r'^' + str(object_nr) + ' ' + str(generation_nr) + ' obj')

    consumed = 0
    for text in data:
        if wanted.match(text) is not None:
            break
        consumed += len(text)

    if not xref:
        return consumed
    # Skip past the 'endobj ' that shares a line with the xref keyword
    return consumed + len('endobj ')

def get_xref_section(data):
    """
    Return the lines of the cross-reference section, including its header.

    data is the PDF document as a list of lines. The document is scanned
    from the end towards the start: once a line starting with 'trailer'
    has been passed, every line is collected until a line starting with
    'endobj xref' is reached. The collected lines are returned in document
    order; neither the 'endobj xref' line nor the 'trailer' line is part
    of the result.

    Only a single xref section is supported, although the PDF format
    allows several. data is only read, never modified.
    """
    re_xref_start = re.compile(r'^endobj xref')
    re_trailer = re.compile(r'^trailer')

    xref = []
    at_xref = False
    # The cross-reference section is near the end, so walk backwards.
    # reversed() gives a read-only view, leaving the caller's list alone.
    for line in reversed(data):
        if re_xref_start.match(line) is not None:
            break
        if re_trailer.match(line) is not None:
            at_xref = True
        elif at_xref:
            xref.append(line)

    # Lines were gathered back-to-front; restore document order
    xref.reverse()
    return xref

def flag_xref_free(xref_section, deleted):
    """
    Mark the xref entries of deleted objects as free.

    xref_section is the list of xref lines with the header at index 0, so
    the entry for object number N sits at index N + 1. deleted is a list
    of (object_id, generation_nr) tuples; only the object_id is used, and
    it is converted with int(). Ids with no entry in xref_section are
    ignored.

    We use an alternative but allowed method by avoiding the linked-list
    structure normally used for freed objects. Instead, each freed object
    points back to object_id 0 and has generation nr 65535.
    This is in accordance with the PDF 1.7 reference manual.

    xref_section is updated in place and also returned.
    """
    # Collect the ids once so each entry needs a single set lookup instead
    # of a scan over the whole deleted list.
    freed_ids = set(int(d[0]) for d in deleted)

    for object_id in range(len(xref_section) - 1):
        if object_id in freed_ids:
            # +1 to step over the xref-header
            xref_section[object_id + 1] = "0000000000 65535 f \n"
    return xref_section

def update_xref_offsets(data, xref_section):
    """
    Recalculate the offsets stored in the in-use entries of xref_section.

    data is the PDF document as a list of lines. xref_section is the list
    of xref lines with the header at index 0, so the entry at index N + 1
    describes object number N. Every entry of the form
    '<offset> <generation> n' is rewritten with the summed length of all
    lines of data that precede the first line starting with
    '<N> <generation> obj'. If data has no such line, the summed length of
    the whole document is used. Other entries (e.g. free ones) are left
    untouched.

    This is required after objects have been removed, since the stored
    offsets no longer point at the objects.

    xref_section is updated in place and also returned.
    """
    re_xref_object = re.compile(r'^(\d+) (\d+) n')
    re_object_start = re.compile(r'^(\d+) (\d+) obj')

    # Index the document once: map the (id, generation) text of each
    # 'obj' line to the offset of its first occurrence. This replaces a
    # full rescan of the document for every single xref entry.
    first_offsets = {}
    position = 0
    for line in data:
        start = re_object_start.match(line)
        if start is not None:
            first_offsets.setdefault(start.groups(), position)
        position += len(line)
    document_length = position

    for index in range(1, len(xref_section)):  # index 0 is the xref-header
        match = re_xref_object.match(xref_section[index])
        if match is None:
            continue

        # group(1) is the old offset we are replacing; the object number
        # is implied by the entry's position below the header.
        object_id = index - 1
        generation_nr = int(match.group(2))
        offset = first_offsets.get((str(object_id), str(generation_nr)), document_length)

        xref_section[index] = '%010d %05d n \n' % (offset, generation_nr)
    return xref_section

def replace_xref_section(data, xref_section):
    """
    Swap the document's xref lines for the ones in xref_section.

    data is the PDF document as a list of lines. The first line starting
    with 'endobj xref' marks where the section begins: the
    len(xref_section) lines that follow it are replaced by xref_section
    and a new list is returned. When no such line exists, data is
    returned as it was passed in.
    """
    marker = re.compile(r'^endobj xref')
    for position, text in enumerate(data):
        if marker.match(text) is None:
            continue
        first_old = position + 1                    # line right after the marker
        after_old = first_old + len(xref_section)   # first line to keep again
        return data[:first_old] + xref_section + data[after_old:]
    return data

def update_startxref_offset(data):
    """
    Rewrite the offset to the cross-reference section.

    data is the PDF document as a list of lines; its second-to-last line
    is overwritten with the freshly calculated offset of the xref keyword
    (the keyword itself, not the start of the 'endobj xref' line),
    followed by a newline.

    data is updated in place and also returned.
    """
    # The object and generation numbers are ignored in xref mode
    xref_offset = get_object_byte_offset(data, 0, 0, True)
    data[-2] = "%s\n" % (xref_offset,)
    return data

def replace_reference(data, old_obj_id, old_ref_nr, new_obj_id, new_ref_nr):
    """
    Point FontFile2 references at a different object.

    data is the PDF document as a list of lines. Every line starting with
    '/FontFile2 <old_obj_id> <old_ref_nr> R' is replaced as a whole by
    '/FontFile2 <new_obj_id> <new_ref_nr> R' plus a newline; all other
    lines are carried over unchanged. A new list is returned.
    """
    old_ref = re.compile(r'^/FontFile2 ' + str(old_obj_id) + ' ' + str(old_ref_nr) + ' R')
    new_line = "/FontFile2 " + str(new_obj_id) + " " + str(new_ref_nr) + " R\n"
    return [
        new_line if old_ref.match(text) is not None else text
        for text in data
    ]

def filter_fonts(data):
    """
    Given a list of lines containing a PDF document, replace all
    duplicate font objects by a reference to the first font object found.
    Also update the cross-reference section using the list of deleted objects,
    and recalculate the byte-offsets of all other objects in this document.

    A font is identified by the name on a '/FontName /<name>' line. The
    first '/FontFile2 <id> <gen> R' line seen after such a name (and before
    the next 'endobj' line) is remembered as the reference for that name.
    A later '/FontFile2' reference under the same name that points at a
    different object causes that object to be removed and every matching
    reference line to be rewritten to the remembered one.

    After each removal the scan starts over from the first line, because
    data has been rebuilt. Once a complete pass removes nothing, the xref
    section and the startxref offset are brought up to date.

    Returns the updated list of lines.
    """

    font_refs = []   # (font name, object id, generation nr) of each first-seen font
    deleted = []     # (object id, generation nr) of every removed object
    finished = False

    re_font = re.compile(r'^/FontName /(.*)')
    re_fontfile = re.compile(r'^/FontFile2 (\d+) (\d+) R')
    re_endobj = re.compile(r'^endobj')

    # Each iteration is one scan over data; it repeats until a scan
    # completes without removing anything.
    while finished != True:
        curr_font = ""
        object_id = -1
        generation_nr = -1

        # Assume this pass is the last one; a removal below resets the flag
        finished = True
        for line in data:

            # check if we're entering a font-object
            match = re_font.match(line)
            if match:
                curr_font = match.group(1)

            # check if we've found a reference to a fontfile-stream object
            # (only counts once a /FontName line has been seen in this object)
            match = re_fontfile.match(line)
            if curr_font != "" and match:
                # Both values are kept as the matched strings, not ints
                object_id = match.group(1)
                generation_nr = match.group(2)

                font_ref = (curr_font, object_id, generation_nr,)

                # Lookup is by font name only (see get_font)
                font = get_font(font_refs, font_ref)
                if font == False: # Doesn't exist, add it
                    font_refs += [font_ref]
                else: 
                    # Check if the current font isn't just a reference to the first font
                    if font[1] == object_id and font[2] == generation_nr:
                        pass
                    else:
                        # Exists, remove the object from the PDF and replace this reference with the font reference returned

                        data = remove_obj(data, font_ref[1], font_ref[2])
                        data = replace_reference(data, font_ref[1], font_ref[2], font[1], font[2])
                        deleted += [(font_ref[1], font_ref[2])]

                        # Re-process the data without the removed object.
                        # data is a new list now, so the running loop over
                        # the old list is abandoned.
                        finished = False
                        break


            # check if we're exiting a font-object
            match = re_endobj.match(line)
            if match:
                curr_font = ""
                object_id = -1
                generation_nr = -1

    # Once we are done with removing the fonts, one thing remains:
    # Updating the Cross-reference table (at the end of the PDF)
    # The list of deleted objects is necessary for this

    # Offsets are recalculated first, then the removed objects are flagged
    # as free, and only then is the section written back into data.
    xref_section = get_xref_section(data)
    xref_section = update_xref_offsets(data, xref_section)
    xref_section = flag_xref_free(xref_section, deleted)
    data = replace_xref_section(data, xref_section)
    data = update_startxref_offset(data)
    return data

# Read the whole input document as a list of lines. The handle was opened
# near the top of the script; close it even when reading fails.
try:
    data = f.readlines()
finally:
    f.close()

data = filter_fonts(data)

# Write the filtered document to the output path (second command-line
# argument). The with-block closes the file even if writing fails.
with open(sys.argv[2], 'w') as f:
    f.writelines(data)

