#!/usr/bin/python
# -*- coding: utf-8 -*-
#
# dctrl2xml - Debian control data to XML converter
# Copyright © 2005-2010 by Frank S. Thomas <fst@debian.org>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

import bz2, gzip, zipfile
import optparse
import re
import sys
import time
import types

from xml.etree.cElementTree import Element, dump

from debian import deb822

# Module metadata.  __version__ is embedded in the "generator" attribute of
# the emitted <packages> element and in the --version output.
__author__ = 'Frank S. Thomas'
__version__ = '0.17'


class DebianControlParser(object):
    """Turn the text of one Debian control stanza into nested plain data.

    Structured fields (contacts, package relations, file lists, conffiles,
    Debtags, ...) are broken up into dicts and lists of single-key dicts;
    every other field is kept as a string.  Problems detected by the field
    parsers are reported on standard error.
    """

    def __init__(self):
        # Result of the most recent parse_package_string() call.
        self.pkg = {}
        # Lower-cased names of fields holding "Name <email>" style contacts.
        self.contacts = ['changed-by', 'maintainer', 'uploaders',
            'xsbc-original-maintainer', 'original-maintainer']
        # Lower-cased names of fields holding package relationships.
        self.relations = ['breaks', 'build-conflicts', 'build-conflicts-indep',
            'build-depends', 'build-depends-indep', 'conflicts', 'depends',
            'enhances', 'pre-depends', 'provides', 'recommends', 'replaces',
            'suggests']


    def parse_package_string(self, pkg_str):
        """Parse one control stanza and return it as a dict.

        Keys are the lower-cased field names, except that "Package" is
        stored under 'name'.  A "Date" field additionally yields
        'date_iso8601' (when the date can be parsed) and a multi-line
        "Description" is split into the synopsis ('description') and
        'long-description'.  The result is also stored in self.pkg.
        """
        self.pkg = deb822.Deb822(pkg_str)

        pkg = {}
        for orig_field in self.pkg:
            field = orig_field.lower()
            value = self.pkg[orig_field]

            if field in self.contacts:
                pkg[field] = self.parse_contacts(value)

            elif field in self.relations:
                pkg[field] = self.parse_relations(value)

            elif field == 'package':
                pkg['name'] = value

            elif field == 'date':
                pkg[field] = value
                # A malformed date is reported like any other field error
                # instead of aborting the whole conversion.
                try:
                    pkg['date_iso8601'] = self.parse_date(value.strip())
                except ValueError:
                    self._err('E: could not parse date: ' + value)

            elif field in ['files', 'checksums-sha1', 'checksums-sha256']:
                pkg[field] = self.parse_filelist(value)

            elif field == 'conffiles':
                pkg[field] = self.parse_conffilelist(value)

            elif field == 'description':
                synopsis, newline, extended = value.partition('\n')
                synopsis = synopsis.strip()

                if not newline:
                    pkg[field] = value
                elif synopsis:
                    pkg[field] = synopsis
                    pkg['long-description'] = extended
                else:
                    # Empty first line: everything else is the description.
                    pkg[field] = extended

            elif field == 'tag':
                pkg[field] = self.parse_debtags(value)

            elif field == 'closes':
                pkg[field] = self.parse_xsv(value, ' ', 'bug')

            elif field == 'python-version':
                pkg[field] = self.parse_xsv(value, ',', 'version')

            else:
                pkg[field] = value

        self.pkg = pkg
        return self.pkg


    def parse_contacts(self, contacts_str):
        """Parse data of the Maintainer, Uploaders, and Changed-By fields.

        Both "Name <email>" and "email (Name)" entries are recognised.
        Returns a list of {'contact': {'name': ..., 'email': ...}} dicts, or
        the unmodified input string if no contact could be recognised.
        """

        split_re = r'\s*((.+?)\s+?<(.+?)>,?|(.+?)\s+?\((.+?)\),?)'
        s_cons = re.split(split_re, contacts_str)

        if len(s_cons) < 6:
            self._err('E: could not parse contact: ' + contacts_str)
            return contacts_str

        contacts = []

        # re.split() yields, per match, the text before it followed by the
        # five regex groups; only complete runs of six items are contacts.
        for start in range(0, len(s_cons) - 5, 6):
            name, email, alt_email, alt_name = s_cons[start + 2:start + 6]

            if name and email:
                contacts.append({'contact': {'name': name, 'email': email}})
            elif alt_name and alt_email:
                contacts.append({'contact': {'name': alt_name,
                                             'email': alt_email}})

        return contacts


    def parse_filelist(self, files_str):
        """Parse the lines of the Files and Checksums-* fields.

        Recognised lines are "checksum size filename" or
        "checksum size section priority filename".  Returns a dict keyed by
        file type ('deb', 'dsc', 'diff' or 'tarball'); a later file of the
        same type replaces an earlier one.
        """
        files = {}
        # Anchored at the end of the name so that e.g. "foo.debian.tar.gz"
        # is classified as a tarball only and not also as a .deb.
        files_re = (('deb', r'\.deb$'),
                    ('dsc', r'\.dsc$'),
                    ('diff', r'\.diff\.gz$'),
                    ('tarball', r'\.tar\.(gz|bz2|lzma|xz)$'))

        for line in files_str.split('\n'):
            m_file = re.match(r'^\s*([\da-f]{32}.*)', line)
            if m_file:
                fields = m_file.group(1).split()
                if len(fields) < 3:
                    # Checksum, size and file name are the minimum.
                    self._err('E: could not parse File line: ' + line)
                    continue

                for ftype, file_re in files_re:
                    if not re.search(file_re, fields[-1]):
                        continue

                    entry = {'checksum': fields[0], 'size': fields[1]}
                    if len(fields) == 3:
                        entry['filename'] = fields[2]
                    elif len(fields) == 5:
                        entry['section'] = fields[2]
                        entry['priority'] = fields[3]
                        entry['filename'] = fields[4]

                    files[ftype] = entry
                    break

            elif line.strip():
                self._err('E: could not parse File line: ' + line)

        return files


    def parse_conffilelist(self, conffiles_str):
        """Parse the list of conffiles in the Conffiles field.

        Returns a list of {'conffile': {'name': ..., 'md5sum': ...}} dicts.
        """

        conffiles = []

        for line in conffiles_str.split('\n'):
            # Non-greedy name so the whitespace separating it from the
            # checksum never ends up in the name itself.
            m_cfile = re.match(r'^\s*(.+?)\s+([\da-f]{32})', line)
            if m_cfile:
                cfile = {'conffile': {'name': m_cfile.group(1),
                                      'md5sum': m_cfile.group(2)}}
                conffiles.append(cfile)
            elif line.strip():
                self._err('E: could not parse Conffiles line: ' + line)

        return conffiles


    def parse_relations(self, relations_str):
        """Parse a comma separated relationship field such as Depends.

        Returns a list whose items are either a {'package': ...} dict or,
        for "a | b" groups, {'alternative': [{'package': ...}, ...]}.
        Empty entries (e.g. caused by a trailing comma) are skipped.
        """
        relations_list = []

        for relation in relations_str.split(','):
            if not relation.strip():
                continue

            if '|' in relation:
                alts_list = [self.parse_relation(alt)
                             for alt in relation.split('|') if alt.strip()]
                relations_list.append({'alternative': alts_list})
            else:
                relations_list.append(self.parse_relation(relation))

        return relations_list


    def parse_relation(self, relation_str):
        """Parse a single relation like "foo (>= 1.0) [i386 !amd64]".

        Returns {'package': {...}} where the inner dict may contain 'name',
        'relation' and 'version' (for a version constraint) and 'arch', a
        list of {'name': ...} dicts with one entry per listed architecture
        (a leading "!" is kept as part of the name).
        """
        relation = {}
        relation_re = (r'([\w+.-]*)\s*(\(\s*([<>=]{1,2})\s*(.*?)\s*\))?'
                       r'\s*(\[(.*)\])?')

        m_rel = re.match(relation_re, relation_str.strip())
        if m_rel:
            if m_rel.group(1):
                relation['name'] = m_rel.group(1)
            if m_rel.group(2):
                relation['relation'] = m_rel.group(3)
                relation['version'] = m_rel.group(4)

            if m_rel.group(6):
                # Keep every architecture, negated or not, in input order.
                arch_list = [{'name': arch} for arch in m_rel.group(6).split()]
                if arch_list:
                    relation['arch'] = arch_list

        return {'package': relation}


    def parse_debtags(self, debtags_str):
        """Parse the field data of the Tag field (the package's Debtags).

        Returns a dict mapping each facet to a list of {tag: ''} dicts.
        """

        debtags = {}

        # Tags are separated by a comma followed by whitespace (possibly a
        # line break); the commas inside "{a,b}" groups have no whitespace.
        for tag_str in re.split(r',\s+', debtags_str):
            tag_splitted = tag_str.strip().split('::', 1)
            if len(tag_splitted) != 2:
                self._err('W: corrupted tag detected: ' + str(tag_splitted))
                continue

            facet, tag = tag_splitted

            # Replace some characters from tags, otherwise the resulting XML
            # would not be valid.
            tag = tag.replace(':', '-').replace('+', 'p')
            tag = re.sub(r'(^|\W)(\d)', r'\g<1>_\g<2>', tag)

            facet_tags = debtags.setdefault(facet, [])

            # Handle compressed tags, such as "use::{configuring,monitor}".
            c_tags = re.match(r'\{(.*)\}', tag)
            if c_tags:
                for c_tag in c_tags.group(1).split(','):
                    facet_tags.append({c_tag: ''})
            else:
                facet_tags.append({tag: ''})

        return debtags


    def parse_date(self, rfc2822_date_str):
        """Convert a RFC 2822 date to its representation in ISO 8601.

        The zone is taken from the end of the string: "UTC" becomes "Z",
        otherwise the last five characters (e.g. "+0100") are appended
        unchanged.  Raises ValueError if the date part cannot be parsed.
        """

        # With a single-digit day the first 25 characters include the space
        # in front of the zone, which strptime() would reject.
        date_str = rfc2822_date_str[:25].strip()
        if rfc2822_date_str.endswith('UTC'):
            zone_str = 'Z'
        else:
            zone_str = rfc2822_date_str[-5:]

        date = time.strptime(date_str, '%a, %d %b %Y %H:%M:%S')
        iso8601_date_str = time.strftime('%Y-%m-%dT%H:%M:%S', date) + zone_str

        return iso8601_date_str


    def parse_xsv(self, xsv_str, sep, element):
        """Parse a list of X separated values (XSV).

        Returns a list of {element: value} dicts with surrounding whitespace
        removed from each value; empty values are dropped.
        """

        values = [v.strip() for v in xsv_str.split(sep)]
        return [{element: v} for v in values if v]


    def _err(self, error_msg):
        """Print an error message to standard error."""

        sys.stderr.write(error_msg + '\n')
        return


class DebianControl2XMLConverter(object):
    """Build an ElementTree element tree from nested dicts, lists and strings."""

    def __init__(self):
        """Create a converter; no state is set up here."""


    def create_xml_tree(self, pkg):
        """Return a <package> element with one child per item of pkg.

        pkg is a dict mapping element names to values; each value is
        converted with create_node().
        """
        node = Element('package')
        for name, value in pkg.items():
            node.append(self.create_node(name, value))

        return node


    def create_node(self, name, value):
        """Return an element called name that represents value.

        A string becomes the element's text.  A dict adds one child per key.
        A list is expected to contain dicts; every key/value pair of every
        item becomes a child, in list order.  Any other value yields an
        empty element.
        """
        node = Element(name)

        # types.StringTypes covers str and unicode where it exists; fall
        # back to plain str on interpreters that do not provide it.
        string_types = getattr(types, 'StringTypes', (str,))

        if isinstance(value, string_types):
            node.text = value

        elif isinstance(value, dict):
            for new_name in value:
                node.append(self.create_node(new_name, value[new_name]))

        elif isinstance(value, list):
            for item in value:
                for new_name, new_value in item.items():
                    node.append(self.create_node(new_name, new_value))

        return node


def main():
    """Convert the selected control data to XML on standard output.

    The input (a file given on the command line, or standard input) is read
    line by line and split into stanzas at whitespace-only lines; each
    non-empty stanza is parsed and dumped as one <package> element inside a
    surrounding <packages> element.
    """
    opts = parse_options()
    file_obj = get_file_obj(opts.filename)

    parser = DebianControlParser()
    converter = DebianControl2XMLConverter()

    def print_xml(data):
        dump(converter.create_xml_tree(parser.parse_package_string(data)))

    # dump() writes to sys.stdout, so the envelope goes there as well.
    out = sys.stdout
    out.write('<?xml version="1.0" encoding="UTF-8"?>\n')
    out.write('<packages generator="dctrl2xml/' + __version__ + '">\n')

    # Collect the lines of the current stanza and join them once, instead of
    # reading the whole input up front and growing a string line by line.
    stanza = []
    try:
        for line in file_obj:
            stanza.append(line)
            if line.isspace():
                data = ''.join(stanza)
                if data.strip():
                    print_xml(data)
                stanza = []

        # Do not discard the last (or the only) control entry
        # if data input does not end with an empty line.
        data = ''.join(stanza)
        if data.strip():
            print_xml(data)
    finally:
        if file_obj is not sys.stdin:
            file_obj.close()

    out.write('</packages>\n')
    return


def get_file_obj(filename):
    """Return an appropriate file object for filename.

    An empty filename selects standard input.  Names ending in ".gz" or
    ".bz2" are opened with the matching decompressor, zip archives are
    detected by content, and anything else is opened as a regular file.
    For a zip archive the first member is opened (NOTE(review): this assumes
    the archive holds a single control file -- confirm).  If opening fails
    with an IOError, the error is printed to standard error and the program
    exits with the error's first argument as its exit status.
    """

    if filename == '':
        return sys.stdin

    extension = filename.split('.')[-1]

    try:
        if extension == 'gz':
            return gzip.GzipFile(filename, 'r')
        if extension == 'bz2':
            return bz2.BZ2File(filename, 'r')
        if zipfile.is_zipfile(filename):
            # A ZipFile is an archive, not a readable stream: hand back a
            # file-like object for one of its members instead.
            archive = zipfile.ZipFile(filename, 'r')
            members = archive.namelist()
            if not members:
                sys.stderr.write(filename + ': zip archive is empty\n')
                sys.exit(1)
            return archive.open(members[0], 'r')
        return open(filename, 'r')
    except IOError as error:
        sys.stderr.write(str(error) + '\n')
        sys.exit(error.args[0])


def parse_options():
    """Read dctrl2xml's command line and return the parsed option values.

    The only option is -f/--file, stored as "filename"; it defaults to the
    empty string.  Positional arguments are ignored.
    """

    option_parser = optparse.OptionParser(version="%prog " + __version__)

    file_option = {
        'dest': 'filename',
        'default': '',
        'metavar': 'FILE',
        'help': 'read control data from file FILE instead of stdin',
    }
    option_parser.add_option('-f', '--file', **file_option)

    options, _positional = option_parser.parse_args()
    return options


# Run the conversion only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == '__main__':
    main()
