#!/usr/bin/python2.4
# This script will export all the issues (including comments) from a googlecode
# project into an XML File.
#
# Copyright (c) 2008 Shane Mc Cormack
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

from BeautifulSoup import BeautifulSoup,NavigableString
import urllib2,urllib,sys,re,robotparser,codecs

# Project name (used for urls)
project = "dmdirc"
# Regex to get the "start" number from next/prev links. The start parameter
# must be followed by "&" for the link to match.
nextRegex = re.compile(r".*start=([0-9]*)&.*")
# Regex to get the "name" from an attachment url (everything after "name=")
attachmentName = re.compile(r".*name=(.*)$")
# Regex to get updates from comments. Each match yields the groups
# (whole "<b>label:</b>value<br />" entry, label, value).
# Raw strings keep the backslashes in the pattern without relying on Python
# passing unknown escape sequences such as "\/" through unchanged.
updatesRegex = re.compile(r"(<b>(.*?):<\/b>(.*?)<br \/>)")

def writeln(file, string):
	"""Write string to file, followed by a single newline character.

	No encoding is done here; the text is handed to file.write() as-is.
	"""
	line = string + "\n"
	file.write(line)

def getSoup(url):
	"""Download url and return its contents parsed into a BeautifulSoup tree."""
	userAgent = "IssueScraper/0.1"
	# Announce a custom User-Agent so the request is not rejected by UA blocking.
	robotparser.URLopener.version = userAgent
	request = urllib2.Request(url, None, {'User-agent': userAgent})
	# Fetch the raw HTML and pass it straight to the parser.
	return BeautifulSoup(urllib2.urlopen(request).read())

def getList(f, start=0):
	"""Write one page of the project's issue list to f, then follow pagination.

	For every row of the results table that has no id attribute an <issue>
	element is written: one child element per column (named after the column
	header) for all columns but the last, then <Summary> and <Labels> taken
	from the remaining links of the row, followed by whatever getComments()
	writes for the issue.

	f     -- file-like object the XML is written to (via writeln).
	start -- offset of the first issue to list. Values that cannot be
	         converted to an int are treated as 0.

	After the page has been written, the function calls itself with the
	"start" offset of the last pagination link if that offset is greater
	than the current one.
	"""
	url = "http://code.google.com/p/"+project+"/issues/list?can=1&sort=-id"

	# Parse the offset exactly once so the pagination check at the bottom
	# cannot fail on a value that was silently accepted here.
	try:
		offset = int(start)
	except (TypeError, ValueError):
		offset = 0

	if offset > 0:
		url = url+"&start="+str(offset)
		print("Getting issue list (Offset: "+str(offset)+")")
	else:
		print("Getting issue list")

	soup = getSoup(url)

	# Get the headers: the links in the heading row that point at '#'.
	headers = []
	heading = soup.find('tr',{'id':'headingrow'})
	for link in heading.findAll('a'):
		if link.has_key("href") and link["href"] == '#':
			headers.append(link.string)

	# The last header has no element of its own: the link at that position
	# is written as <Summary> and every link after it as a <Label>.
	summaryIndex = len(headers)-1

	# Now get the issues.
	issueTable = soup.find('table',{'id':'resultstable'})
	for row in issueTable.findAll('tr'):
		# Only the header row has an id
		if row.has_key("id"):
			continue

		rowInfo = [link.string for link in row.findAll('a')]
		if summaryIndex < 0 or len(rowInfo) <= summaryIndex:
			# Without a link for every column the row cannot be mapped onto
			# the headers, so it is reported and left out.
			print("Skipping issue list row with too few columns")
			continue

		writeln(f, "\t<issue>")
		issueID = "none"
		for header, value in zip(headers[:summaryIndex], rowInfo):
			writeln(f, "\t\t<"+header+">"+value+"</"+header+">")
			if header == "ID":
				issueID = value

		writeln(f, "\t\t<Summary><![CDATA["+rowInfo[summaryIndex]+"]]></Summary>")
		writeln(f, "\t\t<Labels>")
		for label in rowInfo[summaryIndex+1:]:
			writeln(f, "\t\t\t<Label>"+label+"</Label>")
		writeln(f, "\t\t</Labels>")
		getComments(f, issueID)
		writeln(f, "\t</issue>")

	# See if there are more pages. The last pagination link decides: its
	# "start" offset is used, or -1 if it has none that nextRegex can read.
	pos = -1
	page = soup.find('div',{'class':'pagination'})
	if page is not None:
		for link in page.findAll('a'):
			try:
				pos = int(nextRegex.match(link["href"]).group(1))
			except (KeyError, AttributeError, ValueError):
				# No href, no "start=" in the href, or no digits after it.
				pos = -1

	if pos > offset:
		getList(f, pos)

def _preText(pre):
	"""Return all text nodes below pre joined together, without surrounding whitespace."""
	return ''.join([e for e in pre.recursiveChildGenerator() if isinstance(e,basestring)]).strip()

def _writeAttachments(f, container, indent):
	"""Write an <Attachment> element for each attachment link inside container.

	f         -- file-like object to write to.
	container -- tag that is searched for a table with class "attachment".
	indent    -- whitespace prefix put in front of every element written.
	"""
	attachments = container.find('table',{'class':'attachment'})
	if attachments is None:
		return
	for attachment in attachments.findAll('a'):
		# Links with a target attribute are not exported (presumably they are
		# "view" links rather than downloads -- confirm against the page).
		if attachment.has_key("target") or not attachment.has_key("href"):
			continue
		href = attachment["href"]
		nameMatch = attachmentName.match(href)
		# Keep the attachment even if its url carries no "name=" parameter.
		name = ""
		if nameMatch is not None:
			name = nameMatch.group(1)
		writeln(f, indent+"<Attachment name=\""+name+"\"><![CDATA[http://code.google.com/p/"+project+"/issues/"+href+"]]></Attachment>")

def getComments(f, number):
	"""Write the description, comments, updates and attachments of one issue to f.

	f      -- file-like object the XML is written to (via writeln).
	number -- issue id. If it cannot be converted to an int (getList passes
	          "none" when a row had no ID column) nothing is fetched or
	          written and the function returns immediately.
	"""
	try:
		num = int(number)
	except (TypeError, ValueError):
		return
	url = "http://code.google.com/p/"+project+"/issues/detail?id="+str(num)

	print("Getting Comments for: "+str(num))
	soup = getSoup(url)

	# The issue description: author, date, text and attachments.
	description = soup.find('td',{'class':'vt issuedescription'})
	date = description.find('span',{'class':'date'})["title"]
	person = description.find('a').string
	writeln(f, "\t\t<Description author=\""+person+"\" date=\""+date+"\"><![CDATA[")
	writeln(f, _preText(description.find('pre')))
	writeln(f, "\t\t]]></Description>")
	_writeAttachments(f, description, "\t\t")

	for comment in soup.findAll('td',{'class':'vt issuecomment'}):
		pre = comment.find('pre')
		if pre is None:
			continue

		date = comment.find('span',{'class':'date'})["title"]
		# The first link without a name attribute is taken as the author,
		# the first link with one as the comment number.
		person = ""
		commentNumber = ""
		for link in comment.findAll('a'):
			if not link.has_key("name"):
				if person == "":
					person = link.string
			else:
				if commentNumber == "":
					commentNumber = link.string

		writeln(f, "\t\t<comment number=\""+commentNumber+"\" author=\""+person+"\" date=\""+date+"\">")
		# NOTE(review): .string is None whenever the <pre> does not consist of
		# exactly one text node, so this may also skip the text of comments
		# that contain nested markup -- confirm that this is intended.
		if pre.string is not None:
			writeln(f, "\t\t\t<content><![CDATA[")
			writeln(f, _preText(pre))
			writeln(f, "\t\t\t]]></content>")

		updates = comment.find('div',{'class':'updates'})
		if updates is not None:
			# Descend to the innermost nested div; its rendered contents hold
			# the "<b>label:</b>value<br />" entries.
			innerDiv = updates
			nested = innerDiv.find('div')
			while nested is not None:
				innerDiv = nested
				nested = innerDiv.find('div')

			for update in updatesRegex.findall(innerDiv.renderContents()):
				writeln(f, "\t\t\t<update label=\""+update[1]+"\">"+update[2].strip()+"</update>")

		_writeAttachments(f, comment, "\t\t\t")

		writeln(f, "\t\t</comment>")

# Export every issue of the project into issues.xml (UTF-8 encoded).
f = codecs.open("issues.xml", "w", "utf-8")
try:
	writeln(f, "<?xml version=\"1.0\" encoding=\"UTF-8\"?>")
	writeln(f, "<issues>")
	# For debugging a single issue, call getComments(f, <issue id>) here
	# instead of getList(f).
	getList(f)
	writeln(f, "</issues>")
finally:
	# Close the file even if scraping fails part-way, so that whatever was
	# written so far is flushed to disk and the handle is released.
	f.close()
