#
# Script to download Gmail chat log history in HTML format
# (to avoid problems with the eml download that was out of order)
#
# Usage: python download_chat.py LABEL
#
# Requires Python 2: it imports urllib2, cookielib and cStringIO.
#
# Before running this script:
#
# 1) Make sure you have libgmail installed. This script needs to find libgmail.
#    The easiest way of making sure of that is running this script from the same
#    directory where libgmail is. You can also add libgmail to your path.
#
# 2) Open Firefox 3, log in to your account checking the "Stay signed in"
#    checkbox in the login page;
#
# 3) Change the required information below.
#

import cookielib
import os
import os.path
import random
import sys
import time
import urllib2
from urllib2 import urlopen, Request

import libgmail  # http://libgmail.sourceforge.net/

################################
#
# Set your information here:
#
################################

# NOTE(review): credentials are stored in plain text in this file; keep it private.
my_username = '[email protected]'  # <--- CHANGE THIS! Add YOUR Gmail username
my_password = 'secret'  # <--- CHANGE THIS! Add YOUR Gmail password
output_dir = '/home/you/chatlogs'  # <--- CHANGE THIS! Base directory where the chats will be saved
cookie_file = '/home/you/.mozilla/firefox/randomstring.default/cookies.sqlite'  # <--- CHANGE THIS! Location of the Firefox 3 cookies sqlite database

try:
    label_to_retrieve = sys.argv[1]
except IndexError:
    # Only a missing argument is expected here; anything else should surface.
    print('Please inform the label you want to fetch: python download_chat.py LABEL')
    sys.exit(1)

# Upper bound, in seconds, of each random pause (the lower bound is always 1).
random_1 = 10  # before processing each page of search results
random_2 = 15  # currently unused: belonged to the disabled single-message shortcut
random_3 = 15  # before listing the messages of a thread
random_4 = 20  # after each downloaded message

# Header written in front of the generated cookie lines. The first line is the
# magic string cookielib.MozillaCookieJar looks for when loading a cookie file.
NETSCAPE_COOKIE_HEADER = """\
# Netscape HTTP Cookie File
# http://www.netscape.com/newsref/std/cookie_spec.html
# This is a generated file!  Do not edit.
"""

# URL of the HTML view of a single message, as suggested by David Tolnay in
# http://collincode.wordpress.com/2009/07/20/google-chat-history-downloader/#comment-53
MESSAGE_URL = 'http://mail.google.com/mail/?ui=1&view=lg&msg=%s'


# Function to get the cookiejar from Firefox 3
def sqlite2cookie(filename):
    """Return a cookielib.MozillaCookieJar built from a Firefox cookie database.

    Reads host, path, isSecure, expiry, name and value from the ``moz_cookies``
    table of the SQLite file *filename*, renders every row as one tab-separated
    line of a Netscape cookie file held in memory, and loads that text into a
    new MozillaCookieJar.

    Errors raised by the SQLite driver (missing file, missing table, ...) are
    not caught here.
    """
    from cStringIO import StringIO
    try:
        from pysqlite2 import dbapi2 as sqlite
    except ImportError:
        # The standard-library module exposes the same DB-API interface.
        import sqlite3 as sqlite

    con = sqlite.connect(filename)
    try:
        cur = con.cursor()
        cur.execute("select host, path, isSecure, expiry, name, value from moz_cookies")
        rows = cur.fetchall()
    finally:
        # The original never released the database handle.
        con.close()

    ftstr = ["FALSE", "TRUE"]

    s = StringIO()
    s.write(NETSCAPE_COOKIE_HEADER)
    for host, path, is_secure, expiry, name, value in rows:
        # The second column is TRUE when the host starts with a dot.
        s.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (
            host, ftstr[host.startswith('.')], path,
            ftstr[bool(is_secure)], expiry, name, value))
    s.seek(0)

    cookie_jar = cookielib.MozillaCookieJar()
    # _really_load is a private cookielib method; it is used because the public
    # load() only accepts a file name, not an in-memory file object.
    cookie_jar._really_load(s, '', True, True)
    return cookie_jar


# Based on Collin Anderson's chat history downloader:
# http://collincode.wordpress.com/2009/07/20/google-chat-history-downloader/
def thread_search(ga, searchType, **kwargs):
    """Yield one libgmail.GmailSearchResult per fetched page of a search.

    ga         -- logged-in libgmail.GmailAccount
    searchType -- search type passed through to ga._parseSearchResult
    kwargs     -- extra search arguments, passed through unchanged

    Paging stops when a response has no D_THREAD entry, when the running
    index reaches the TS_TOTAL value of the thread-list summary, or when the
    summary reports a per-page count of 0.
    """
    index = 0
    total = None  # unknown until the first page has been parsed
    while total is None or index < total:
        items = ga._parseSearchResult(searchType, index, **kwargs)
        try:
            threads = items[libgmail.D_THREAD]
        except KeyError:
            break

        threadsInfo = []
        for th in threads:
            # Wrap entries whose first element is not itself a list, as the
            # original did (presumably the single-result shape -- TODO confirm).
            if not isinstance(th[0], list):
                th = [th]
            threadsInfo.append(th)

        summary = items[libgmail.D_THREADLIST_SUMMARY][0]
        total = summary[libgmail.TS_TOTAL]
        per_page = summary[libgmail.TS_NUM]
        index += per_page

        yield libgmail.GmailSearchResult(ga, (searchType, kwargs), threadsInfo)

        if not per_page:
            # Without this the index would never advance and the first page
            # would be requested forever.
            break


def polite_sleep(max_seconds):
    """Announce and sleep a random whole number of seconds in [1, max_seconds]."""
    sleepme = random.randint(1, max_seconds)
    print('sleeping %s seconds...' % sleepme)
    time.sleep(sleepme)


def fetch_message_html(message_id):
    """Download and return the HTML view of the message with id *message_id*."""
    send_url = MESSAGE_URL % message_id
    send_data = None  # or, for POST instead of GET, send_data=urllib.urlencode(somedict)
    send_headers = {'User-agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}
    req = Request(send_url, send_data, send_headers)  # create a request object
    response = urlopen(req)
    try:
        return response.read()
    finally:
        response.close()


def save_file(filename, data):
    """Write *data* to *filename* in binary mode and always close the file."""
    out = open(filename, 'wb')
    try:
        out.write(data)
    finally:
        out.close()


##############################################################
#
# Start of the script
#
###############################################################

cj = sqlite2cookie(cookie_file)  # Get the cookiejar
if cj is not None:
    # Install the cookies globally so every urlopen() call below sends them.
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
    urllib2.install_opener(opener)

# Login to Gmail
ga = libgmail.GmailAccount(my_username, my_password)
ga.login()

# Set the dir where the chat logs will be saved, divided by label
q_label = 'label:%s' % label_to_retrieve
label_output_dir = os.path.join(output_dir, label_to_retrieve)
if not os.path.exists(label_output_dir):
    os.makedirs(label_output_dir)

# Also based in Collin's script: make the query, search for the label with the chat logs
for page in thread_search(ga, "query", q=q_label):
    print("New Page")
    polite_sleep(random_1)

    for thread in page:
        # Collin's script had a shortcut for chats that span a single message.
        # It prevented this script from downloading all the messages in the
        # thread, so every thread is handled message by message instead.
        print("Looking up messages in thread %s" % thread.id)
        polite_sleep(random_3)

        for message in thread:
            filename = os.path.join(
                label_output_dir, "%s_%s.html" % (thread.id, message.id))
            print(filename)

            # only download the message if we don't have it already
            if os.path.exists(filename):
                print("already have %s" % filename)
                continue

            # Written without a newline so "done." ends the same line; flushed
            # so the progress is visible while the download is running.
            sys.stdout.write("Downloading raw message: %s " % filename)
            sys.stdout.flush()

            html = fetch_message_html(message.id)
            save_file(filename, html)
            print("done.")

            polite_sleep(random_4)