# ef35f - Script to download Gmail chat log history in HTML format (to avoid...

#
# Script to download Gmail chat log history in HTML format 
# (to avoid problems with the eml download that was out of order)
#
# Usage: python download_chat.py LABEL
#
# Before running this script:
#
# 1) Make sure you have libgmail installed and that this script can import it.
# The easiest way to ensure that is to run this script from the directory
# that contains libgmail. You can also add libgmail to your Python path.
#
# 2) Open Firefox 3, log in to your account checking the "Stay signed in"
# checkbox in the login page;
#
# 3) Change the required information below.
#

import os
import time
import libgmail # http://libgmail.sourceforge.net/
import urllib2
import os.path
from urllib2 import urlopen, Request
import cookielib
import random
import sys

################################
#
# Set your information here:
#
################################

my_username = '[email protected]'     # <--- CHANGE THIS! Add YOUR Gmail username
my_password = 'secret'           # <--- CHANGE THIS! Add YOUR Gmail password
output_dir = '/home/you/chatlogs'    # <--- CHANGE THIS! Inform the base directory where the chats will be saved
cookie_file = '/home/you/.mozilla/firefox/randomstring.default/cookies.sqlite'    # <--- CHANGE THIS! Inform the location of the Firefox 3 cookies sqlite database 


try:
    label_to_retrieve = sys.argv[1]
except:
    print 'Please inform the label you want to fetch: python download_chat.py LABEL'
    sys.exit()

random_1 = 10        # You can define the max range for each of the sleep times 
random_2 = 15 
random_3 = 15 
random_4 = 20 


# Function to get the cookiejar from Firefox 3
def sqlite2cookie(filename):
    """Build a cookie jar from a Firefox 3 ``cookies.sqlite`` database.

    Every row of the ``moz_cookies`` table is rewritten as one line of an
    in-memory Netscape-format cookie file, which is then parsed by
    ``cookielib.MozillaCookieJar``.

    Arguments:
        filename -- path of the sqlite database to read.

    Returns:
        A ``cookielib.MozillaCookieJar`` holding the cookies found. They are
        loaded with both ``ignore_discard`` and ``ignore_expires`` set, so
        session and expired cookies are kept as well.

    Raises:
        Whatever the sqlite driver raises when the database cannot be opened
        or has no ``moz_cookies`` table. The connection is closed either way.
    """
    try:
        from cStringIO import StringIO
    except ImportError:
        from io import StringIO
    # Keep preferring pysqlite2 (what this script always used) but fall back
    # to the standard-library driver so the extra package is not mandatory.
    try:
        from pysqlite2 import dbapi2 as sqlite
    except ImportError:
        import sqlite3 as sqlite

    con = sqlite.connect(filename)
    try:
        cur = con.cursor()
        cur.execute("select host, path, isSecure, expiry, name, value from moz_cookies")
        rows = cur.fetchall()
    finally:
        # The original never released the connection.
        con.close()

    ftstr = ["FALSE", "TRUE"]

    s = StringIO()
    s.write(
        "# Netscape HTTP Cookie File\n"
        "# http://www.netscape.com/newsref/std/cookie_spec.html\n"
        "# This is a generated file!  Do not edit.\n"
    )
    for host, path, is_secure, expiry, name, value in rows:
        # Second column is the "domain specified" flag: TRUE exactly when the
        # host starts with a dot.
        s.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (
            host, ftstr[host.startswith('.')], path,
            ftstr[bool(is_secure)], expiry, name, value))

    s.seek(0)

    cookie_jar = cookielib.MozillaCookieJar()
    # _really_load is the private parser behind MozillaCookieJar.load(); it is
    # called directly because the data lives in memory, not in a file on disk.
    cookie_jar._really_load(s, '', True, True)
    return cookie_jar
    
# Based on Collin Anderson's chat history downloader: 
# http://collincode.wordpress.com/2009/07/20/google-chat-history-downloader/    
def thread_search(ga, searchType, **kwargs):
    """Yield the results of a Gmail search one page at a time.

    Arguments:
        ga         -- account object; its _parseSearchResult() is called once
                      per page with the current start offset.
        searchType -- search type passed through to _parseSearchResult().
        **kwargs   -- extra search parameters, passed through unchanged.

    Yields:
        One libgmail.GmailSearchResult per page, built from that page's
        thread entries only.

    The start offset grows by the page's TS_NUM value after every page, and
    the loop ends once it reaches the TS_TOTAL value of the latest summary,
    or as soon as a response carries no D_THREAD entry.
    """
    offset = 0
    # "summary" is only read once offset is non-zero, i.e. after the first
    # page has assigned it.
    while offset == 0 or offset < summary[libgmail.TS_TOTAL]:
        results = ga._parseSearchResult(searchType, offset, **kwargs)
        try:
            raw_threads = results[libgmail.D_THREAD]
        except KeyError:
            # No thread data in this response: stop without yielding a page.
            return

        # An entry whose first element is not itself a list is wrapped in a
        # list, so every entry ends up with the same shape.
        page_threads = [
            entry if type(entry[0]) is libgmail.types.ListType else [entry]
            for entry in raw_threads
        ]

        summary = results[libgmail.D_THREADLIST_SUMMARY][0]
        offset += summary[libgmail.TS_NUM]

        yield libgmail.GmailSearchResult(ga, (searchType, kwargs), page_threads)
    
##############################################################
#
# Start of the script
#
###############################################################

cj = sqlite2cookie(cookie_file) # Get the cookiejar

if cj is not None:
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
    urllib2.install_opener(opener)

# Login to Gmail
ga = libgmail.GmailAccount(my_username, my_password)
ga.login()

# Set the dir where the chat logs will be saved, divided by label
q_label = 'label:%s' % label_to_retrieve
label_output_dir = output_dir + '/' + label_to_retrieve

if not os.path.exists(label_output_dir):
    os.makedirs(label_output_dir)

# Also based in Collin's script: make the query, search for the label with the chat logs
for page in thread_search(ga, "query", q=q_label):
    print "New Page"
    sleepme = random.randint(1, random_1)
    print 'sleeping %s seconds...' % sleepme
    time.sleep(sleepme)  
    for thread in page:
            
        # I didn't see any real reason for this block, as it was preventing the script from downloading
        # all the messages in the thread, so I commented it out
             
        '''
        if thread.info[0] == thread.info[10]:         
            # Common case: Chats that only span one message
            filename = "%s/%s_%s.html" % (label_output_dir,thread.id, thread.id)
            #only download the message if we don't have it already
            if os.path.exists(filename):
                print "already have %s" % filename
                continue
            print "Downloading raw message: %s" % filename,
            
            # Now here, instead of downloading a message...
            #message = ga.getRawMessage(thread.id).decode('utf-8').lstrip()
            
            # ... we will use the thread id and download the HTML file, as suggested
            # by David Tolnay in http://collincode.wordpress.com/2009/07/20/google-chat-history-downloader/#comment-53
            
            send_url = 'http://mail.google.com/mail/?ui=1&view=lg&msg=%s' % thread.id
            send_data = None  # or, for POST instead of GET, txdata=urrlib.urlencode(somedict)
            send_headers =  {'User-agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}
            req = Request(send_url, send_data, send_headers)  # create a request object
            response = urlopen(req)
            message = response.read()
            print "done."
            file(filename, 'wb').write(message)
            sleepme = random.randint(1, random_2)
            print 'sleeping %s seconds...' % sleepme
            time.sleep(sleepme)  
            continue
        '''
        
        # Less common case: A thread that has multiple messages
        print "Looking up messages in thread %s" % thread.id
        sleepme = random.randint(1, random_3)
        print 'sleeping %s seconds...' % sleepme
        time.sleep(sleepme)  
        
        for message in thread:
            filename = "%s/%s_%s.html" % (label_output_dir,thread.id, message.id)
            print filename
            #only download the message if we don't have it already
            if os.path.exists(filename):
                print "already have %s" % filename
                continue
            print "Downloading raw message: %s" % filename,
            send_url = 'http://mail.google.com/mail/?ui=1&view=lg&msg=%s' % message.id
            send_data = None  # or, for POST instead of GET, txdata=urrlib.urlencode(somedict)
            send_headers =  {'User-agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}
            req = Request(send_url, send_data, send_headers)  # create a request object
            response = urlopen(req)
            message = response.read()
            file(filename, 'wb').write(message)
            print "done."
            sleepme = random.randint(1, random_4)
            print 'sleeping %s seconds...' % sleepme
            time.sleep(sleepme)  
# End of script.