Bring elegant by indirection: 10/01/2008

This post is obviously out of date; you should try https://code.google.com/p/pyv8/ instead.
这个帖子已经明显过时了，你应该试试https://code.google.com/p/pyv8/

-------------------------
以前因为写了这段代码, 刷了一下google adsense的链接, 结果google不让我用adsense了.
因为google adsense中javascript代码应该是通过混淆的,当时分析没有找到头绪,就找到了spidermonkey.
这段代码主要是在python中通过spidermonkey类库, 动态执行混淆后的javascript,找到真正链接.

a.py

from spidermonkey import Runtime
import time, urllib2, httplib, random, re
import urllister

_USER_AGENT = ('Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.5) '
               'Gecko/20060731 Ubuntu/dapper-security Firefox/1.5.0.5')


def _fetch(url):
    """Download *url* with the browser-like User-Agent and return the body."""
    request = urllib2.Request(url)
    request.add_header('USer-Agent', _USER_AGENT)
    conn = urllib2.build_opener().open(request)
    try:
        return conn.read()
    finally:
        # Release the connection even when read() raises.
        conn.close()


def get():
    """Fetch one ad page, de-obfuscate its click link and request that link.

    Steps:
      1. Sleep a random 0-9 seconds, then download the ad HTML.
      2. Cut the obfuscated ``jcc``/``ha`` JavaScript functions out of the
         page and patch them so they store the rewritten link in a global
         variable ``phahref`` instead of touching the DOM.
      3. Run the patched script in SpiderMonkey with the first link found
         by ``urllister`` and read ``phahref`` back.
      4. Request the resulting URL.

    Returns None.  Returns early, without the final request, when the page
    does not contain the expected script or no link could be parsed.
    """
    httplib.HTTPConnection.debuglevel = 1

    baseurl = 'http://pagead2.googlesyndication.com'
    dt = str(int(time.time() * 1000))
    print('start sleep...')
    time.sleep(random.randint(0, 9))
    print('%s end sleep...' % dt)
    url = (baseurl + '/pagead/ads?client=ca-pub-6581680843370427&dt=' + dt
           + '&lmt=1156579898&format=468x60_as&output=html'
           '&url=http%3A%2F%2Flocalhost%2Ftest.html'
           '&color_bg=F6F6F6&color_text=9E5205&color_link=B8A80D'
           '&color_url=B8A80D&color_border=9E5205&ad_type=text&cc=100'
           '&u_h=800&u_w=1280&u_ah=779&u_aw=1280&u_cd=24&u_tz=480'
           '&u_his=1&u_java=true&u_nplug=9&u_nmime=91')

    html = _fetch(url)
    print(html)

    regex = re.compile(
        r'function jcc\(a\).*function ha\(a\).*?jcc\(a\).*?\}')
    m = regex.search(html)
    if m is None:
        # Without the script there is nothing to evaluate; the original
        # fell through here and failed on the unbound name `func`.
        return

    func = m.group(0)
    # Replace the DOM lookup/assignment with a plain global variable so the
    # script can run outside a browser.
    func = re.sub(r'pha=document\.getElementById\(a\)', 'phahref=a', func)
    print(func)
    func = re.sub(r'pha\.href', 'phahref', func)
    print(func)
    # Make the element-id check always true.
    func = re.sub(r"a=='aw0'", 'a == a', func)
    print(func)

    parser = urllister.URLLister()
    parser.feed(html)
    parser.close()
    if not parser.href:
        # No link in the page: nothing to rewrite.
        return

    href = baseurl + parser.href[0]

    # NOTE(security): this evaluates JavaScript taken from a remote response.
    javascript = func + ' href="' + href + '"; ha(href); print (href);'
    print(javascript)
    rt = Runtime()
    cx = rt.new_context()
    cx.eval_script(javascript)
    s = cx.get_global("phahref")
    print(s)

    # The response body is not used; only the request itself matters.
    _fetch(s)

if __name__ == '__main__':
    import threading
    import analy

    # NOTE(review): the post labels this file "a.py" but the workers run
    # `analy.get` -- presumably the script is saved as analy.py; confirm.
    workers = []
    for i in range(20):
        print(get)
        worker = threading.Thread(target=analy.get)
        worker.start()
        workers.append(worker)
        print(i)

    # Wait for every worker.  With thread.start_new_thread the main thread
    # reached the end of the script right away, so the process could exit
    # before any request had completed.
    for worker in workers:
        worker.join()

-->

参考网址:

http://wwwsearch.sourceforge.net/python-spidermonkey/
http://pypi.python.org/pypi/python-spidermonkey/0.0.1a

2008-10-12

python 中动态执行 JavaScript代码