【www.gdgbn.com--apache】

python apache log 获取百度关键词搜索来源
#!/usr/bin/python
# -*- coding: utf-8 -*-

import os, base64, re, fnmatch, imghdr, shutil, pprint, urlparse

# Log file to scan.
# NOTE(review): the non-ASCII characters inside the file name may be an
# artifact of how this snippet was published rather than part of the real
# name -- confirm before running.
log = "seo教程.log"
reader = open(log)  # read-only text mode is the default
# Search engines to recognise.  For each entry, "h" is the host name to
# compare against and "q" holds the "|"-separated names of the query-string
# parameters looked up for that host.
config = {
    "s0": {"h": "www.google.com.hk", "q": "q"},
    "s1": {"h": "www.baidu.com", "q": "wd|word"},
    "s3": {"h": "www.soso.com", "q": "w"},
}


def get_q(x):
    """Return the query parameter names configured for the host of *x*.

    Args:
        x: Any object exposing a ``netloc`` attribute (for example the
            result of ``urlparse.urlparse``).

    Returns:
        A list of parameter names (e.g. ``["wd", "word"]``) when
        ``x.netloc`` equals the "h" of an entry in ``config``; an empty
        list when no entry matches, so the result can always be iterated.
    """
    for engine in config.values():
        if x.netloc == engine["h"]:
            # Split only for the matching engine.
            return engine["q"].split("|")
    return []

# Request part of a log line for a hit on /seo/t.php, e.g.
# "GET /seo/t.php?ref=... HTTP/1.1"; group 1 captures the requested URL.
# IGNORECASE lets the lower-case literals match an upper-case method and
# protocol.  Compiled once here instead of once per log line.
request_re = re.compile(r'.*"get (/seo/t\.php\?.*) http/1\.1".*', re.IGNORECASE)

try:
    for line in reader:
        m = request_re.match(line)
        if not m:
            continue

        # Query string of the logged request; blank values are kept.
        request_qs = urlparse.parse_qs(
            urlparse.urlparse(m.group(1)).query, keep_blank_values=True
        )
        if "ref" not in request_qs:
            # No "ref" parameter on this hit, so there is nothing to analyse.
            continue

        # The "ref" value is itself parsed as a URL (presumably the page the
        # visitor came from -- confirm against the script that logs it).
        ref = urlparse.urlparse(request_qs["ref"][0])
        ref_qs = urlparse.parse_qs(ref.query, keep_blank_values=True)

        # get_q() may yield nothing for a host it does not know; treat that
        # as "no parameters to look up" instead of failing on the iteration.
        for k in get_q(ref) or ():
            if k in ref_qs:
                # Output format: <host>:::<first value of the parameter>
                print(ref.netloc + ":::" + ref_qs[k][0])
finally:
    # Close the log even when processing a line raises.
    reader.close()

本文来源:http://www.gdgbn.com/jiaocheng/29223/