python实现获取文件列表中每个文件出现频率最高的词汇

时间：2014-12-18 13:41:20 收藏：0 阅读：135

功能描述：

获取某个路径下的所有文件，提取出每个文件中出现频率最高的前300个词，并保存到数据库当中。

前提：你需要先配置好 nltk 和 MySQLdb。

#!/usr/bin/python
#coding=utf-8
'''
function : This script uses an existing database named mydb, then

           extracts keywords from privacy policy files.

author    : Chicho

date      : 2014/7/28

running   : python key_extract.py -d path_of_file
'''

import sys,getopt
import nltk
import MySQLdb
from nltk.corpus import PlaintextCorpusReader

# Default corpus directory, used when -d/--directory is not given.
corpus_root = ""

# Number of keyN columns (key0 .. key299) kept per file in `filekeywords`.
MAX_KEYWORDS = 300


def parse_corpus_root(argv, default=""):
    """Return the directory passed with -d/--directory in *argv*.

    argv    : command-line arguments without the program name
              (i.e. sys.argv[1:]).
    default : value returned when no directory option is present.

    Raises getopt.GetoptError on an unknown option or a missing argument.
    -h/--help is accepted but has no effect.
    """
    # The long options must be a list of names; passing the single string
    # "directory=help" makes getopt reject `--directory=PATH`.
    opts, _positional = getopt.getopt(argv, "d:h", ["directory=", "help"])

    root = default
    for op, value in opts:
        if op in ("-d", "--directory"):
            root = value
    return root


def top_keywords(freq, limit=MAX_KEYWORDS):
    """Return at most *limit* keys of *freq*, most frequent first.

    freq is any mapping of word -> count (for example an nltk.FreqDist).
    The ordering is computed here rather than taken from freq.keys(), so the
    result does not depend on how the mapping orders or exposes its keys.
    """
    return sorted(freq, key=lambda word: freq[word], reverse=True)[:limit]


def build_insert_sql(keyword_count):
    """Return a parameterized INSERT for one `filekeywords` row.

    The statement fills id, name and key0 .. key<keyword_count - 1>.
    Column names are generated from integers only; every value is left as a
    %s placeholder for the database driver to escape.
    """
    columns = ["id", "name"] + ["key%d" % j for j in range(keyword_count)]
    placeholders = ", ".join(["%s"] * len(columns))
    return "insert into filekeywords (%s) values (%s)" % (
        ", ".join(columns), placeholders)


def main():
    """Store the most frequent words of every file under the corpus root.

    Reads the directory from the command line, connects to MySQL, makes sure
    the keyN columns exist and writes one `filekeywords` row per file.
    """
    # A simpler alternative to option parsing is a positional argument:
    # run `python key_extract.py path_of_file` and read sys.argv[1].
    root = parse_corpus_root(sys.argv[1:], corpus_root)

    # root is the directory of privacy policy files; every file is read.
    reader = PlaintextCorpusReader(root, '.*')
    files = reader.fileids()

    # Placeholders: replace host, user, passwd and `your_port` (an integer)
    # with your own connection settings before running.
    conn = MySQLdb.connect(host='your_personal_host_ip_address',
                           user='rusername',
                           port=your_port,
                           passwd='U_password')
    try:
        curs = conn.cursor()

        # Use utf8 for everything exchanged on this connection.
        conn.set_character_set('utf8')
        curs.execute('set names utf8')
        curs.execute('SET CHARACTER SET utf8;')
        curs.execute('SET character_set_connection=utf8;')

        # The database must already exist (`create database mydb`).
        conn.select_db('mydb')

        # Best effort: add the keyword columns. This stops at the first
        # error, e.g. when the columns exist from an earlier run.
        try:
            for j in range(MAX_KEYWORDS):
                curs.execute(
                    "alter table filekeywords add key%d varchar(45)" % j)
        except MySQLdb.Error as e:
            print(e)

        for file_id, privacyfile in enumerate(files):
            # Keep alphabetic tokens longer than two characters.
            words = [w for w in reader.words(privacyfile)
                     if w.isalpha() and len(w) > 2]
            keywords = top_keywords(nltk.FreqDist(words))

            # One parameterized INSERT per file instead of an INSERT plus
            # one UPDATE per keyword; the driver escapes the values.
            curs.execute(build_insert_sql(len(keywords)),
                         [file_id, privacyfile] + keywords)

        conn.commit()
        curs.close()
    finally:
        # Release the connection even when a statement fails.
        conn.close()


if __name__ == '__main__':
    main()

转载注明出处：http://blog.csdn.net/chichoxian/article/details/42003603