Python函数统计词频,利用python进行词频统计
一个txt文档已经用结巴分词分完词,怎样用Python对这个分好词的文档统计词频?求脚本,非常感谢。
#!/usr/bin/env python3
创新互联建站服务项目包括安平网站建设、安平网站制作、安平网页制作以及安平网络营销策划等。多年来,我们专注于互联网行业,利用自身积累的技术优势、行业经验、深度合作伙伴关系等,向广大中小型企业、政府机构等提供互联网行业的解决方案,安平网站推广取得了明显的社会效益与经济效益。目前,我们服务的客户以成都为中心已经辐射到安平省份的部分城市,未来相信会继续扩大服务区域并继续获得客户的支持与信任!
#-*- coding:utf-8 -*-
import os,random
# Input: a pre-segmented text file named aa.txt in the current working
# directory, with tokens separated by whitespace.
filename='aa.txt'
dirname=os.getcwd()
f_n=os.path.join(dirname,filename)
# Disabled fixture generator kept for manual testing: it writes 20 lines, each
# holding 1-20 random integers in the range 1-20, to the input file.
'''
test=''
for i in range(20):
    for j in range(random.randint(1,20)):
        test+=str(random.randint(1,20))+' '
    test+='\n'
with open(f_n,'w') as wf:
    wf.write(test)
'''
with open(f_n) as f:
    s=f.readlines()
# Flatten every line into one list of tokens. str.split() with no argument
# splits on any run of whitespace and never yields empty strings, so blank
# lines and repeated spaces no longer show up as an empty "word" in the counts.
words=[token for line in s for token in line.split()]
def geshi(a,b,c):
    """Format one newline-terminated report row from three values.

    Each value is converted with str() and padded by alignment(): the first
    and last with alignment's default width, the middle one to width 18.
    """
    columns = (
        alignment(str(a)),
        alignment(str(b), 18),
        alignment(str(c)),
    )
    return ''.join(columns) + '\n'
# Alignment helper for mixed Chinese/English text: str.format() padding counts
# characters, not display columns, so CJK text ends up misaligned.
# (The reference link in the original comment was lost.)
def alignment(str1, space=8, align = 'left'):
    """Pad str1 with spaces to a display width of `space` columns.

    ASCII characters count as one column and every other character as two,
    which equals the GB2312 byte length the original code measured, but does
    not raise UnicodeEncodeError for characters outside GB2312.

    align selects where the padding goes: 'left' (text first), 'right'
    (padding first) or 'center' (extra column goes to the right side).
    Several spellings of each are accepted; any other value returns str1
    unpadded. Text already wider than `space` is returned unchanged.
    """
    length = sum(1 if ord(ch) < 128 else 2 for ch in str1)
    # Number of padding columns still needed; never negative.
    space = space - length if space >= length else 0
    if align in ['left','l','L','Left','LEFT']:
        str1 = str1 + ' ' * space
    elif align in ['right','r','R','Right','RIGHT']:
        str1 = ' ' * space + str1
    elif align in ['center','c','C','Center','CENTER','centre']:
        str1 = ' ' * (space // 2) + str1 + ' ' * (space - space // 2)
    return str1
from collections import Counter

# (word, frequency) pairs ordered by descending frequency, ties broken by the
# word in ascending order. Counter tallies every word in a single pass instead
# of calling words.count() once per distinct word.
wordcount=sorted(Counter(words).items(),key=lambda l:(-l[1],l[0]))
# Report rows: a header, then one row per word as rank / word / frequency.
# enumerate() supplies the 1-based rank directly, avoiding a list.index()
# search for every row.
rows=[geshi('序号','词','频率')]
rows.extend(geshi(rank,w,c) for rank,(w,c) in enumerate(wordcount,1))
w_s=''.join(rows)
# Write the report to ar.txt next to the input file.
writefile='ar.txt'
w_n=os.path.join(dirname,writefile)
with open(w_n,'w') as wf:
    wf.write(w_s)
如何用python实现英文短文的双词频统计?
import re
try:
    from itertools import imap as map  # Python 2: lazy map
except ImportError:
    pass  # Python 3: the built-in map is already lazy
from collections import Counter


def parserwords(sentence):
    """Return the adjacent word pairs (bigrams) of sentence, in order.

    The sentence is lower-cased and split into runs of word characters; a
    sentence with fewer than two words yields an empty list.
    """
    tokens = re.findall(r'\w+', sentence.lower())
    return list(zip(tokens, tokens[1:]))


context = """
Do you hear the people sing, singing a song of angry men.
It is the music of a people, who will not be slaves again,
when the beating of your heart echoes the beating of the drums.
There is a life about to start when tomorrow comes.
"""

# Bigrams never cross a comma or full stop: the text is cut into clauses first.
words = []
for sentence in map(parserwords, re.split(r'[,.]', context.lower())):
    words.extend(sentence)

# How often each word starts a bigram, and how often each bigram occurs.
prefixcounter = Counter(word[0] for word in words)
counter = Counter(words)

# meter[(pre, post)] is the share of bigrams starting with `pre` that
# continue with `post`.
meter = {}
for (pre, post), count in counter.items():
    meter[(pre, post)] = 1. * count / prefixcounter[pre]

# Highest ratio first; equal ratios are ordered by the bigram itself.
result = sorted(meter.items(), key=lambda item: (-item[1], item[0]))
print(result[:5])
如何用python对文章中文分词并统计词频
使用结巴分词,统计频率可以使用Counter,即from collections import Counter
用Python统计词频
def statistics(astr):
    """Return the distinct tab-separated fields of astr in first-seen order.

    Newline characters are removed from the final field *before* duplicates
    are dropped, so a last field such as "a\\n" is recognised as a repeat of
    an earlier "a" instead of being kept as a second entry.
    """
    fields = astr.split("\t")
    # str.split always returns at least one element, so [-1] is safe.
    fields[-1] = fields[-1].replace("\n", "")
    # dict preserves insertion order, giving an order-stable de-duplication.
    return list(dict.fromkeys(fields))
if __name__ == "__main__":
    # Tally, for every field returned by statistics(), how many lines of
    # test_data.txt produced it, then print one "field count" pair per line.
    code_doc = {}
    with open("test_data.txt", "r", encoding='utf-8') as fs:
        for ln in fs:
            for t in statistics(ln):
                code_doc[t] = code_doc.get(t, 0) + 1
    for keys, total in code_doc.items():
        print(keys + ' ' + str(total))
Python词频统计问题
# Expects a text file words.txt, e.g. containing:
#   stu ml ds ml stu stuee zkz wxj Zkz Wxj
File = "words.txt"
number_list =[]
with open(File) as f:
    for line in f:
        number_list.extend(line.split())

# One pass over the words: word -> [index of first occurrence, frequency].
# This replaces a list.index() and list.count() scan per distinct word, and
# reports words in first-occurrence order instead of arbitrary set order.
stats = {}
for position, word in enumerate(number_list):
    entry = stats.setdefault(word, [position, 0])
    entry[1] += 1

# Open the output once (append mode, as before) and write one record per line;
# previously the records were appended back-to-back with no separator.
with open('Q1.txt','a') as F:
    for word, (first_index, count) in stats.items():
        L=[word,first_index,count]
        print(L)  # word, position of first occurrence, frequency
        F.write(str(L) + '\n')
如何用python和jieba分词,统计词频?
#! python3
# -*- coding: utf-8 -*-
import os, codecs
import jieba
from collections import Counter
def get_words(txt):
    """Segment txt with jieba and print the 100 most frequent tokens.

    Tokens of a single character and the literal '\\r\\n' line break are
    ignored. Each printed row shows the token (left-padded for tokens shorter
    than five characters), a bar of one '*' per three occurrences, and the
    count itself.
    """
    c = Counter(
        x for x in jieba.cut(txt)
        if len(x) > 1 and x != '\r\n'
    )
    print('常用词频度统计结果')
    for (k,v) in c.most_common(100):
        print('%s%s %s %d' % (' '*(5-len(k)), k, '*'*int(v/3), v))
if __name__ == '__main__':
    # Read the whole UTF-8 input file, then report its word frequencies.
    with codecs.open('19d.txt', 'r', 'utf8') as source:
        document = source.read()
    get_words(document)
网站名称:Python函数统计词频,利用python进行词频统计
文章路径:http://scyanting.com/article/hcgges.html