# -*- coding: utf-8 -*-
import math
import os
import re
import string
import sys
import time
''' ------------------------------------------------------- '''
#統計關鍵詞及個數
def CountKey(fileName, resultName):
    """Count the space-separated keywords of a text file and save the tally.

    Every line of ``fileName`` is split on single spaces; the empty tokens
    produced by consecutive spaces are ignored.  Keywords are ranked by
    descending number of occurrences and written to ``resultName`` as one
    ``<keyword:count>`` entry per line.  The line count, every line and its
    token list are echoed to stdout, followed by ``END``.

    Args:
        fileName: Path of the text file to analyse.
        resultName: Path of the report file, created or overwritten.

    Returns:
        A list of ``(keyword, count)`` tuples sorted by descending count,
        or ``None`` if anything went wrong (the error is printed, not
        raised).
    """
    # Local import keeps this function self-contained.
    from collections import Counter

    try:
        # Read the input once; ``with`` guarantees the handle is closed even
        # on failure and avoids closing a file that was never opened.
        with open(fileName, 'r') as source:
            lines = source.readlines()
        print(u'文件行數: ' + str(len(lines)))

        # Tally format: <key: value> = <keyword: number of occurrences>.
        table = Counter()
        for line in lines:
            line = line.rstrip('\n')
            print(line)
            words = line.split(' ')  # split on single spaces

            # On Python 2 the repr of a list escapes non-ASCII bytes;
            # undo that so Chinese keywords stay readable.  Python 3
            # strings have no ``decode`` and already print readably.
            shown = str(words)
            if hasattr(shown, 'decode'):
                shown = shown.decode('string_escape')
            print(shown)

            # Consecutive spaces yield empty tokens, which are not keywords.
            table.update(word for word in words if word != '')

        # Highest count first.
        dic = table.most_common()

        with open(resultName, 'w') as result:
            result.writelines(
                '<' + word + ':' + str(count) + '>\n' for word, count in dic
            )
        return dic
    except Exception as e:
        # Best-effort contract: report the problem and signal it with None.
        print('Error: %s' % e)
        return None
    finally:
        print('END\n')
''' ------------------------------------------------------- '''
#統計關鍵詞及個數 并計算相似度
def MergeKeys(dic1, dic2):
    """Return the cosine similarity of two keyword-frequency lists.

    The keywords of both inputs are merged into one vocabulary (keys of
    ``dic1`` first, then the unseen keys of ``dic2``), each input is turned
    into a frequency vector over that vocabulary, and the cosine of the
    angle between the two vectors is returned.  The intermediate values
    (shared keys, vocabulary, vectors, dot product, squared norms) are
    echoed to stdout.

    Args:
        dic1: Sequence of ``(keyword, count)`` pairs for the first document.
        dic2: Sequence of ``(keyword, count)`` pairs for the second document.

    Returns:
        The cosine similarity as a float; a larger value means more similar
        documents.  ``0.0`` is returned when either vector has zero norm
        (e.g. an empty input), where the cosine is undefined.
    """
    # Merge the keywords.  ``position`` maps each keyword to the first slot
    # it occupies in ``arrayKey`` so lookups are O(1) instead of list scans.
    arrayKey = []
    position = {}
    for pair in dic1:
        position.setdefault(pair[0], len(arrayKey))
        arrayKey.append(pair[0])
    for pair in dic2:
        key = pair[0]
        if key in position:
            print('has_key %s' % key)
        else:
            position[key] = len(arrayKey)
            arrayKey.append(key)
    print('\n')

    # On Python 2 the repr of a list escapes non-ASCII bytes; undo that so
    # Chinese keywords stay readable.  Python 3 strings have no ``decode``.
    test = str(arrayKey)
    if hasattr(test, 'decode'):
        test = test.decode('string_escape')
    print(test)

    # Term-frequency vectors over the merged vocabulary.
    arrayNum1 = [0] * len(arrayKey)
    arrayNum2 = [0] * len(arrayKey)
    for pair in dic1:
        arrayNum1[position[pair[0]]] = pair[1]
    for pair in dic2:
        arrayNum2[position[pair[0]]] = pair[1]
    print(arrayNum1)
    print(arrayNum2)
    print('%d %d %d' % (len(arrayNum1), len(arrayNum2), len(arrayKey)))

    # Dot product of the two vectors.
    x = sum(a * b for a, b in zip(arrayNum1, arrayNum2))
    print(x)

    # Squared norms of the two vectors.
    sq1 = sum(a * a for a in arrayNum1)
    print(sq1)
    sq2 = sum(b * b for b in arrayNum2)
    print(sq2)

    # A zero-norm vector would make the division below raise
    # ZeroDivisionError; report "no similarity" instead.
    if sq1 == 0 or sq2 == 0:
        return 0.0

    result = float(x) / (math.sqrt(sq1) * math.sqrt(sq2))
    return result
''' -------------------------------------------------------
    基本步驟:
        1.分別統計兩個文檔的關鍵詞
        2.兩篇文章的關鍵詞合并成一個集合,相同的合并,不同的添加
        3.計算每篇文章對于這個集合的詞的詞頻 TF-IDF算法計算權重
        4.生成兩篇文章各自的詞頻向量
        5.計算兩個向量的余弦相似度,值越大表示越相似
    ------------------------------------------------------- '''
#主函數
def main():
    """Compare the keyword profiles of the two crawled documents.

    Counts the keywords of document 1 (Baidu) and document 2 (Hudong) with
    ``CountKey``, then prints the cosine similarity computed by
    ``MergeKeys``.  Nothing is printed for the similarity when either
    keyword count failed.
    """
    # Document 1 (Baidu): keywords and their counts.
    fileName1 = "BaiduSpider.txt"
    resultName1 = "Result_Key_BD.txt"
    dic1 = CountKey(fileName1, resultName1)

    # Document 2 (Hudong): keywords and their counts.
    fileName2 = "HudongSpider\\001.txt"
    resultName2 = "HudongSpider\\Result_Key_001.txt"
    dic2 = CountKey(fileName2, resultName2)

    # CountKey returns None after reporting an error; there is nothing to
    # compare in that case.
    if dic1 is None or dic2 is None:
        return

    # Merge the keywords of both documents and compute their similarity.
    result = MergeKeys(dic1, dic2)
    print(result)
# Run the comparison only when executed as a script, not when imported.
if __name__ == '__main__':
    main()