import dask.bag as db
import ujson as json
import pandas as pd
import numpy as np
import gzip
import re

# Gzipped Amazon "Digital Music" 5-core review dump: one JSON object per line.
# (Hoisted into a constant: the original repeated this literal three times.)
SRC = r'F:/kaggle_dataset/亞馬遜評論/reviews_Digital_Music_5.json.gz'

# Peek at one record through dask to confirm the file parses as JSON lines.
b = db.read_text(SRC, encoding='utf-8').map(json.loads)
b.take(1)

# Count how many review records the dump contains.
print(sum(1 for _ in gzip.open(SRC)))

tempDir = 'F:/kaggle_dataset/亞馬遜評論/制作亞馬遜用戶評論詞云'
# English stop-word list: one word per line, taken from the first CSV column.
stopwords = set(pd.read_csv('C:/Users/zhangshuai_lc/stopwords_en.txt', header=None)[0])
pattern = re.compile(r'\w+')  # tokenizer: maximal runs of word characters

def hashFile():
    """Shard every kept review token into 100 bucket files under ``tempDir``.

    A token is kept when its lowercase form is not a stop word and it is at
    least 2 characters long; it is written lowercased, one per line, to file
    ``hash(word) % 100`` — so every occurrence of a given word lands in the
    same bucket and later per-bucket counting sees each word whole.

    NOTE(review): str hashing is randomized per process (PYTHONHASHSEED),
    so the word→bucket mapping is only stable within a single run.  That is
    sufficient here because all buckets are written in this one pass.
    """
    # Open the 100 bucket files 1.txt .. 100.txt up front.
    temp_path_list = [open(tempDir + '/' + str(i) + '.txt', mode='w')
                      for i in range(1, 101)]
    try:
        with gzip.open(SRC) as src:
            for each in src:
                # json.loads replaces the original eval(): eval on raw file
                # data is unsafe (arbitrary code execution) and slower.
                sentence = json.loads(each)
                for word in pattern.findall(sentence['reviewText']):
                    if word.lower() not in stopwords and len(word) >= 2:
                        word = word.lower()
                        temp_path_list[hash(word) % 100].write(word + '\n')
    finally:
        # Always release the 100 file handles, even if a record fails to parse.
        for f in temp_path_list:
            f.close()

hashFile()
# (Translated from the original Chinese note:) The previous step pulled the
# music reviews out of the raw Amazon data, tokenized them with a regex, and
# hash-partitioned every word into 100 text files, so a given word is always
# assigned to exactly one file.  Here we re-read those bucket files and fold
# each bucket's 1000 most frequent words into `results`.
import os
from collections import Counter

results = Counter()
for dirpath, _dirnames, filenames in os.walk(r'F:/kaggle_dataset/亞馬遜評論/制作亞馬遜用戶評論詞云'):
    for name in filenames:
        with open(os.path.join(dirpath, name)) as bucket:
            # Strip the trailing newline written after every token.
            tokens = [line.strip('\n') for line in bucket]
        # NOTE(review): updating a Counter with a list of (word, count)
        # tuples stores the *tuples themselves* as keys.  That is harmless
        # here only because the hash sharding guarantees each word occurs in
        # a single bucket, so no two stored tuples ever share a word — and
        # the next step deliberately reads the count out of the tuple.
        results.update(Counter(tokens).most_common(1000))
# (Translated from the original Chinese note:) The 1000 most frequent words
# of every bucket file are now stored in `results`; use a heap to pick the
# 100 highest-frequency words overall.  Each key of `results` is a
# (word, count) tuple, so element [1] of a key is the word's frequency.
import heapq

words_fren_list = list(results.keys())
words_fren_list_100 = heapq.nlargest(100, words_fren_list,
                                     key=lambda pair: pair[1])
len(words_fren_list_100)  # notebook-style inspection; no effect when run as a script
|