import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import linear_kernel
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import re
import random
import cufflinks
from plotly.offline import iplot
# Put cufflinks into offline mode before any DataFrame/Series .iplot() call is made.
cufflinks.go_offline()
# Load the dataset: a comma-separated text file read with latin-1 encoding.
data = pd.read_csv('./Seattle_Hotels.txt', sep=',', encoding="latin-1")
# Bare expression: displays the table when run in a notebook cell.
data
name | address | desc | |
---|---|---|---|
0 | Hilton Garden Seattle Downtown | 1821 Boren Avenue, Seattle Washington 98101 USA | Located on the southern tip of Lake Union, the... |
1 | Sheraton Grand Seattle | 1400 6th Avenue, Seattle, Washington 98101 USA | Located in the city's vibrant core, the Sherat... |
2 | Crowne Plaza Seattle Downtown | 1113 6th Ave, Seattle, WA 98101 | Located in the heart of downtown Seattle, the ... |
3 | Kimpton Hotel Monaco Seattle | 1101 4th Ave, Seattle, WA98101 | What?s near our hotel downtown Seattle locatio... |
4 | The Westin Seattle | 1900 5th Avenue, Seattle, Washington 98101 USA | Situated amid incredible shopping and iconic a... |
... | ... | ... | ... |
147 | The Halcyon Suite Du Jour | 1125 9th Ave W, Seattle, WA 98119 | Located in Queen Anne district, The Halcyon Su... |
148 | Vermont Inn | 2721 4th Ave, Seattle, WA 98121 | Just a block from the world famous Space Needl... |
149 | Stay Alfred on Wall Street | 2515 4th Ave, Seattle, WA 98121 | Stay Alfred on Wall Street resides in the hear... |
150 | Pike's Place Lux Suites by Barsala | 2nd Ave and Stewart St, Seattle, WA 98101 | The perfect marriage of heightened convenience... |
151 | citizenM Seattle South Lake Union hotel | 201 Westlake Ave N, Seattle, WA 98109 | Yes, it's true. Every room at citizenM is the ... |
152 rows × 3 columns
# (rows, columns) of the loaded table.
data.shape
(152, 3)
# Show the full text of one description (row label 100) from the 'desc' column.
data['desc'][100]
'On a budget in Seattle or looking for something different? The historic charm and "home away from home" atmosphere of The Baroness will be sure to make you feel like one of the family. Conveniently located on First Hill, we are proud to be part of the Virginia Mason Hospital campus and only minutes from Harborview Medical Center and Swedish Hospital. The Baroness Hotel is a great option for short or long term medical, patient or family stays. Whether you are visiting the area\'s world-class medical facilities or on a budget vacation, our goal is to ensure a wonderful stay. Guest Amenities: Complimentary Internet access, Two twin, one or two queen studios with mini fridge and microwave, Two twin or one queen suites with full kitchens, Laundry facilities available, Flat screen cable television with HBO, Complimentary local calls, Ice and vending machines located in the lobby, Coffee maker and hairdryers in all guestrooms, Room service available seven days a week from the Rhododendron Cafe, Limited wheelchair accessibility, Guest library and business center, Printing & fax services available, 100% non-smoking and pet free, Rooms are not air conditioned - fans are available, Self-parking available at Virginia Mason hospital for a fee.'
看一下酒店的主要描述信息
# Learn the vocabulary from every description, then encode each description
# as a row of per-token counts (sparse matrix).
descriptions = data['desc']
vec = CountVectorizer()
vec.fit(descriptions)
bag_of_words = vec.transform(descriptions)
# Dense view of the count matrix, for inspection only.
bag_of_words.toarray()
array([[0, 1, 0, ..., 0, 0, 0],
[0, 0, 0, ..., 0, 0, 0],
[0, 0, 0, ..., 0, 0, 0],
...,
[0, 0, 0, ..., 0, 0, 0],
[0, 0, 0, ..., 0, 0, 0],
[0, 0, 0, ..., 1, 0, 0]], dtype=int64)
# (number of documents, vocabulary size) of the count matrix.
bag_of_words.shape
(152, 3200)
# Collapse the document axis: one total count per vocabulary column.
sum_words = bag_of_words.sum(axis=0)
sum_words
matrix([[ 1, 11, 11, ..., 2, 6, 2]], dtype=int64)
def get_top_n_words(corpus, n=None, stop_words=None):
    """Return the most frequent words in ``corpus`` with their total counts.

    Args:
        corpus: Iterable of text documents (for example ``data['desc']``).
        n: Number of ``(word, count)`` pairs to return. ``None`` returns
            the whole vocabulary.
        stop_words: Passed straight through to
            ``CountVectorizer(stop_words=...)``. The default ``None`` keeps
            every token, which is what the function did before this
            parameter existed.

    Returns:
        A list of ``(word, count)`` tuples sorted by count in descending
        order, truncated to the first ``n`` entries. Counts are plain ints.
    """
    vec = CountVectorizer(stop_words=stop_words).fit(corpus)
    bag_of_words = vec.transform(corpus)
    # Flatten the per-column totals to 1-D so indexing works whether
    # ``.sum(axis=0)`` hands back a 2-D ``np.matrix`` or a plain ndarray.
    totals = np.asarray(bag_of_words.sum(axis=0)).ravel()
    # ``vocabulary_`` maps each word to its column index in the count matrix.
    word_freqs = [(word, int(totals[idx])) for word, idx in vec.vocabulary_.items()]
    word_freqs.sort(key=lambda pair: pair[1], reverse=True)
    return word_freqs[:n]
# The 20 most frequent words across all hotel descriptions.
common_words = get_top_n_words(data['desc'], n=20)
common_words
[('the', 1258),
('and', 1062),
('of', 536),
('seattle', 533),
('to', 471),
('in', 449),
('our', 359),
('you', 304),
('hotel', 295),
('with', 280),
('is', 271),
('at', 231),
('from', 224),
('for', 216),
('your', 186),
('or', 161),
('center', 151),
('are', 136),
('downtown', 133),
('on', 129)]
# Tabulate the (word, count) pairs; the word column is named 'desc'.
df1 = pd.DataFrame(data=common_words, columns=['desc', 'counts'])
df1.head()
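desc | counts | |
---|---|---|
0 | seattle | 533 |
1 | hotel | 295 |
2 | center | 151 |
3 | downtown | 133 |
4 | free | 123 |
(Note: this table does not match the `common_words` list printed above, whose first five entries are the, and, of, seattle, to. It appears to come from a later run in which stop words were removed.)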
# Horizontal bar chart of the total count per word, sorted ascending by count.
(
    df1.groupby('desc')
    .sum()['counts']
    .sort_values()
    .iplot(
        kind='barh',
        yTitle='counts',
        linecolor='black',
        title='top 20 before remove stopwords',
    )
)
标签:...,酒店,相似,python,sum,words,import,Seattle,desc
From: https://www.cnblogs.com/guodong1789/p/18064537