import pandas as pd
Load Data
df= pd.read_csv('https://raw.githubusercontent.com/Datamanim/pandas/main/AB_NYC_2019.csv')
df.head()
|
id |
name |
host_id |
host_name |
neighbourhood_group |
neighbourhood |
latitude |
longitude |
room_type |
price |
minimum_nights |
number_of_reviews |
last_review |
reviews_per_month |
calculated_host_listings_count |
availability_365 |
0 |
2539 |
Clean & quiet apt home by the park |
2787 |
John |
Brooklyn |
Kensington |
40.64749 |
-73.97237 |
Private room |
149 |
1 |
9 |
2018-10-19 |
0.21 |
6 |
365 |
1 |
2595 |
Skylit Midtown Castle |
2845 |
Jennifer |
Manhattan |
Midtown |
40.75362 |
-73.98377 |
Entire home/apt |
225 |
1 |
45 |
2019-05-21 |
0.38 |
2 |
355 |
2 |
3647 |
THE VILLAGE OF HARLEM....NEW YORK ! |
4632 |
Elisabeth |
Manhattan |
Harlem |
40.80902 |
-73.94190 |
Private room |
150 |
3 |
0 |
NaN |
NaN |
1 |
365 |
3 |
3831 |
Cozy Entire Floor of Brownstone |
4869 |
LisaRoxanne |
Brooklyn |
Clinton Hill |
40.68514 |
-73.95976 |
Entire home/apt |
89 |
1 |
270 |
2019-07-05 |
4.64 |
1 |
194 |
4 |
5022 |
Entire Apt: Spacious Studio/Loft by central park |
7192 |
Laura |
Manhattan |
East Harlem |
40.79851 |
-73.94399 |
Entire home/apt |
80 |
10 |
9 |
2018-11-19 |
0.10 |
1 |
0 |
데이터의 각 host_name의 빈도수를 구하고 host_name으로 정렬하여 상위 5개를 출력하라
df.host_name.value_counts().sort_index().head(5)
'Cil 1
(Ari) HENRY LEE 1
(Email hidden by Airbnb) 6
(Mary) Haiy 1
-TheQueensCornerLot 1
Name: host_name, dtype: int64
데이터의 각 host_name의 빈도수를 구하고 빈도수 기준 내림차순 정렬한 데이터 프레임을 만들어라. 빈도수 컬럼은 counts로 명명하라
df_host = df.groupby('host_name').size().sort_values(ascending=False).to_frame()
df_host.columns = ['counts']
df_host
|
counts |
host_name |
|
Michael |
417 |
David |
403 |
Sonder (NYC) |
327 |
John |
294 |
Alex |
279 |
... |
... |
Jerbean |
1 |
Jerald |
1 |
Jeonghoon |
1 |
Jeny |
1 |
현선 |
1 |
11452 rows × 1 columns
neighbourhood_group의 값에 따른 neighbourhood컬럼 값의 갯수를 구하여라
df.groupby(['neighbourhood_group', 'neighbourhood'], as_index = False).size()
|
neighbourhood_group |
neighbourhood |
size |
0 |
Bronx |
Allerton |
42 |
1 |
Bronx |
Baychester |
7 |
2 |
Bronx |
Belmont |
24 |
3 |
Bronx |
Bronxdale |
19 |
4 |
Bronx |
Castle Hill |
9 |
... |
... |
... |
... |
216 |
Staten Island |
Tottenville |
7 |
217 |
Staten Island |
West Brighton |
18 |
218 |
Staten Island |
Westerleigh |
2 |
219 |
Staten Island |
Willowbrook |
1 |
220 |
Staten Island |
Woodrow |
1 |
221 rows × 3 columns
neighbourhood_group의 값에 따른 neighbourhood컬럼 값 중 neighbourhood_group그룹의 최댓값들을 출력하라
df.groupby(['neighbourhood_group', 'neighbourhood'], as_index = False).size()\
.groupby('neighbourhood_group', as_index=False).max()
|
neighbourhood_group |
neighbourhood |
size |
0 |
Bronx |
Woodlawn |
70 |
1 |
Brooklyn |
Windsor Terrace |
3920 |
2 |
Manhattan |
West Village |
2658 |
3 |
Queens |
Woodside |
900 |
4 |
Staten Island |
Woodrow |
48 |
neighbourhood_group 값에 따른 price값의 평균, 분산, 최대, 최소 값을 구하여라
df[['neighbourhood_group','price']].groupby('neighbourhood_group').agg(['mean','var','max','min'])
|
price |
|
mean |
var |
max |
min |
neighbourhood_group |
|
|
|
|
Bronx |
87.496792 |
11386.885081 |
2500 |
0 |
Brooklyn |
124.383207 |
34921.719135 |
10000 |
0 |
Manhattan |
196.875814 |
84904.159185 |
10000 |
0 |
Queens |
99.517649 |
27923.130227 |
10000 |
10 |
Staten Island |
114.812332 |
77073.088342 |
5000 |
13 |
neighbourhood_group 값에 따른 reviews_per_month 평균, 분산, 최대, 최소 값을 구하여라
df[["neighbourhood_group", "reviews_per_month"]]\
.groupby('neighbourhood_group')\
.agg(['mean', 'std', 'max', 'min'])
|
reviews_per_month |
|
mean |
std |
max |
min |
neighbourhood_group |
|
|
|
|
Bronx |
1.837831 |
1.673284 |
10.34 |
0.02 |
Brooklyn |
1.283212 |
1.516259 |
14.00 |
0.01 |
Manhattan |
1.272131 |
1.628252 |
58.50 |
0.01 |
Queens |
1.941200 |
2.213108 |
20.94 |
0.01 |
Staten Island |
1.872580 |
1.685495 |
10.12 |
0.02 |
neighbourhood 값과 neighbourhood_group 값에 따른 price 의 평균을 구하라
df.groupby(['neighbourhood', 'neighbourhood_group'], as_index = False)['price'].mean()
|
neighbourhood |
neighbourhood_group |
price |
0 |
Allerton |
Bronx |
87.595238 |
1 |
Arden Heights |
Staten Island |
67.250000 |
2 |
Arrochar |
Staten Island |
115.000000 |
3 |
Arverne |
Queens |
171.779221 |
4 |
Astoria |
Queens |
117.187778 |
... |
... |
... |
... |
216 |
Windsor Terrace |
Brooklyn |
138.993631 |
217 |
Woodhaven |
Queens |
67.170455 |
218 |
Woodlawn |
Bronx |
60.090909 |
219 |
Woodrow |
Staten Island |
700.000000 |
220 |
Woodside |
Queens |
85.097872 |
221 rows × 3 columns
neighbourhood 값과 neighbourhood_group 값에 따른 price 의 평균을 계층적 indexing 없이 구하라
df.groupby(['neighbourhood', 'neighbourhood_group'])['price'].mean().unstack()
neighbourhood_group |
Bronx |
Brooklyn |
Manhattan |
Queens |
Staten Island |
neighbourhood |
|
|
|
|
|
Allerton |
87.595238 |
NaN |
NaN |
NaN |
NaN |
Arden Heights |
NaN |
NaN |
NaN |
NaN |
67.25 |
Arrochar |
NaN |
NaN |
NaN |
NaN |
115.00 |
Arverne |
NaN |
NaN |
NaN |
171.779221 |
NaN |
Astoria |
NaN |
NaN |
NaN |
117.187778 |
NaN |
... |
... |
... |
... |
... |
... |
Windsor Terrace |
NaN |
138.993631 |
NaN |
NaN |
NaN |
Woodhaven |
NaN |
NaN |
NaN |
67.170455 |
NaN |
Woodlawn |
60.090909 |
NaN |
NaN |
NaN |
NaN |
Woodrow |
NaN |
NaN |
NaN |
NaN |
700.00 |
Woodside |
NaN |
NaN |
NaN |
85.097872 |
NaN |
221 rows × 5 columns
neighbourhood 값과 neighbourhood_group 값에 따른 price 의 평균을 계층적 indexing 없이 구하고 nan 값은 -999값으로 채워라
df.groupby(['neighbourhood', 'neighbourhood_group'])['price'].mean().unstack().fillna(-999)
neighbourhood_group |
Bronx |
Brooklyn |
Manhattan |
Queens |
Staten Island |
neighbourhood |
|
|
|
|
|
Allerton |
87.595238 |
-999.000000 |
-999.0 |
-999.000000 |
-999.00 |
Arden Heights |
-999.000000 |
-999.000000 |
-999.0 |
-999.000000 |
67.25 |
Arrochar |
-999.000000 |
-999.000000 |
-999.0 |
-999.000000 |
115.00 |
Arverne |
-999.000000 |
-999.000000 |
-999.0 |
171.779221 |
-999.00 |
Astoria |
-999.000000 |
-999.000000 |
-999.0 |
117.187778 |
-999.00 |
... |
... |
... |
... |
... |
... |
Windsor Terrace |
-999.000000 |
138.993631 |
-999.0 |
-999.000000 |
-999.00 |
Woodhaven |
-999.000000 |
-999.000000 |
-999.0 |
67.170455 |
-999.00 |
Woodlawn |
60.090909 |
-999.000000 |
-999.0 |
-999.000000 |
-999.00 |
Woodrow |
-999.000000 |
-999.000000 |
-999.0 |
-999.000000 |
700.00 |
Woodside |
-999.000000 |
-999.000000 |
-999.0 |
85.097872 |
-999.00 |
221 rows × 5 columns
데이터중 neighbourhood_group 값이 Queens값을 가지는 데이터들 중 neighbourhood 그룹별로 price값의 평균, 분산, 최대, 최소값을 구하라
df_queen = df[df.neighbourhood_group == 'Queens']
df_queen.groupby('neighbourhood')['price'].agg(['mean', 'std', 'max', 'min'])
|
mean |
std |
max |
min |
neighbourhood |
|
|
|
|
Arverne |
171.779221 |
193.347902 |
1500 |
35 |
Astoria |
117.187778 |
349.898287 |
10000 |
25 |
Bay Terrace |
142.000000 |
82.561492 |
258 |
32 |
Bayside |
157.948718 |
407.561616 |
2600 |
30 |
Bayswater |
87.470588 |
48.279289 |
230 |
45 |
Belle Harbor |
171.500000 |
90.700449 |
350 |
85 |
Bellerose |
99.357143 |
55.614894 |
240 |
42 |
Breezy Point |
213.333333 |
31.754265 |
250 |
195 |
Briarwood |
105.875000 |
136.026343 |
1000 |
30 |
Cambria Heights |
81.730769 |
54.411438 |
250 |
31 |
College Point |
88.000000 |
80.284217 |
400 |
44 |
Corona |
59.171875 |
49.300682 |
359 |
23 |
Ditmars Steinway |
95.029126 |
64.916198 |
600 |
22 |
Douglaston |
82.750000 |
48.446583 |
178 |
40 |
East Elmhurst |
81.183784 |
66.102101 |
700 |
16 |
Edgemere |
94.727273 |
64.957049 |
200 |
40 |
Elmhurst |
80.459916 |
62.422456 |
443 |
15 |
Far Rockaway |
165.862069 |
229.294527 |
900 |
35 |
Flushing |
93.514085 |
111.432390 |
1500 |
15 |
Forest Hills |
121.625000 |
211.029308 |
2350 |
16 |
Fresh Meadows |
99.500000 |
79.965315 |
375 |
39 |
Glendale |
90.796296 |
56.840850 |
299 |
25 |
Hollis |
88.642857 |
38.509953 |
175 |
50 |
Holliswood |
135.750000 |
71.355331 |
239 |
79 |
Howard Beach |
115.400000 |
65.130153 |
250 |
40 |
Jackson Heights |
80.897849 |
46.373167 |
260 |
23 |
Jamaica |
95.770563 |
89.816742 |
1000 |
10 |
Jamaica Estates |
182.947368 |
170.311634 |
750 |
35 |
Jamaica Hills |
132.125000 |
106.323546 |
325 |
50 |
Kew Gardens |
88.375000 |
51.306008 |
219 |
35 |
Kew Gardens Hills |
112.307692 |
78.934286 |
399 |
40 |
Laurelton |
95.333333 |
65.996435 |
254 |
34 |
Little Neck |
75.200000 |
48.669292 |
149 |
32 |
Long Island City |
127.465549 |
124.617427 |
2000 |
30 |
Maspeth |
83.645455 |
47.155139 |
245 |
11 |
Middle Village |
109.580645 |
59.473677 |
265 |
28 |
Neponsit |
274.666667 |
75.002222 |
350 |
200 |
Ozone Park |
85.274194 |
41.296675 |
200 |
25 |
Queens Village |
83.933333 |
48.851510 |
320 |
25 |
Rego Park |
83.877358 |
60.489700 |
300 |
21 |
Richmond Hill |
87.117021 |
58.941899 |
300 |
28 |
Ridgewood |
77.184397 |
51.276816 |
375 |
20 |
Rockaway Beach |
132.178571 |
79.086169 |
545 |
49 |
Rosedale |
76.694915 |
54.442077 |
350 |
22 |
South Ozone Park |
82.400000 |
67.970506 |
400 |
29 |
Springfield Gardens |
94.235294 |
49.908403 |
300 |
40 |
St. Albans |
100.828947 |
97.416137 |
600 |
25 |
Sunnyside |
84.865014 |
52.227837 |
600 |
12 |
Whitestone |
107.545455 |
116.756468 |
400 |
35 |
Woodhaven |
67.170455 |
40.527335 |
250 |
10 |
Woodside |
85.097872 |
70.724116 |
500 |
28 |
데이터중 neighbourhood_group 값에 따른 room_type 컬럼의 숫자를 구하고 neighbourhood_group 값을 기준으로 각 값의 비율을 구하여라
df_room = df.groupby(['neighbourhood_group', 'room_type'])['room_type'].size().unstack()
df_room.loc[:,:] = df_room.values / df_room.sum(axis = 1).values.reshape(-1,1)
df_room
room_type |
Entire home/apt |
Private room |
Shared room |
neighbourhood_group |
|
|
|
Bronx |
0.347388 |
0.597617 |
0.054995 |
Brooklyn |
0.475478 |
0.503979 |
0.020543 |
Manhattan |
0.609344 |
0.368496 |
0.022160 |
Queens |
0.369926 |
0.595129 |
0.034945 |
Staten Island |
0.471850 |
0.504021 |
0.024129 |