简介
今天在 Kaggle 上看到了一个比较有意思的 Dataset,收录了 Pokemon 中 721 只神奇宝贝的基本数据(CSV 共 800 行),包括 id (#)、名字 (Name)、类别 (Type 1)、二级分类 (Type 2),以及基本属性:血量 (HP)、攻击力 (Attack)、防御力 (Defense)、特攻 (Sp. Atk)、特防 (Sp. Def)、速度 (Speed)。
现参考 kaggle 上的一些文章,做 pokemon 类别对其基本属性影响的分析探究。
Pokemon 基本数据概要
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Load the Pokemon stats table from the hosted CSV and preview it.
data_df = pd.read_csv("http://airing.ursb.me/data/Pokemon.csv")
data_df.head()

# The last two columns are not needed for this analysis, so drop them.
# `columns=` is used instead of a positional axis argument, which newer
# pandas releases no longer accept.
data_df = data_df.drop(columns=['Generation', 'Legendary'])
data_df.describe()

# First look at the relationship between HP and Attack.
sns.jointplot(x="HP", y="Attack", data=data_df)
plt.show()

# Distribution of the Total stat over the whole table.
sns.boxplot(y="Total", data=data_df)
plt.show()

# The id ('#') and Total columns carry no information about the individual
# stats, so drop them before comparing the stats side by side.
data_df_2 = data_df.drop(columns=['#', 'Total'])
sns.boxplot(data=data_df_2)
plt.show()
# Names of the integer-valued stat columns of data_df_2.
# The dtype mask is built from data_df_2 itself so that the mask and the
# Series being indexed carry the same labels. No leading entry is sliced off:
# '#' and 'Total' were already removed from data_df_2, so slicing would drop
# a real stat column instead of an id column.
var_int = data_df_2.dtypes[data_df_2.dtypes == 'int64'].index
var_int
# Expected output (assumes HP is the first stat column in the CSV - confirm):
# Index(['HP', 'Attack', 'Defense', 'Sp. Atk', 'Sp. Def', 'Speed'], dtype='object')
l_int = len(var_int)

# One 50-bin histogram per stat, placed on a 3x3 grid of subplots.
fig = plt.figure(figsize=(13, 8))
for i, val in enumerate(var_int):
    fig.add_subplot(3, 3, i + 1)
    plt.hist(data_df_2[val], bins=50)
    plt.title(val)
plt.show()
# Pairwise correlation between the stats.
# data_df_2 still contains the text columns Name / Type 1 / Type 2, and
# corr() on non-numeric data raises in recent pandas versions, so restrict the
# frame to its numeric columns first.
data_df_2.select_dtypes(include='number').corr()
探索 Pokemon 类别对其属性的影响
# Tally the primary types: list the distinct values, then count the rows
# that belong to each one.
type1 = pd.unique(data_df['Type 1'])
print(type1)
data_type1 = data_df.groupby('Type 1')['#'].count()
# Largest groups first (result is only displayed, not stored).
data_type1.sort_values(ascending=False)
['Grass' 'Fire' 'Water' 'Bug' 'Normal' 'Poison' 'Electric' 'Ground' 'Fairy'
'Fighting' 'Psychic' 'Rock' 'Ghost' 'Ice' 'Dragon' 'Dark' 'Steel' 'Flying']
Type 1
Water 112
Normal 98
Grass 70
Bug 69
Psychic 57
Fire 52
Rock 44
Electric 44
Ground 32
Dragon 32
Ghost 32
Dark 31
Poison 28
Steel 27
Fighting 27
Ice 24
Fairy 17
Flying 4
Name: #, dtype: int64
# Pie chart of the share of each primary type. The eight named types get a
# wedge of their own; every remaining type is folded into "Other".
labels = ['Water', 'Normal', 'Grass', 'Bug', 'Psychic', 'Fire', 'Electric', 'Rock', 'Other']

# Wedge sizes are derived from the data rather than typed in by hand. The
# previous literal for "Other" (175) did not match the table: the per-type
# counts sum to 800 and the eight named types to 546, leaving 254.
type_counts = data_df.groupby('Type 1')['#'].count()
sizes = [int(type_counts[name]) for name in labels[:-1]]
sizes.append(int(type_counts.sum()) - sum(sizes))

colors = ['yellowgreen', 'gold', 'lightskyblue', 'lightcoral', 'yellow', 'lightgreen', 'silver', 'white', 'pink']
# Pull the "Other" wedge slightly out of the pie.
explode = (0, 0, 0, 0, 0, 0, 0, 0, 0.1)
plt.pie(sizes, explode=explode, labels=labels, colors=colors,
        autopct='%1.1f%%', shadow=True, startangle=90)
# Equal aspect ratio so the pie is drawn as a circle.
plt.axis('equal')
plt.title("不同类型神奇宝贝的百分比")
plt.show()
# Box plot of the Total stat for each primary type.
# Each type name is replaced by an integer code: its position in this list.
type_names = [
    'Grass', 'Fire', 'Water', 'Bug', 'Normal',
    'Poison', 'Electric', 'Ground', 'Fairy', 'Fighting',
    'Psychic', 'Rock', 'Ghost', 'Ice', 'Dragon',
    'Dark', 'Steel', 'Flying',
]
type_to_int_dict = {name: code for code, name in enumerate(type_names)}
data_df['Int_Type1'] = data_df['Type 1'].map(type_to_int_dict).astype(int)

sns.set(style="ticks")
fig, ax = plt.subplots(figsize=(8, 6))
sns.boxplot(x="Int_Type1", y="Total", data=data_df, palette="PRGn", ax=ax)
sns.despine(offset=10, trim=True)
plt.show()
# Author's observation from this plot: the Dragon type sits well above the
# other types in Total.
# Reshape to long form: Name / Type 1 / Type 2 stay as identifier columns,
# the remaining columns are stacked with their name in "Stat" and their
# number in "value".
data_type1 = pd.melt(data_df_2, id_vars=["Name", "Type 1", "Type 2"], var_name="Stat")
data_type1.head()

# Swarm plot of every stat value, coloured by primary type.
plt.figure(figsize=(12, 10))
plt.ylim(0, 275)
# `dodge=True` separates the hue levels along the categorical axis; it is the
# current name of the keyword that older seaborn releases called `split`.
sns.swarmplot(x="Stat", y="value", data=data_type1, hue="Type 1", dodge=True, size=7)
# Place the legend outside the axes, anchored at the upper-right corner.
plt.legend(bbox_to_anchor=(1, 1), loc=2, borderaxespad=0.)
plt.show()
# Box plots of each stat grouped by primary type, one panel per stat.
# The number of rows follows the number of stats in var_int instead of a
# fixed 6, so no empty panel is left when var_int holds fewer entries.
n_rows = max(len(var_int), 1)
fig = plt.figure(figsize=(13, 4 * n_rows))
for i, col in enumerate(var_int):
    ax1 = fig.add_subplot(n_rows, 1, i + 1)
    sns.boxplot(x=data_df['Type 1'], y=data_df_2[col], ax=ax1)
plt.show()
# Author's observations from these plots: Dragon types have the highest
# Attack, Steel types the strongest Defense, and Flying types the highest Speed.
# A box plot marks where the quantiles lie, whereas a violin plot shows the
# estimated density at every position, so the same per-type comparison is
# repeated here with violin plots.


def _plot_stat_violin(stat, title):
    """Draw a violin plot of one stat, with one violin per primary type.

    Args:
        stat: Name of the stat column in ``data_df`` (for example ``'HP'``).
        title: Title shown above the axes.

    Returns:
        The pivoted table that was plotted: indexed by ``Name`` with one
        column per ``Type 1`` value, holding ``stat``.
    """
    table = data_df[['Name', 'Type 1', stat]].pivot_table(
        values=stat, index=['Name'], columns=['Type 1'])
    f, ax = plt.subplots(figsize=(18, 6))
    sns.violinplot(data=table, palette="Set3", bw=.2, cut=1, linewidth=1)
    # Every stat is drawn on the same fixed y-range; values above 200 are
    # cut off by this limit.
    ax.set(ylim=(0, 200))
    ax.set_title(title)
    sns.despine(left=True, bottom=True)
    plt.show()
    return table


# Each call returns the pivoted table, so it can still be inspected
# afterwards (e.g. hp_data.head()).
# Distribution of HP among all types of Pokemon.
hp_data = _plot_stat_violin('HP', "不同类型神奇宝贝的 HP")
# Distribution of Attack among all types of Pokemon.
attack_data = _plot_stat_violin('Attack', "不同类型神奇宝贝的 攻击力")
# Distribution of Defense among all types of Pokemon.
defense_data = _plot_stat_violin('Defense', "不同类型神奇宝贝的 防御力")
# Distribution of Sp. Atk among all types of Pokemon.
sp_attack_data = _plot_stat_violin('Sp. Atk', "不同类型神奇宝贝的 特殊攻击力")
# Distribution of Sp. Def among all types of Pokemon.
sp_defense_data = _plot_stat_violin('Sp. Def', "不同类型神奇宝贝的 特殊防御力")
# Distribution of Speed among all types of Pokemon.
speed_data = _plot_stat_violin('Speed', "不同类型神奇宝贝的 速度")