eduros93
eduros93
Reputation Top 5%
eduros93
4 Snippets  (182nd place)
Published
1 Channel
Created
4 Channels
Following
Mar 11, 2019
Last Visit
Mar 1, 2019
Registered
104 points  (263rd place)
Reputation
Junior Code Generator
Junior Commenter
Serious Commenter
Junior Autobiographer
Serious Autobiographer

Recent Snippets See all snippets by eduros93

public by eduros93 created Mar 1, 2019  99  3  2  -1

4D+ Data Visualization

plt, pd, sns, faceting, parallel coordinates
# Parallel coordinates plot
# Every data point is drawn as a line that shows its value on each dimension
#   as it goes from left to right. You can see relations between dimensions,
#   particularly relations with the "class_column", which is the hue variable
# Image in comments
from pandas.plotting import parallel_coordinates

parallel_coordinates(
    df,
    class_column='targetCol',
    color=('#FFE888', '#FF9999'),
)
                      
###                      

# 4D view: a grid of 2D scatterplots faceted along two more dimensions
#   ("colFacet" selects the grid column, "rowFacet" selects the grid row)
# Image in comments
g = sns.FacetGrid(data, col="colFacet", row="rowFacet").map(
    plt.scatter, "xCol", "yCol", edgecolor="w"
)

###

# Line plot
# Computes the mean of the y variable for each of the x values
#   You can encode a third dim by using a line for each value of the dim
#   You add a 4th (and 5th if you want) with facets
# Image in comments
# NOTE: the facet size argument is `height`; the older `size` spelling was
#   deprecated when seaborn renamed it
grid = sns.FacetGrid(df, row='facetRowVar', height=2.2, aspect=1.6)
grid.map(sns.pointplot, 'xVar', 'yVar', 'lineVar', palette='deep')
grid.add_legend()
												
;

public by eduros93 created Mar 1, 2019  53  1  1  0

3D Data visualization

plt, pd, sns, faceting, boxplot, pairplot, scatter, scatterplot, density plot
# Pairplot (image in comments)
#   Uses `height` (formerly `size`) for the facet size and `fill` (formerly
#   `shade`) for the filled KDE on the diagonal; the old spellings are
#   deprecated in seaborn
g = sns.pairplot(train[selectedCols], hue='target', palette='seismic',
                 height=1.2, diag_kind='kde', diag_kws=dict(fill=True),
                 plot_kws=dict(s=10))
# Drop the x tick labels on every panel
g.set(xticklabels=[])

###

# Boxplot: one group per value of the 2nd dim on the x axis, split again by
#   the 3rd dim through hue (image in comments)
sns.boxplot(
    data=df,
    x="facetCol",
    y="boxplotCol",
    hue="hueCol",
)

###

# The 1st dim picks the facet; the 2nd and 3rd dims form the scatter plot
#   Hue carries statistical info about the y axis variable
#   The facet variable is discretized so the number of facets is finite
#   The y axis variable is discretized so its quantile bins show as colors
#   Image in comments
for raw_col, binned_col in (('facetVar', 'discreteFacetVar'),
                            ('yaxisVar', 'discreteYaxisVar')):
    df[binned_col] = pd.qcut(df[raw_col],
                             q=quantile_list, labels=quantile_labels)

g = sns.FacetGrid(df, col="discreteFacetVar", hue='discreteYaxisVar')
g.map(plt.scatter, "xaxisVar", "yaxisVar", alpha=.7)

###

# Like scatterplot but showing density. Scatter plot 2.0
# 3rd dim is hue, we made a manual hue by plotting twice
#   each time with a different color
# Image in comments
# NOTE: the two variables are passed as x=/y= keywords because current
#   seaborn no longer accepts them positionally. `fill=True` replaces the
#   deprecated `shade=True`, and `thresh=0.05` (seaborn's default) leaves the
#   lowest density level unfilled, as `shade_lowest=False` did.
plot1 = sns.kdeplot(x=df1['col1'], y=df1['col2'],
                    cmap="YlOrBr", fill=True, thresh=0.05)
plot2 = sns.kdeplot(x=df2['col1'], y=df2['col2'],
                    cmap="Reds", fill=True, thresh=0.05)
                  
                  
                  
                  
          												
;

public by eduros93 created Mar 1, 2019  46  0  2  0

2D Data Visualization

correlation, heatmap, plt, sns, pd, eda
# 1:
# Correlation matrix (Pearson correlation)
def plotCorr(df):
    """Draw an annotated heatmap of the Pearson correlation matrix of *df*.

    Every column of *df* is cast to float before the correlation is
    computed, so all columns must be convertible to float. The figure is
    shown immediately with ``plt.show()``; nothing is returned.
    """
    plt.figure(figsize=(14, 12))
    plt.title('Pearson Correlation of Features', y=1.05, size=15)
    pearson = df.astype(float).corr()
    sns.heatmap(
        pearson,
        linewidths=0.1,
        vmax=1.0,
        square=True,
        linecolor='white',
        annot=True,
    )
    plt.show()
  
###   

# 2:
# Focused correlation matrix
#   Keep only the columns whose correlation with the target col is stronger
#   than 0.4 in absolute value, then plot just those columns
corr = df.corr()
# Absolute value expresses "above 0.4 or below -0.4" as a single boolean
#   mask, instead of adding two boolean Series to emulate a logical OR
mask = corr["targetCol"].abs() > 0.4
selectedCols = corr.loc[mask].index.values
plotCorr(df[selectedCols])

###				

# 3:
# Joint plot (image in comments)
#   `height` is the current name of the figure-size argument; the older
#   `size` spelling is deprecated in seaborn
sns.jointplot(x='col1', y='col2', data=df,
              kind='reg', space=0, height=5, ratio=4)
               
###

# Grouped bar plot
# Discrete histogram of the x column, with bars grouped by the hue column
sns.countplot(
    data=df,
    x="histCol",
    hue="groupCol",
)

###

# Faceted boxplot
# Draws one boxplot of "y" for every distinct value of "x"
sns.boxplot(
    data=df,
    x="discreteCol",
    y="continuousCol",
)
						
###

# Histogram with 2nd dim as hue (image in comments)
#   FacetGrid encodes the hue: the histogram function is mapped once per
#   value of 'hueCol'.
#   histplot is used because distplot is deprecated in seaborn; with
#   kde=False both draw plain count histograms.
g = sns.FacetGrid(wines, hue='hueCol')
g.map(sns.histplot, 'histCol', kde=False, bins=15)
;

public by eduros93 created Mar 1, 2019  28  0  1  0

Feature engineering with Pandas

preprocessing, eda, pd
# Length of str
train['Name_length'] = train['Name'].apply(len)

# create binary variable: 1 when a cabin value is present, 0 when it is missing
#   The original flagged values whose type was exactly `float`, presumably to
#   catch NaN; testing for missing values directly does not depend on how the
#   missing marker happens to be stored
train['Has_Cabin'] = train['Cabin'].notna().astype(int)

# create binary var using filters
#   Start with 0 everywhere, then set 1 on the rows matching the filter.
#   Both statements sit at the same indentation level; the second line was
#   previously indented, which is an IndentationError.
dataset['IsAlone'] = 0
dataset.loc[dataset['FamilySize'] == 1, 'IsAlone'] = 1

# impute missing values with a statistic (here the median of train's Fare)
dataset['Fare'] = dataset['Fare'].fillna(value=train['Fare'].median())

# bin a variable into 4 quantile-based buckets
train['CategoricalFare'] = pd.qcut(train['Fare'], q=4)

# bin a variable into 5 equal-range intervals
train['CategoricalAge'] = pd.cut(train['Age'], bins=5)

# write a function that receives a value from a series and outputs the value
# of a new feature, then apply it element-wise
#   (get_title is not defined in this snippet; it must be provided elsewhere)
#   The statement starts at column 0; it previously had a stray leading
#   space, which is an IndentationError.
dataset['Title'] = dataset['Name'].apply(get_title)

# normalise a problematic value
dataset['Title'] = dataset['Title'].replace(to_replace='Ms', value='Miss')

# recode a whole column through a mapping
dataset['Sex'] = (
    dataset['Sex']
    .map({'female': 0, 'male': 1})
    .astype(int)
)

# overwrite every row of a filtered selection with one value
dataset.loc[
    (dataset['Fare'] <= 14.454) & (dataset['Fare'] > 7.91),
    'Fare',
] = 1

# replace a whole list of values in a column at once
dataset['Title'] = dataset['Title'].replace(to_replace=weirdTitlesList,
                                            value='Rare')

# fill nan's with a specific guess for each subgroup
#   (i, j and myGuess come from surrounding code not shown here)
dataset.loc[
    dataset['Age'].isna() & (dataset['Sex'] == i) & (dataset['Pclass'] == j),
    'Age',
] = myGuess

;