pandas_notes.txt

display options 
https://pandas.pydata.org/pandas-docs/version/0.23/generated/pandas.set_option.html

# do not truncate if df is smaller than 1000 rows
pd.set_option('display.max_rows', 1000)

df[df.ColumnA == "whatever"]
- returns rows where ColumnA is whatever

df.loc["index", "columna"]
- returns the value matching the given index and given columna

df.describe()
- returns basic stats by column

***************
LOC - filtering rows and columns by label name
- inclusive for both sides

df.loc[0,:]
- returns first row, all columns (series)

df.loc[0:2, :]
- returns first three rows, all columns (df)

df.loc[:, "columna"]
- returns all rows for columna (series)

df.loc[:, ["columna", "columnb"]]
- returns all rows, column a and b (df)

# a label slice goes directly inside .loc[...]; wrapping it in a list is a syntax error
df.loc[:, "columna":"columnz"]
- returns all rows, columns from a to z (df)

df.loc[df.columna == "whatever"]
- returns all rows, where columna is whatever

df.loc[df.columna == "whatever"].columnb
- returns columnb only where columna is whatever (= chained indexing)

df.loc[df.columna == "whatever", "columnb"]
- returns columnb only where columna is whatever (better solution than the one above)

use several where:
df.loc[(df["B"] > 50) & (df["C"] == 900), "A"]

show rows containing substring
df.loc[df.columna.str.contains("mystring")]

select except
df[df["col"].str.contains('this|that')==False]

show duplicate rows – will only show the second duplicate
df["is_duplicate"] = df.duplicated()
df.loc[df.is_duplicate == True]
del df["is_duplicate"]

***************
ILOC - filtering rows and columns by integer position
- exclusive for second value

df.iloc[:, [0,3]]
- returns all rows, and the 1st and 4th columns (positions 0 and 3)

df.iloc[:, 0:4]
- returns all rows, 1st to 4th column (positions 0-3; position 4 is excluded)

***************
IX - filtering rows and columns mixing labels and integer positions
it's better not to use this: .ix was deprecated in pandas 0.20 and removed in pandas 1.0 - use .loc / .iloc instead.

df.ix["index label", 0]
- returns first column value for given row

df.ix[1,"columna"]

df.index.name = None

merged = pd.merge(df_new, df_n, left_on='subject_id', right_on='subject_id', how = 'left')

new_header = housing_2000.iloc[0] #grab the first row for the header
housing_2000 = housing_2000[1:] #take the data less the header row

*********
length of df
len(df.index)

list of column names: list(my_dataframe.columns.values)

merge(left, right, how='inner', on=None, left_on=user, right_on=Id, left_index=False, right_index=False, sort=True, suffixes=('_x', '_y'), copy=True, indicator=False)

pd.merge(df1, df2, how='inner',left_on='Id', right_on='user', left_index=True, right_index=False, sort=False, copy=False)

pd.merge(user, ert, left_on='user_id', right_on = 'user', how='outer')

df['x'].str.lower()

df = pd.DataFrame(['a|b', 'c|d'])

s = df[0].apply(lambda x: x.split('|'))

df['left'] = s.apply(lambda x: x[0])

df['right'] = s.apply(lambda x: x[1])

df.drop(df[df.ColumnA == "string"].index)

delete rows with missing values in given column(s):
data.dropna(subset = ['columnA'])
df.dropna(subset = ['column1_name', 'column2_name', 'column3_name'])


groupby several values:
df = df.groupby(['pidx','pidy']).agg({'flag':'first', 'count':'sum'}).reset_index()

normalized:
data["final_norm"] = (data["final"] - data["final"].min()) / (data["final"].max()-data["final"].min())

def rescale(input_array):
    """Min-max scale ``input_array``: its minimum maps to 0 and its maximum to 1."""
    lowest, highest = numpy.min(input_array), numpy.max(input_array)
    span = highest - lowest
    return (input_array - lowest) / span

*********
replace and convert numbers
# format of original object: 1.200
def to_numeric(column):
    """Convert the string column ``df[column]`` to integers, in place.

    Values are expected to look like ``"1.200"`` (dot used as thousands
    separator). Every ``-`` character is replaced by ``0`` first, then the
    dots are stripped and the result is cast to int.

    Operates on the module-level ``df``.

    Args:
        column: name of the column to convert.
    """
    # assuming that 0 are ok for the NaNs
    df[column] = df[column].apply(lambda x: x.replace('-', '0'))
    # regex=False: as a regex, '.' matches every character, which would blank
    # the whole value and make astype(int) fail.
    df[column] = df[column].str.replace('.', '', regex=False).astype(int)

for column in object_to_numeric_column_list:
    to_numeric(column)
*********
apply a function on two columns
def myfunction(x, y):
    """Return 1 when ``x`` is "whatever" and ``y`` is "another value", otherwise 0."""
    both_match = (x == "whatever") & (y == "another value")
    return 1 if both_match else 0

df["new_col"] = df.apply(lambda x: myfunction(x.col1, x.col2), axis=1)

*********
def df_minimal_cleaning(df, prefix=None):
    """Normalise the column names of ``df`` in place and return ``df``.

    Every character that is not an ASCII letter becomes ``_``, names are
    lower-cased, and runs of underscores collapse into a single ``_``.

    Args:
        df: DataFrame whose columns are renamed.
        prefix: optional string; when given, every column is renamed to
            ``<prefix>_<column>``. When omitted, no prefix is added.

    Returns:
        The same DataFrame object, with the renamed columns.
    """
    # regex=True must be explicit: since pandas 2.0 str.replace treats the
    # pattern as a literal string by default.
    df.columns = (
        df.columns.str.replace('[^a-zA-Z]', '_', regex=True)
        .str.lower()
        .str.replace('_+', '_', regex=True)
    )
    if prefix is not None:
        df.columns = [prefix + "_" + str(col) for col in df.columns]
    for col in df.columns:
        if "whatever" in col:
            pass  # placeholder: per-column handling goes here
    return df

***********

convert string to date
datecolumn_list = ["mydate1", "mydate2"]

def convert_sf_date(datecolumn):
    """Parse the text column ``df[datecolumn]`` as ``%d/%m/%Y`` dates, in place.

    Values are cast to str and stripped of surrounding whitespace; cells that
    are exactly ``-`` are turned into NaN before parsing.
    Works on the module-level ``df``.
    """
    as_text = df[datecolumn].astype(str)
    df[datecolumn] = as_text
    trimmed = df[datecolumn].apply(lambda cell: cell.strip())
    df[datecolumn] = trimmed.replace('-', np.nan)
    parsed = pd.to_datetime(df[datecolumn], format='%d/%m/%Y')
    df[datecolumn] = parsed
    
for datecolumn in datecolumn_list:
    convert_sf_date(datecolumn)
    
*********

looking for partial string:
df[df['A'].str.contains("hello")]

ignore NaNs:
df.loc[df.a.str.contains("foo", na=False)]

case insensitive:
gtin[gtin['BRAND_NAME'].str.contains("brand_name_x", case = False)]

list of columns
list(my_dataframe.columns.values)

replace stuff in columns
myfile.columns = myfile.columns.str.replace(' ','_').str.lower()

replace all stuff in columns
# regex=True is required: pandas >= 2.0 treats the pattern literally by default
df.columns.str.replace('[^a-zA-Z]', '_', regex=True).str.lower().str.replace('_+', '_', regex=True)


do something in columns in a massive scale:
list_of_dfs = [df1, df2, df3]
colstostrings_list = ["col1", "col2", "col3"]

for df in list_of_dfs:
    for col in colstostrings_list:
        df[col] = df[col].apply(str)


remove whitespaces from header
shorter.rename(columns=lambda x: x.replace(" ", "_"), inplace=True)

add prefix to columns
myfile = myfile.add_prefix("prefix_")

count values of column
data['columnA'].value_counts()

remove last characters in a column
df['colname'] = df['colname'].map(lambda x: str(x)[:-4])

with index sorted:
data['columnA'].value_counts().sort_index()

convert str to date:
df['col'] = pd.to_datetime(df['col'])

convert float to str
df["id"] = df["id"].apply(str)

to_datetime(shorter['Day'])

groupby sum:
data.groupby(by=['account_ID'])['purchases'].sum()

move columns
df = df[['a', 'y', 'b', 'x']]

count distinct
len(data["columnA"].unique())

show rows with Nans
df[df.isnull().any(axis=1)]

show rows with NaNs in a given column
data.loc[pd.isnull(data.columnA)]

show rows with NOT NaNs in a given column
data.loc[pd.notnull(data.columnA)]

replace nans with sg:
df.column1 = df.column1.fillna('some_string')

rename columns
df.rename(columns={'oldName1': 'newName1', 'oldName2': 'newName2'}, inplace=True)

change order of columns
data2 = data2[['Profile Id', 'Profile Name', 'eventCategory', 'eventAction', 'eventLabel', 'date', 'deviceCategory', 'channelGrouping', 'UniqueEvents','BRAND_NAME', 'SUB_SECTOR_NAME']]

group data by date (date is in datetime format)
df.resample('M').mean()

delete rows where the column equals a string
df.drop(df[df.ColumnA == "string"].index, inplace = True)

see all columns, width of df
pd.set_option('display.max_columns', 500)

fill column with nan
df["D"] = np.nan
df["C"] = ""

grand total
df[["Jan","Feb","Mar","total"]].sum()

***
functions in pandas with apply
def my_function(df):
    """Map one row to a new value; meant for ``df.apply(my_function, axis=1)``.

    With ``axis=1`` the ``df`` argument is a single row, not the whole
    DataFrame. The returned strings are placeholders to be replaced.

    Returns:
        The placeholder value for the matching branch, or None when
        ``whatever_column`` is neither "x" nor "y".
    """
    if df["whatever_column"] == "x":
        return "some value"
    elif df["whatever_column"] == "y":
        return "another some value"
    else:
        # no match: the new column gets None for this row
        return None
df["my_new_column"] = df.apply(my_function, axis=1)
***

*****************
matrix = pd.read_csv("transposed_matrix.csv", sep = ",", index_col = 0)

matrix.columns
matrix.index.values
# print every cell together with its row and column label
for row in matrix.index.values:
    for column in matrix.columns:
        print(row, column, matrix.loc[row, column])
*******************
Excel -like features
http://pbpython.com/excel-pandas-comp.html

convert int to datetime
df['DateTime'] = df['date'].apply(lambda x: pd.to_datetime(str(x), format='%Y%m%d'))

groupby month
df1 = df.set_index(["date"]) # date in datetime
g = df1.groupby(pd.Grouper(freq="M"))
g.size()

OR 
per = df.date.dt.to_period("M")
h = df.groupby(per)
h.size()

# pd.TimeGrouper was removed from pandas; pd.Grouper replaces it
# (df needs a DatetimeIndex, or pass key="date" to group on a column)
df.groupby(pd.Grouper(freq='M'))

groupby several columns
group_data = df.groupby(['Alphabet','Words'])['COUNTER'].sum()

count distinct
df.groupby(['col1','col2'])['col3'].nunique().reset_index()

trim
df[0] = df[0].str.strip()

replace name 
nka['szekhely'].replace({'Hódmezovásárhely': 'Hódmezővásárhely'}, inplace=True, regex=True)

remove, replace value in column
df['name'] = df['Label'].map(lambda x: x.replace('mystring', ''))

replace value based on condition
df.loc[(df.column_where_condition_is == 'valuex'),'column_to_change']='new_value'

working around unicode issues:
df.replace({u'Akármi': u'Dr. Akármi'}, inplace=True, regex=True)

print full value_counts:
x = nka_hatarontuli["szekhely_strip"].value_counts()
def print_full(x):
    """Print ``x`` (e.g. a Series or DataFrame) without row truncation.

    ``display.max_rows`` is raised to ``len(x)`` only while printing.
    ``option_context`` then restores whatever value the caller had set
    (even if printing raises) instead of resetting to the pandas default.

    Args:
        x: object to print; must support ``len()``.
    """
    with pd.option_context('display.max_rows', len(x)):
        print(x)
print_full(x)

dropna here a string is whatever:
df.dropna(subset = ['column_name'])

delete column
del df[colname]

scientific notation format
pd.set_option('display.float_format', lambda x: '%.3f' % x)

df.userid = df.userid.map(lambda x: '{:.0f}'.format(x))

sort by two columns not ignoring NaNs
# DataFrame.sort was removed from pandas; sort_values is the replacement
df.sort_values(['A', 'B'], ascending=[True, False], na_position='first')

concat dataframes
concat = pd.concat([df1,df2,df3])

new column with fix value
df['Name'] = 'abc'

if function
data['negative'] =  data['sume'].map(lambda x: 'negative' if x < 0 else 'positive')

replace value based on other condition:
# single .loc assignment instead of chained indexing, which may write to a copy
data.loc[data["name"].str.contains("whatever string", na=False, case=False), "company"] = "whatever other string"

merge strings with Nan
df['ColA+ColB'] = df['ColA'].fillna('') + df['ColB'].fillna('')

replace values
df.columnA.replace(['value1', 'value2'], ['new_value1', 'new_value2'], inplace=True)

multi indexing and sorting
#data2.groupby(by=["link_domain"])["link_domain", "engagement_fb"].agg(['mean', 'count']).reset_index().sort_values([("engagement_fb", "mean")], ascending = False).head(20)

give column names
data = pd.read_csv("path/to/file.txt", sep='\t', header=None)
data.columns = ["Sequence", "Start", "End", "Coverage"]

list_of_what = df["what"].tolist()

check if list is in df:
df[df['A'].isin([3, 6])]


list not in df:
df[~df.column.isin(mylist)]

# label each row by where `a` falls relative to year_avg +/- yerr1
# (DataFrame.set_value was removed from pandas; .at is the replacement)
for index, row in df.iterrows():
    lower = row["year_avg"] - row["yerr1"]
    upper = row["year_avg"] + row["yerr1"]
    if a < lower:
        df.at[index, 'position'] = "below"
    elif a > upper:
        df.at[index, 'position'] = "above"
    else:
        df.at[index, 'position'] = "around"

drop duplicates by column
cpv.drop_duplicates(subset='wins_id', keep='last')

reorder columns
frame = frame[['column I want first', 'column I want second'...etc.]]

count co-occurrences
df.groupby(["Group", "Size"]).size()

one liner for tableau format:
pd.melt(df, id_vars=['A'], value_vars=['B', 'C'])

remove new lines
with open("my_file.csv", "rb") as csvfile:
    filtered = (line.replace('\n', ' ') for line in csvfile)
    spamreader = csv.reader(filtered, delimiter=';')
    
    
apply function
def my_function(df):
    """Template for a row-wise function used with ``DataFrame.apply(..., axis=1)``.

    ``whatever``, ``x`` and ``y`` are placeholders to be replaced with real
    conditions and return values.
    """
    if whatever:
        return x
    elif whatever:
        return y
    else:
        return x

mydf['new_column'] = mydf.apply(my_function, axis=1)


remove non-char characters from header
# regex=True is required: pandas >= 2.0 treats the pattern literally by default
df.columns = df.columns.str.replace('[^a-zA-Z0-9]', '_', regex=True).str.lower().str.replace('_+', '_', regex=True)

replace a part of a string in column
df['new_column'] = df['column_name'].map(lambda x: x.replace('_unwanted_string_', ''))

unpivot
unpivoted = pd.melt(df, id_vars = ["city", "country"], \
        value_vars = ["persons", "companies"], value_name = "numeric_value", var_name = "type")

split a column into two
df[["col_new_1", "col_new_2"]]= df["col_to_split"].str.split("delimiter", expand = True)

convert currency string to float
# regex=False: as a regex, '$' is the end-of-string anchor, so the dollar sign itself would never be removed
df["numeric_value"] = df["numeric_value_as_string"].str.replace('$', '', regex=False).str.replace(',', '', regex=False).astype(float) # ,. to be replaced depending how it looks like

************
MySQL
from pandas import DataFrame
from sqlalchemy.engine import create_engine

# dialect+driver://username:password@host:port/database
engine = create_engine("mysql+pymysql://USERNAME:PASSWORD@HOST/DATABASE?charset=utf8mb4")

connection = engine.connect()
results = connection.execute("SELECT * FROM table WHERE LOWER(name) NOT LIKE '%%whatnot%%';")
df = DataFrame(results.fetchall())
df.columns = results.keys()
connection.close()
engine.dispose()
results.close()
df.head()

************


calculate time difference between rows
df['delta'] = (df['tvalue']-df['tvalue'].shift()).fillna(0)

calculate time difference between rows by groups
df_sorted = df.sort_values(["session_id", "date2"])
df_sorted['delta'] = df_sorted.groupby('session_id')['date2'].diff()


*************************
plotting
plot
data.groupby(['year']).agg({'id':'count', 'final':'sum'}).plot(kind = 'bar')

plot average line
plt.axhline(stuff, color = "red")

import matplotlib.pyplot as plt
from matplotlib.pyplot import *
import matplotlib.dates as mdates
fig, ax = subplots()

data.groupby([pd.Grouper(key='date', freq='D'), 'airline_sentiment']).size().unstack().plot(figsize=(15,7), \
  color = ['#FF860C', 'grey', '#0080FF'], linewidth = 2, ax = ax)
ax.xaxis.set_major_locator(mdates.DayLocator(interval = 3))
ax.set_ylabel("number of tweets")
ax.spines['top'].set_visible(False) # remove frame
ax.legend(loc='upper left') # legend in upper left corner
plt.xticks(fontsize = 14, rotation=45) # rotate labels and fontsize

colors
https://matplotlib.org/examples/color/named_colors.html

seaborn colormaps
https://gallantlab.github.io/colormaps.html

plots next to each other
fig, axs = plt.subplots(1,2, figsize=(15, 5))
1: ax = search_merged.plot(x="day", y="zero_results_rate", kind="line", color = "black", ax=axs[0])
2: ax = search_merged.plot(x="day", y="zero_results_rate", kind="line", color = "black", ax=axs[1])

rotate labels for both plots
for ax in fig.axes:
    # make this axes current so plt.xticks targets it; pyplot is imported as plt,
    # the bare name `matplotlib` is not bound by `import matplotlib.pyplot as plt`
    plt.sca(ax)
    plt.xticks(rotation=90)

remove y ticks
ax.set_yticklabels([]) # remove y values
ax.tick_params(bottom=False, left=False) # remove ticks (booleans: the string "off" is truthy)


df1 = df.iloc[::-1] # revers df to have it in alphabetical order
my_colors = ["darkorange", "dodgerblue"]  # version 6
ax = df1.plot(kind='barh',stacked=True, figsize=(10, 7), color=my_colors)
ax.set_ylabel('') # remove y axis title
ax.spines['top'].set_visible(False) # remove frame # remove box
ax.spines['bottom'].set_visible(False) # remove frame
ax.spines['left'].set_visible(False) # remove frame
ax.spines['right'].set_visible(False) # remove frame 
ax.axvline(45, color='black', label = "health quangos average") # add lines average
ax.axvline(50.1, color='grey', linestyle = "--", label = "national average")
ax.legend(loc='center',  bbox_to_anchor=(1.1, 0.5)) # legend in upper left corner;
#ax.figure.savefig('../exports/health/institute_gender_v6_with_national_average_for_text.png', bbox_inches='tight', pad_inches=0.1,
#        frameon=None, dpi=400)
;

add labels to bar charts
for rect in ax.patches:
    height = rect.get_height()
    ax.text(rect.get_x() + rect.get_width()/2., height, '%d' % int(height), ha='center', va='bottom')

labels for horizontal bar charts (barh)
for i in ax.patches:
    # get_width pulls left or right; get_y pushes up or down
    ax.text(i.get_width()+.5, i.get_y(), int(i.get_width()), ha='center')

	original line: ax.text(i.get_width()+.3, i.get_y()+.38, \
            str(round((i.get_width()/total)*100, 2))+'%', fontsize=15, color='dimgrey')
            
treemap
import squarify
import matplotlib.pyplot as plt 
squarify.plot(sizes=df['count'], label=df['group'], alpha=.8 )
plt.axis('off')
plt.show()


hovertemplate
fig = go.Figure()
fig.add_trace(go.Choropleth(locations = df['country'],
                                    z = df["migrants_in_es_by_1000000"],
                                    locationmode = 'country names',
                                    colorscale = colorscale_log_es,
                                    marker_opacity=1, marker_line_width=0.1,
                                    text = df["country"],
                                    hovertemplate = "Out of one million citizens of %{text}, there are <br>" +
                                    "%{z:.} immigrants registered in Spain.<br>" +
                                    "<extra></extra>",
                                    hoverlabel = dict(font = dict(size = 18), bgcolor = "hex_here"),
                                    showscale = False, visible = True))

OR custom text
hoverinfo = "text",
		text = [str(country) + ", " + str(number) for (country, number) in df[['country', 'number']].values],

OR in older Plotly:
text = ["blabla <br>" + str(text) + " is " + str(an_integer) + "<br>blabla <br>" + str(another_integer) + "." for (text, an_integer)  in df[['text', 'an_integer']].values],
*************************
convert "2019-12" YYYY-WW to days, ie. get first and last day of week


def get_days(x):
    """Return the first and last day of a ``"YYYY-WW"`` week as date strings.

    Week 1 is taken to be the Monday-to-Sunday week that contains 1 January
    (so it may start in the previous year); this is not always the ISO-8601
    week number.

    Args:
        x: year-week string such as ``"2019-26"``.

    Returns:
        ``(monday, sunday)`` tuple of ``YYYY-MM-DD`` strings.
    """
    from datetime import date, timedelta

    # parse the argument (previously a hard-coded "2019-26" was parsed instead)
    parts = x.split("-")
    year = int(parts[0])
    week = int(parts[1])
    jan_first = date(year, 1, 1)
    # step back to the Monday of the week containing 1 January
    week_one_monday = jan_first - timedelta(days=jan_first.weekday())
    first_day = week_one_monday + timedelta(days=(week - 1) * 7)
    last_day = first_day + timedelta(days=6)
    monday = first_day.strftime("%Y-%m-%d")
    sunday = last_day.strftime("%Y-%m-%d")
    return monday, sunday
df["monday"], df["sunday"] = zip(*df["year_week"].map(get_days))

*************************
Salesforce data download

import os
import time
timestr = time.strftime("%Y%m%d")

# set in the environment
os.environ["password"] = "whatever"
os.environ["client_id"] = "whatever"
os.environ["client_secret"] = "whatever"
os.environ["username"] = "whatever"


# download data from Salesforce Rest API
def salesforce_to_json():
    """Download a Salesforce report through the REST API and dump it to JSON.

    Credentials (``client_id``, ``client_secret``, ``username``, ``password``)
    are read from environment variables. The report JSON is written to
    ``/tmp/report_<timestr>.json``, where ``timestr`` is the module-level
    date stamp.

    Raises:
        requests.HTTPError: if the token request or the report request
            returns an error status.
    """
    import json

    params_sbox = {"grant_type": "password",
    "client_id" : os.environ.get("client_id"),
    "client_secret": os.environ.get("client_secret"), # Consumer Secret
    "username": os.environ.get("username"), # The email to login
    "password": os.environ.get("password") # Concat password and security token
    }

    headers = {
            'content-type': 'application/x-www-form-urlencoded'
            }

    # Send the credentials form-encoded in the request body (data=), matching
    # the declared content type, instead of in the URL query string where
    # they would end up in access logs.
    r = requests.post("MYSALESFORCEDOMAIN/services/oauth2/token",
        data=params_sbox, headers=headers)
    r.raise_for_status()  # fail loudly instead of continuing with a missing token
    token_payload = r.json()  # parse the response once
    access_token = token_payload.get("access_token")
    session_id = token_payload.get("id")

    cookie = {"sid": session_id}
    header = {"Authorization": "Bearer %s" % access_token}
    report_url = "MYSALESFORCEDOMAIN/services/data/v44.0/analytics/reports/MYREPORTID"

    report = requests.get(report_url, headers=header, cookies=cookie)
    report.raise_for_status()

    data = report.json()

    with open('/tmp/report_' + str(timestr) + '.json', 'w') as outfile:
        json.dump(data, outfile)
        
salesforce_to_json()

*************************


     - One of the following named colorscales:
            ['aggrnyl', 'agsunset', 'algae', 'amp', 'armyrose', 'balance',
             'blackbody', 'bluered', 'blues', 'blugrn', 'bluyl', 'brbg',
             'brwnyl', 'bugn', 'bupu', 'burg', 'burgyl', 'cividis', 'curl',
             'darkmint', 'deep', 'delta', 'dense', 'earth', 'edge', 'electric',
             'emrld', 'fall', 'geyser', 'gnbu', 'gray', 'greens', 'greys',
             'haline', 'hot', 'hsv', 'ice', 'icefire', 'inferno', 'jet',
             'magenta', 'magma', 'matter', 'mint', 'mrybm', 'mygbm', 'oranges',
             'orrd', 'oryel', 'peach', 'phase', 'picnic', 'pinkyl', 'piyg',
             'plasma', 'plotly3', 'portland', 'prgn', 'pubu', 'pubugn', 'puor',
             'purd', 'purp', 'purples', 'purpor', 'rainbow', 'rdbu', 'rdgy',
             'rdpu', 'rdylbu', 'rdylgn', 'redor', 'reds', 'solar', 'spectral',
             'speed', 'sunset', 'sunsetdark', 'teal', 'tealgrn', 'tealrose',
             'tempo', 'temps', 'thermal', 'tropic', 'turbid', 'twilight',
             'viridis', 'ylgn', 'ylgnbu', 'ylorbr', 'ylorrd']

*************************
to create dummy variables
to_dummify = pd.get_dummies(dff["to_be_dummy_col"])
# concat onto dff itself, the frame the dummies were built from
dff = pd.concat([dff, to_dummify], axis=1)
del dff["to_be_dummy_col"]