Data cleaning

Change data type¶

from char to int

df['grade'] = df['grade'].astype(int)

df = df.astype({
    'A': 'int64',
    'B': 'float64',
    'C': 'datetime64[ns]'
})

Drop duplicate rows¶

df.drop_duplicates(subset=['A','B','C'], inplace=True, keep='first')

Drop missing data¶

df=df.dropna(
    axis=0,   # (1)
    how='any' # (2)
    subset=['A','B'],
    inplace=True # (3)
    )

axis{0 or ‘index’, 1 or ‘columns’}, default 0
how{‘any’, ‘all’}, default ‘any’
- ‘any’ : If any NA values are present, drop that row or column.
- ‘all’ : If all values are NA, drop that row or column.
Whether to modify the DataFrame rather than creating a new one.

Fill missing data¶

df[col]=df[col].fillna(value)

Arrange columns¶

df.sort_values(
    by=['a', 'b'],
    ascending=[True, False],
    inplace=True,
    na_position='last' # (1)
    )

{‘first’, ‘last’}, default ‘last’ Puts NaNs at the beginning if first; last puts NaNs at the end.

Rename columns¶

df.rename(
    columns={'old_name1':'new_name1', 'old_name2':'new_name2'},
    inplace=True
    )

Modify columns¶

compute rank

df['rank'] = df['score'].rank(
    method='dense', # (1)
    ascending=False,
    na_option='keep' # (2)
    ).astype(int)

{‘average’, ‘min’, ‘max’, ‘first’, ‘dense’}, default ‘average’ How to rank the group of records that have the same value (i.e. ties):
- average: average rank of the group
- min: lowest rank in the group
- max: highest rank in the group
- first: ranks assigned in order they appear in the array
- dense: like ‘min’, but rank always increases by 1 between - groups (assign different rank for ties).
{‘keep’, ‘top’, ‘bottom’}, default ‘keep’
- keep: assign NaN rank to NaN values
- top: assign lowest rank to NaN values
- bottom: assign highest rank to NaN values

compute rank

df.assign(
    rank=lambda x: x["date"].rank(method="first")
    )

create new column based on condition

df['level'] = np.where((df['value1'] >= 4) & (df['value2'] >= 10), 'High', 'Low')

create new column based on more than one conditions

# Method 1
df['cat'] = pd.cut(df['val'], [-1,2,5,10], labels=['low', 'medium', 'high'])

# Method 2
conditions = [
    df['income'] < 20000,
    df['income'].between(20000, 50000, inclusive='both'),
    df['income'] > 50000
]
categories = ["Low Salary", "Average Salary", "High Salary"]
df['category'] = np.select(conditions, categories, default="Unknown")

Add new columns¶

employees['bonus'] = employees['salary']*2

df['bonus'] = df.apply(
    lambda row: row['salary'] if row['employee_id'] % 2 == 1 and not row['employee_name'].startswith('M')
        else 0,
    axis=1
    )

calculate date difference between two date columns

df.assign(
  new_col=lambda x: (
    x.date1 - x.date2
  ).dt.days
)

calculate the # of days delay between each confirmed cases

df.assign(
  day_lag=lambda x: (
    x.groupby(["cola", "colb"], as_index=False)[
      "date"
    ]
    .diff() # date difference
    .shift(-1) # move ahead
    .dt.days # convert into integer
  )
)

assign value based on condition

df.assign(
  value=lambda x: np.where(x.ID == 12345, "10", "20")
)

dynamically set the column name using variable values

df.assign(**{column:lambda x: (x[column].str.upper().str.strip())})

Reset index¶

df['id']=df.reset_index(drop=True).index+1