Skip to content

Data cleaning

Change data type

from char to int
df['grade'] = df['grade'].astype(int)
df = df.astype({
    'A': 'int64',
    'B': 'float64',
    'C': 'datetime64[ns]'
})

Drop duplicate rows

df.drop_duplicates(subset=['A','B','C'], inplace=True, keep='first')

Drop missing data

df=df.dropna(
    axis=0,   # (1)
    how='any' # (2)
    subset=['A','B'],
    inplace=True # (3)
    )
  1. axis{0 or ‘index’, 1 or ‘columns’}, default 0
  2. how{‘any’, ‘all’}, default ‘any’
    • ‘any’ : If any NA values are present, drop that row or column.
    • ‘all’ : If all values are NA, drop that row or column.
  3. Whether to modify the DataFrame rather than creating a new one.

Fill missing data

df[col]=df[col].fillna(value)

Arrange columns

df.sort_values(
    by=['a', 'b'],
    ascending=[True, False],
    inplace=True,
    na_position='last' # (1)
    )
  1. {‘first’, ‘last’}, default ‘last’ Puts NaNs at the beginning if first; last puts NaNs at the end.

Rename columns

df.rename(
    columns={'old_name1':'new_name1', 'old_name2':'new_name2'},
    inplace=True
    )

Modify columns

compute rank
df['rank'] = df['score'].rank(
    method='dense', # (1)
    ascending=False,
    na_option='keep' # (2)
    ).astype(int)
  1. {‘average’, ‘min’, ‘max’, ‘first’, ‘dense’}, default ‘average’ How to rank the group of records that have the same value (i.e. ties):
    • average: average rank of the group
    • min: lowest rank in the group
    • max: highest rank in the group
    • first: ranks assigned in order they appear in the array
    • dense: like ‘min’, but rank always increases by 1 between - groups (assign different rank for ties).
  2. {‘keep’, ‘top’, ‘bottom’}, default ‘keep’
    • keep: assign NaN rank to NaN values
    • top: assign lowest rank to NaN values
    • bottom: assign highest rank to NaN values
compute rank
df.assign(
    rank=lambda x: x["date"].rank(method="first")
    )
create new column based on condition
df['level'] = np.where((df['value1'] >= 4) & (df['value2'] >= 10), 'High', 'Low')
create new column based on more than one conditions
# Method 1
df['cat'] = pd.cut(df['val'], [-1,2,5,10], labels=['low', 'medium', 'high'])

# Method 2
conditions = [
    df['income'] < 20000,
    df['income'].between(20000, 50000, inclusive='both'),
    df['income'] > 50000
]
categories = ["Low Salary", "Average Salary", "High Salary"]
df['category'] = np.select(conditions, categories, default="Unknown")

Add new columns

employees['bonus'] = employees['salary']*2
df['bonus'] = df.apply(
    lambda row: row['salary'] if row['employee_id'] % 2 == 1 and not row['employee_name'].startswith('M')
        else 0,
    axis=1
    )
calculate date difference between two date columns
df.assign(
  new_col=lambda x: (
    x.date1 - x.date2
  ).dt.days
)
calculate the # of days delay between each confirmed cases
df.assign(
  day_lag=lambda x: (
    x.groupby(["cola", "colb"], as_index=False)[
      "date"
    ]
    .diff() # date difference
    .shift(-1) # move ahead
    .dt.days # convert into integer
  )
)
assign value based on condition
df.assign(
  value=lambda x: np.where(x.ID == 12345, "10", "20")
)
dynamically set the column name using variable values
df.assign(**{column:lambda x: (x[column].str.upper().str.strip())})

Reset index

df['id']=df.reset_index(drop=True).index+1