import numpy as np
import pandas as pd
import random
# Seed the stdlib RNG so the random draws below are reproducible.
random.seed(42)

# One categorical column: the five letters repeated 100 times (500 rows).
data = {
    "column1": ["A", "B", "C", "D", "E"] * 100,
}
df = pd.DataFrame(data)
If you need to fill a column with a random value, here’s how you do it.
There are 100 of each value in column1.
Generating Random Values in a Column
Level 1: List
Let’s start with this simple scenario. You have one column with a value, and you want to fill another column with a random value from a list.
"random"] = df["column1"].apply(
df[lambda x: random.choice(["Apple", "Banana", "Carrot", "Dog", "Elephant"])
)
"random"].value_counts() df[
random
Apple 107
Banana 106
Elephant 101
Carrot 98
Dog 88
Name: count, dtype: int64
Level 2: Dictionary
Now let’s say you want the two columns to speak to each other. You want to fill column2 with a random value based on the value in column1.
# Candidate values for each column1 key.
df_dict = {
    "A": ["Apple", "Ant", "Aardvark"],
    "B": ["Banana", "Bee", "Bear"],
    "C": ["Carrot", "Cat", "Cheetah"],
    "D": ["Dog", "Dolphin", "Duck"],
    "E": ["Elephant", "Eagle", "Eel"],
}
# Look up the row's column1 value and pick uniformly from its candidate list.
df["random_dict"] = df["column1"].apply(lambda x: random.choice(df_dict[x]))

# Count the drawn values within each column1 group.
df.groupby("column1")["random_dict"].value_counts()
column1 random_dict
A Aardvark 38
Ant 35
Apple 27
B Bee 35
Bear 34
Banana 31
C Cheetah 37
Carrot 33
Cat 30
D Dolphin 43
Dog 32
Duck 25
E Elephant 36
Eagle 32
Eel 32
Name: count, dtype: int64
Level 3: Weights
Now let’s say you want to fill column2 with a random value, but you want to control the ratio of the random values.
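The weights are relative, so random.choices does not strictly require them to add up to 1, but keeping the total at 1 lets you read each weight as a probability.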
"random_weights"] = df["column1"].apply(
df[lambda x: random.choices(
"Apple", "Banana", "Carrot", "Dog", "Elephant"],
[=[0.4, 0.3, 0.1, 0.1, 0.1],
weights0]
)[
)
"random_weights"].value_counts() df[
random_weights
Apple 203
Banana 156
Elephant 53
Carrot 47
Dog 41
Name: count, dtype: int64
Level 4: Custom Weights
But what if you want to control the ratio of the random values on a per-condition basis?
# Per-key candidate values with a matching list of weights.
df_dict = {
    "A": {"values": ["Apple", "Ant", "Aardvark"], "weights": [0.7, 0.2, 0.1]},
    "B": {"values": ["Banana", "Bee", "Bear"], "weights": [0.6, 0.3, 0.1]},
    "C": {"values": ["Carrot", "Cat", "Cheetah"], "weights": [0.5, 0.3, 0.2]},
    "D": {"values": ["Dog", "Dolphin", "Duck"], "weights": [0.6, 0.2, 0.2]},
    "E": {"values": ["Elephant", "Eagle", "Eel"], "weights": [0.8, 0.1, 0.1]},
}
# Weighted draw from the entry matching the row's column1 value; [0] unwraps
# the one-element list returned by random.choices.
df["random_custom_weights"] = df["column1"].apply(
    lambda x: random.choices(df_dict[x]["values"], weights=df_dict[x]["weights"])[0]
)
# Count the drawn values within each column1 group.
df.groupby("column1")["random_custom_weights"].value_counts()
column1 random_custom_weights
A Apple 71
Ant 17
Aardvark 12
B Banana 66
Bee 20
Bear 14
C Carrot 51
Cat 25
Cheetah 24
D Dog 63
Dolphin 25
Duck 12
E Elephant 81
Eel 11
Eagle 8
Name: count, dtype: int64
Reverse Engineering Random
But what if you want to reverse engineer the weights? Maybe you need to add more of a certain value to balance out the distribution, or add more lines to a database that match the distribution of the original data.
Reverse Engineering Weights
Let’s reverse engineer the weights for the random_weights column.
# Frequency table as a DataFrame with "random_weights" and "count" columns.
new_weights = df["random_weights"].value_counts().reset_index()
# Each value's share of all rows, rounded to one decimal place.
new_weights["weight"] = (new_weights["count"] / new_weights["count"].sum()).round(1)

# Parallel lists in the same row order, ready to pass to random.choices.
values = new_weights["random_weights"].tolist()
weights = new_weights["weight"].tolist()

print(values)
print(weights)
['Apple', 'Banana', 'Elephant', 'Carrot', 'Dog']
[0.4, 0.3, 0.1, 0.1, 0.1]
Reverse Engineering Custom Weights
Let’s reverse engineer the weights for the random_custom_weights column.
# Per-group frequency table: one row per (column1, random_custom_weights)
# pair with its "count".
new_weights = (
    df.groupby("column1")["random_custom_weights"].value_counts()
    .reset_index()
)
# Share of each value within its own column1 group, rounded to one decimal.
# The helper "sum" column holds the group total and is dropped afterwards.
new_weights["sum"] = new_weights.groupby("column1")["count"].transform("sum")
new_weights["weight"] = (new_weights["count"] / new_weights["sum"]).round(1)
new_weights.drop(columns=["sum"], inplace=True)

# Map each column1 key to a list of {"random_custom_weights", "weight"} records.
new_dict = (
    new_weights.groupby("column1")[["random_custom_weights", "weight"]]
    .apply(lambda x: x.to_dict(orient="records"))
    .to_dict()
)

# Reshape the records into the {"values": [...], "weights": [...]} layout
# that the random.choices lookups in this article expect.
new_df_dict = {}
for key, value in new_dict.items():
    values = [item["random_custom_weights"] for item in value]
    weights = [item["weight"] for item in value]
    new_df_dict[key] = {"values": values, "weights": weights}

new_df_dict
{'A': {'values': ['Apple', 'Ant', 'Aardvark'], 'weights': [0.7, 0.2, 0.1]},
'B': {'values': ['Banana', 'Bee', 'Bear'], 'weights': [0.7, 0.2, 0.1]},
'C': {'values': ['Carrot', 'Cat', 'Cheetah'], 'weights': [0.5, 0.2, 0.2]},
'D': {'values': ['Dog', 'Dolphin', 'Duck'], 'weights': [0.6, 0.2, 0.1]},
'E': {'values': ['Elephant', 'Eel', 'Eagle'], 'weights': [0.8, 0.1, 0.1]}}
Using this new dictionary, we can generate results that match the original distribution.
"random_reversed_weights"] = df["column1"].apply(
df[lambda x: random.choices(
"values"], weights=new_df_dict[x]["weights"]
new_df_dict[x][0]
)[
)
"column1")["random_reversed_weights"].value_counts() df.groupby(
column1 random_reversed_weights
A Apple 67
Ant 25
Aardvark 8
B Banana 68
Bee 18
Bear 14
C Carrot 64
Cat 22
Cheetah 14
D Dog 62
Dolphin 21
Duck 17
E Elephant 79
Eagle 11
Eel 10
Name: count, dtype: int64
Conclusion
Randomness is a powerful tool in Python, pandas, and data science.