From aa377aca70fc89f11bf8c12932ac865dd01f83c7 Mon Sep 17 00:00:00 2001 From: webcoderz <19884161+webcoderz@users.noreply.github.com> Date: Tue, 26 Mar 2024 12:52:27 -0400 Subject: [PATCH 1/9] had to edit the gitignore to get the generator and domains.txt to commit --- .gitignore | 22 +- demos/data/scripts/generator/PersonGen.py | 355 ++++++++++++++++++++++ demos/data/scripts/generator/domains.txt | 98 ++++++ 3 files changed, 474 insertions(+), 1 deletion(-) create mode 100644 demos/data/scripts/generator/PersonGen.py create mode 100644 demos/data/scripts/generator/domains.txt diff --git a/.gitignore b/.gitignore index f8a1ee9544..7fe7d02743 100644 --- a/.gitignore +++ b/.gitignore @@ -25,7 +25,27 @@ lib64/ parts/ sdist/ var/ -data/ +#data/ +data/benchmarking/ +data/img/ +data/appearances.txt +data/characters.txt +data/comics.txt +data/facebook_combined.txt +data/honeypot.csv +data/lesmiserables.csv +data/samplegraph.json +data/transactions.csv +data/twitterDemo.csv +data/demos_by_use_case/ +data/demos_databases_apis +data/gfql/ +data/more_examples/ +data/talks/ +data/for_analysis.ipynb +data/for_developers.ipynb +data/upload_csv_miniapp.ipynb + *.egg-info/ .installed.cfg *.egg diff --git a/demos/data/scripts/generator/PersonGen.py b/demos/data/scripts/generator/PersonGen.py new file mode 100644 index 0000000000..0a676827a2 --- /dev/null +++ b/demos/data/scripts/generator/PersonGen.py @@ -0,0 +1,355 @@ +import pandas as pd +import random +from faker import Faker +from random_address import real_random_address +from phone_gen import PhoneNumber +from datetime import timedelta +from datetime import datetime +import numpy as np +from itertools import count +from names_dataset import NameDataset + + +class PersonGenerator: + + def __init__( + self, + seed: int = 0, + country: str = 'US', + people_amt: int = 100, + affiliations: list = ['Gang Alpha', 'Cartel Beta', 'Gang Gamma', 'Cartel Delta'], + crimes: list = [ + "Armed Robbery", + "Burglary", + "Drug Trafficking", + 
"Vandalism", + "Assault", + "Money Laundering", + "Fraud", + "Homicide", + ] + ): + + self.seed = seed + self.country = country + self.people_num = people_amt + Faker.seed(self.seed) + np.random.seed(self.seed) + self.fake = Faker() + self.random = random.Random(self.seed) + self.phone = PhoneNumber(self.country) + self.names = NameDataset() + self.first_names = self.names.get_top_names(n=self.people_num, country_alpha2=self.country)[self.country] + self.last_names = self.names.get_top_names(n=self.people_num, country_alpha2=self.country, use_first_names=False)[self.country] + self.address = real_random_address.RandomAddress() + self.domains = pd.read_csv("domains.txt", header=None)[0].to_list() + self.affiliations = affiliations + self.crimes = crimes + + def generate_people( + self, + num_records: int = 100, + min_age: int = 15, + max_age: int = 85 + ) -> pd.DataFrame: + + records = [] + for _ in range(num_records): + gender = ["M", "F"] + sex = self.random.choice(gender) + record = { + "first_name": self.random.choice(self.first_names[sex]), + "last_name": self.random.choice(self.last_names), + "phone_number": self.phone.get_number(full=False), + "sex": sex, + "DOB": self.fake.date_of_birth( + minimum_age=min_age, + maximum_age=max_age + ), + } + record["email_address"] = record["first_name"] + record["last_name"] + str(self.random.randint(0, 999)) + self.random.choice(self.domains) + records.append(record) + + df = pd.DataFrame(records) + return df + + def generate_addresses( + self, + num_records: int = 100, + start_date: str = "-30y", + end_date: str = "today" + ) -> pd.DataFrame: + + records = [] + for _ in range(num_records): + address = self.address() + + record = { + "address1": address.get('address1', ''), + "address2": address.get('address2', ''), + "city": address.get("city", "Unknown City"), + "date": self.fake.date_between(start_date=start_date, end_date=end_date), + "state": address.get("state", "Unknown State"), + "zip": address.get("postalCode", 
"Unknown PostalCode"), + "lat": address.get("coordinates", {}).get("lat", 0.0), + "lon": address.get("coordinates", {}).get("lng", 0.0) + } + records.append(record) + df = pd.DataFrame(records) + return df + + def generate_call_logs( + self, + people_df: pd.DataFrame, + num_logs: int = 500, + start_date: str = '-1y' + ) -> pd.DataFrame: + + call_logs = [] + phone_numbers = people_df['phone_number'].tolist() + + for _ in range(num_logs): + caller, callee = self.random.sample(phone_numbers, 2) # Ensure caller and callee are different + call_date = self.fake.date_time_between(start_date=start_date) + call_time = call_date + timedelta(hours=self.random.randint(0, 23), minutes=self.random.randint(0, 59), seconds=self.random.randint(0, 59)) + duration = self.random.randint(1, 3600) # Call duration in seconds, from 1 sec to 1 hour + + call_logs.append({ + "caller": caller, + "callee": callee, + "call_date": call_date.strftime('%Y-%m-%d'), + "call_time": call_time.strftime('%H:%M:%S'), + "duration_sec": duration + }) + + return pd.DataFrame(call_logs) + + def generate_non_affiliated_call_logs( + self, + people_df: pd.DataFrame, + call_logs_df: pd.DataFrame, + num_calls: int = 500, + start_date: str = '-1y' + ) -> pd.DataFrame: + """ + Generate call logs for non-affiliated individuals, simulating everyday calls. + + :param people_df: DataFrame of people with affiliations. + :param call_logs_df: DataFrame of call logs to append to. + :param num_calls: Number of calls to generate among non-affiliated individuals. + :return: Updated DataFrame with non-affiliated call logs. 
+ """ + # Filter for non-affiliated individuals + non_affiliated_people = people_df[people_df['affiliation'] == 'None'] + + # Generate call logs + for _ in range(num_calls): + if len(non_affiliated_people) > 1: + caller, callee = non_affiliated_people.sample(n=2, replace=False)['phone_number'].values + self.add_call_log(call_logs_df, caller, callee, start_date) + + return call_logs_df + + def generate_affiliated_call_logs( + self, + people_df: pd.DataFrame, + call_logs_df: pd.DataFrame, + num_affiliated_calls: int = 100, + leader_call_percentage: float = 0.05, + start_date: str = '-1y' + ) -> pd.DataFrame: + """ + Generate call logs with a focus on gang affiliations, including both intra-gang and inter-gang communications. + + :param people_df: DataFrame of people with affiliations. + :param call_logs_df: Existing DataFrame of call logs to append to. + :param num_affiliated_calls: Number of additional affiliated calls to generate. + :param leader_call_percentage: Percentage of calls that should be between gang leaders (inter-gang calls). + :return: Updated DataFrame with affiliated call logs. 
+ """ + affiliated_people = people_df[people_df['affiliation'] != 'None'] + affiliated_groups = affiliated_people['affiliation'].unique() + + leader_calls = int(num_affiliated_calls * leader_call_percentage) + gang_calls = num_affiliated_calls - leader_calls + + # Generate intra-gang calls + for _ in range(gang_calls): + gang = self.random.choice(affiliated_groups) + gang_members = affiliated_people[affiliated_people['affiliation'] == gang] + + if len(gang_members) > 1: + caller, callee = gang_members.sample(n=2, replace=False)['phone_number'].values + self.add_call_log(call_logs_df, caller, callee, start_date) + + # Generate inter-gang calls (leader calls) + for _ in range(leader_calls): + gangs = self.random.sample(list(affiliated_groups), 2) + for gang in gangs: + gang_leader = affiliated_people[affiliated_people['affiliation'] == gang].sample(n=1)['phone_number'].values[0] + if gang == gangs[0]: + caller = gang_leader + else: + callee = gang_leader + self.add_call_log(call_logs_df, caller, callee, start_date) + + return call_logs_df + + def add_call_log( + self, + call_logs_df: pd.DataFrame, + caller: str, + callee: str, + start_date: str + ) -> pd.DataFrame: + """ + Helper function to add a call log entry to the DataFrame. 
+ """ + call_date = self.fake.date_time_between(start_date=start_date) + call_time = call_date + timedelta(hours=self.random.randint(0, 23), minutes=self.random.randint(0, 59), seconds=self.random.randint(0, 59)) + duration = self.random.randint(1, 3600) # Duration in seconds, from 1 sec to 1 hour + + new_entry = pd.DataFrame([{ + "caller": caller, + "callee": callee, + "call_date": call_date.strftime('%Y-%m-%d'), + "call_time": call_time.strftime('%H:%M:%S'), + "duration_sec": duration + }]) + + return pd.concat([call_logs_df, new_entry], ignore_index=True) + + def generate_affiliations( + self, + people_df: pd.DataFrame, + percentage_affiliated: float = 0.1, + lambda_param: float = 1.5 + ) -> pd.DataFrame: + """ + Generate affiliations for a subset of the provided DataFrame of people. + + :param people_df: DataFrame of people. + :param percentage_affiliated: Approximate percentage of people to have affiliations. + :param lambda_param: Lambda parameter for the exponential distribution, controlling affiliation spread. + :return: Updated DataFrame with an 'affiliation' column. 
+ """ + num_people = len(people_df) + num_affiliated = int(num_people * percentage_affiliated) + + # Determine number of people affiliated with each group, ensuring sum equals num_affiliated + affiliation_counts = np.random.exponential(lambda_param, len(self.affiliations)) + affiliation_counts = np.round((affiliation_counts / affiliation_counts.sum()) * num_affiliated).astype(int) + + # Adjust in case rounding errors cause a mismatch in total counts + while affiliation_counts.sum() != num_affiliated: + if affiliation_counts.sum() > num_affiliated: + affiliation_counts[np.argmax(affiliation_counts)] -= 1 + else: + affiliation_counts[np.argmin(affiliation_counts)] += 1 + # Assign affiliations to randomly selected people + people_df['affiliation'] = 'None' + already_selected = set() + for count, affiliation in zip(affiliation_counts, self.affiliations): + eligible_indices = [i for i in range(num_people) if i not in already_selected] + selected_indices = self.random.sample(eligible_indices, count) + people_df.loc[selected_indices, 'affiliation'] = affiliation + already_selected.update(selected_indices) + + return people_df + + def assign_whereabouts_to_people( + self, + people_df: pd.DataFrame, + addresses_df: pd.DataFrame, + percent_cohabitating: float = 0.2 + ) -> pd.DataFrame: + # Initially, each person gets a unique address by default (if enough addresses) + if len(addresses_df) >= len(people_df): + people_df = pd.concat([people_df, addresses_df.sample(len(people_df)).reset_index(drop=True)], axis=1) + else: + raise ValueError("Not enough addresses to assign to each person uniquely.") + + # Identify gang-affiliated individuals for potential cohabitation + affiliated_groups = people_df[people_df['affiliation'] != 'None']['affiliation'].unique() + + for gang in affiliated_groups: + gang_members = people_df[people_df['affiliation'] == gang] + # Decide on how many addresses to group gang members at (e.g., 20% of gang members share addresses) + num_addresses = 
int(len(gang_members) * percent_cohabitating) + shared_addresses = addresses_df.sample(num_addresses) + + for idx, address in shared_addresses.iterrows(): + # Randomly select gang members to live together + members_to_live_together = gang_members.sample(n=2 if len(gang_members) > 1 else 1) # At least 2 if possible + for _, member in members_to_live_together.iterrows(): + people_df.loc[member.name, ['address1', 'address2', 'city', 'state', 'zip', 'lat', 'lon', 'date']] = address[['address1', 'address2', 'city', 'state', 'zip', 'lat', 'lon', 'date']] + + # Remove the selected members to avoid reselection + gang_members = gang_members.drop(members_to_live_together.index) + + return people_df + + def expand_cases_to_columns(self, people_df: pd.DataFrame) -> pd.DataFrame: + # Create columns for case details + max_crimes_per_case = 3 # Adjust based on your dataset + for i in range(max_crimes_per_case): + people_df[f'case_number_{i+1}'] = None + for j in range(max_crimes_per_case): + people_df[f'crime_{i+1}_{j+1}'] = None + + for index, row in people_df.iterrows(): + for i, case in enumerate(row['cases']): + if i < max_crimes_per_case: + people_df.at[index, f'case_number_{i+1}'] = case['case_number'] + for j, crime in enumerate(case['crimes']): + if j < max_crimes_per_case: + people_df.at[index, f'crime_{i+1}_{j+1}'] = crime + + # Drop the original 'cases' column if no longer needed + # people_df.drop('cases', axis=1, inplace=True) + + return people_df + + def generate_and_assign_criminal_records( + self, + people_df: pd.DataFrame, + max_cases_per_person: int = 3 + ) -> pd.DataFrame: + unique_case_number = count(start=1000, step=1) # Unique case number generator + criminal_records = [] # To collect criminal record entries + gang_related_cases = {} # To track gang-related case numbers and crimes + + for index, person in people_df.iterrows(): + num_cases = self.random.randint(0, max_cases_per_person) # Decide how many cases, if any + records_for_person = {"person_id": 
index, "cases": []} + + for _ in range(num_cases): + # Determine if this case is shared (for gang members) or unique + if person['affiliation'] != 'None' and gang_related_cases.get(person['affiliation']) and self.random.random() < 0.3: + # Share an existing case + shared_case = self.random.choice(gang_related_cases[person['affiliation']]) + records_for_person["cases"].append(shared_case) + else: + # Create a new case with 1 or more crimes + case_num = next(unique_case_number) + crimes_in_case = self.random.sample(self.crimes, self.random.randint(1, min(3, len(self.crimes)))) # Up to 3 crimes per case, adjust as needed + new_case = {"case_number": case_num, "crimes": crimes_in_case} + records_for_person["cases"].append(new_case) + + # If gang-affiliated, add this case to the gang's record for potential sharing + if person['affiliation'] != 'None': + if person['affiliation'] not in gang_related_cases: + gang_related_cases[person['affiliation']] = [] + gang_related_cases[person['affiliation']].append(new_case) + + criminal_records.append(records_for_person) + + # Convert to DataFrame and merge + criminal_records_df = pd.DataFrame(criminal_records) + people_df = pd.merge(people_df, criminal_records_df, how='left', left_index=True, right_on='person_id') + people_df.drop('person_id', axis=1, inplace=True) + + # Handle individuals with no criminal records + people_df['cases'] = people_df['cases'].apply(lambda x: x if isinstance(x, list) and len(x) > 0 else []) + + return self.expand_cases_to_columns(people_df) \ No newline at end of file diff --git a/demos/data/scripts/generator/domains.txt b/demos/data/scripts/generator/domains.txt new file mode 100644 index 0000000000..b728c28628 --- /dev/null +++ b/demos/data/scripts/generator/domains.txt @@ -0,0 +1,98 @@ +@gmail.com +@yahoo.com +@hotmail.com +@aol.com +@hotmail.co.uk +@hotmail.fr +@msn.com +@yahoo.fr +@wanadoo.fr +@orange.fr +@comcast.net +@yahoo.co.uk +@yahoo.com.br +@yahoo.co.i +@live.com +@rediffmail.com +@free.fr 
+@gmx.de +@web.de +@yandex.ru +@ymail.com +@libero.it +@outlook.com +@uol.com.br +@bol.com.br +@mail.ru +@cox.net +@hotmail.it +@sbcglobal.net +@sfr.fr +@live.fr +@verizon.net +@live.co.uk +@googlemail.co +@yahoo.eu +@ig.com.br +@live.nl +@bigpond.com +@terra.com.br +@yahoo.itdomains +@alice.it +@rocketmail.com +@att.net +@laposte.net +@facebook.com +@bellsouth.net +@yahoo.in +@hotmail.es +@charter.net +@yahoo.ca +@yahoo.com.au +@rambler.ru +@hotmail.de +@tiscali.i +@shaw.co +@yahoo.co.jp +@sky.co +@earthlink.net +@optonline.net +@freenet.de +@t-online.de +@aliceadsl.fr +@virgilio.it +@home.nl +@qq.com +@telenet.be +@me.com +@yahoo.com.ar +@tiscali.co.uk +@yahoo.com.mx +@voila.fr +@gmx.net +@mail.com +@planet.nl +@tin.it +@live.it +@ntlworld.com +@arcor.de +@yahoo.co.id +@frontiernet.net +@hetnet.nl +@live.com.au +@yahoo.com.sg +@zonnet.nl +@club-internet.fr +@juno.com +@optusnet.com.au +@blueyonder.co.uk +@bluewin.ch +@skynet.be +@sympatico.ca +@windstream.net +@mac.com +@centurytel.net +@chello.nl +@live.ca +@aim.com +@bigpond.net.au From bec3343c6355a891c344dfd5c419220a024dc35a Mon Sep 17 00:00:00 2001 From: webcoderz <19884161+webcoderz@users.noreply.github.com> Date: Tue, 26 Mar 2024 13:02:06 -0400 Subject: [PATCH 2/9] adding doctrings --- demos/data/scripts/generator/PersonGen.py | 50 +++++++++++++++++++++-- 1 file changed, 47 insertions(+), 3 deletions(-) diff --git a/demos/data/scripts/generator/PersonGen.py b/demos/data/scripts/generator/PersonGen.py index 0a676827a2..34ec25f3c0 100644 --- a/demos/data/scripts/generator/PersonGen.py +++ b/demos/data/scripts/generator/PersonGen.py @@ -53,6 +53,13 @@ def generate_people( max_age: int = 85 ) -> pd.DataFrame: + """ + Generate a set of people records with basic information. + :param num_records: Number of records to generate. + :param min_age: Minimum age for date of birth generation. + :param max_age: Maximum age for date of birth generation. + :return: DataFrame of people records. 
+ """ records = [] for _ in range(num_records): gender = ["M", "F"] @@ -80,6 +87,13 @@ def generate_addresses( end_date: str = "today" ) -> pd.DataFrame: + """ + Generate addresses for a set of people, simulating a history of addresses. + :param num_records: Number of addresses to generate. + :param start_date: Start date for address history. + :param end_date: End date for address history. + :return: DataFrame of addresses. + """ records = [] for _ in range(num_records): address = self.address() @@ -104,7 +118,13 @@ def generate_call_logs( num_logs: int = 500, start_date: str = '-1y' ) -> pd.DataFrame: - + """ + Generate call logs for a set of people, simulating everyday calls. + :param people_df: DataFrame of people with affiliations. + :param num_logs: Number of call logs to generate. + :param start_date: Start date for call logs. + :return: DataFrame of call logs. + """ call_logs = [] phone_numbers = people_df['phone_number'].tolist() @@ -263,6 +283,15 @@ def assign_whereabouts_to_people( addresses_df: pd.DataFrame, percent_cohabitating: float = 0.2 ) -> pd.DataFrame: + """ + Assign addresses to people, ensuring that gang-affiliated individuals may share addresses. + + :param people_df: DataFrame of people. + :param addresses_df: DataFrame of addresses. + :param percent_cohabitating: Percentage of gang-affiliated individuals who share addresses. + :return: Updated DataFrame with address details. + """ + # Initially, each person gets a unique address by default (if enough addresses) if len(addresses_df) >= len(people_df): people_df = pd.concat([people_df, addresses_df.sample(len(people_df)).reset_index(drop=True)], axis=1) @@ -288,8 +317,15 @@ def assign_whereabouts_to_people( gang_members = gang_members.drop(members_to_live_together.index) return people_df - + def expand_cases_to_columns(self, people_df: pd.DataFrame) -> pd.DataFrame: + """ + Helper function to expand the 'cases' column into multiple columns for case details. 
+ + :param people_df: DataFrame of people with 'cases' column. + :return: Updated DataFrame with expanded case details. + """ + # Create columns for case details max_crimes_per_case = 3 # Adjust based on your dataset for i in range(max_crimes_per_case): @@ -307,7 +343,7 @@ def expand_cases_to_columns(self, people_df: pd.DataFrame) -> pd.DataFrame: # Drop the original 'cases' column if no longer needed # people_df.drop('cases', axis=1, inplace=True) - + return people_df def generate_and_assign_criminal_records( @@ -315,6 +351,14 @@ def generate_and_assign_criminal_records( people_df: pd.DataFrame, max_cases_per_person: int = 3 ) -> pd.DataFrame: + + """ + Generate criminal records for a subset of the provided DataFrame of people. + + :param people_df: DataFrame of people with affiliations. + :param max_cases_per_person: Maximum number of cases to generate for each person. + :return: Updated DataFrame with criminal records. + """ unique_case_number = count(start=1000, step=1) # Unique case number generator criminal_records = [] # To collect criminal record entries gang_related_cases = {} # To track gang-related case numbers and crimes From 2f564bee64f0c57e2c5818d02bf3db7ea17c0609 Mon Sep 17 00:00:00 2001 From: webcoderz <19884161+webcoderz@users.noreply.github.com> Date: Tue, 26 Mar 2024 13:19:16 -0400 Subject: [PATCH 3/9] adding state and zipcode address generation --- demos/data/scripts/generator/PersonGen.py | 35 ++++++++++++++++++++--- 1 file changed, 31 insertions(+), 4 deletions(-) diff --git a/demos/data/scripts/generator/PersonGen.py b/demos/data/scripts/generator/PersonGen.py index 34ec25f3c0..d7227b09e3 100644 --- a/demos/data/scripts/generator/PersonGen.py +++ b/demos/data/scripts/generator/PersonGen.py @@ -41,7 +41,7 @@ def __init__( self.names = NameDataset() self.first_names = self.names.get_top_names(n=self.people_num, country_alpha2=self.country)[self.country] self.last_names = self.names.get_top_names(n=self.people_num, country_alpha2=self.country, 
use_first_names=False)[self.country] - self.address = real_random_address.RandomAddress() + self.domains = pd.read_csv("domains.txt", header=None)[0].to_list() self.affiliations = affiliations self.crimes = crimes @@ -80,23 +80,50 @@ def generate_people( df = pd.DataFrame(records) return df + def get_address(self) -> dict: + # Placeholder for your existing `self.address()` method + return real_random_address.RandomAddress() + + def get_address_by_state(self, state: str) -> dict: + # Placeholder for generating an address by state + # Implement actual functionality here + return real_random_address.real_random_address_by_state(state) + + + def get_address_by_postal_code(self, postal_code: str) -> dict: + # Placeholder for generating an address by postal code + return real_random_address.real_random_address_by_postal_code(postal_code) + def generate_addresses( self, num_records: int = 100, start_date: str = "-30y", - end_date: str = "today" + end_date: str = "today", + state: str = None, + postal_code: str = None ) -> pd.DataFrame: - """ Generate addresses for a set of people, simulating a history of addresses. :param num_records: Number of addresses to generate. :param start_date: Start date for address history. :param end_date: End date for address history. + :param state: Optional state to generate addresses for. + :param postal_code: Optional postal code to generate addresses for. :return: DataFrame of addresses. + :raises ValueError: If both state and postal_code are provided. """ + + if state and postal_code: + raise ValueError("Cannot specify both state and postal code. 
Please choose one.") + records = [] for _ in range(num_records): - address = self.address() + if state: + address = self.get_address_by_state(state) + elif postal_code: + address = self.get_address_by_postal_code(postal_code) + else: + address = self.get_address() record = { "address1": address.get('address1', ''), From 9919a12db26326e61c3221185d3f6c23fff3ef59 Mon Sep 17 00:00:00 2001 From: webcoderz <19884161+webcoderz@users.noreply.github.com> Date: Tue, 26 Mar 2024 13:25:26 -0400 Subject: [PATCH 4/9] adding class doctring --- demos/data/scripts/generator/PersonGen.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/demos/data/scripts/generator/PersonGen.py b/demos/data/scripts/generator/PersonGen.py index d7227b09e3..d91def8ad7 100644 --- a/demos/data/scripts/generator/PersonGen.py +++ b/demos/data/scripts/generator/PersonGen.py @@ -30,6 +30,18 @@ def __init__( ] ): + """ + PersonGenerator: + A class to generate synthetic person records; + including basic information, addresses, call logs, affiliations, and criminal records. + + :param seed: Seed for random number generation. + :param country: Country code for phone number generation. + :param people_amt: Number of people to generate. + :param affiliations: List of affiliations to generate. + :param crimes: List of crimes to generate. 
+ """ + self.seed = seed self.country = country self.people_num = people_amt From e180e9d40004af43301e015d450f876cd4c9b8ff Mon Sep 17 00:00:00 2001 From: webcoderz <19884161+webcoderz@users.noreply.github.com> Date: Tue, 26 Mar 2024 13:50:49 -0400 Subject: [PATCH 5/9] adding data synth libs to setup.py --- setup.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index c81db1b09c..425f68eb4d 100755 --- a/setup.py +++ b/setup.py @@ -43,19 +43,28 @@ def unique_flatten_dict(d): 'jupyter': ['ipython'], } +base_extras_data = { + 'names-dataset': ['names-dataset'], + 'Faker': ['Faker'], + 'random-address': ['random-address'], + 'phone-gen': ['phone-gen'], +} + base_extras_heavy = { 'umap-learn': ['umap-learn', 'dirty-cat==0.2.0', 'scikit-learn>=1.0'], } # https://github.com/facebookresearch/faiss/issues/1589 for faiss-cpu 1.6.1, #'setuptools==67.4.0' removed base_extras_heavy['ai'] = base_extras_heavy['umap-learn'] + ['scipy', 'dgl', 'torch<2', 'sentence-transformers', 'faiss-cpu', 'joblib'] -base_extras = {**base_extras_light, **base_extras_heavy} + +base_extras = {**base_extras_light, **base_extras_heavy, **base_extras_data} extras_require = { **base_extras_light, **base_extras_heavy, **dev_extras, + **base_extras_data, #kitchen sink for users -- not recommended 'all': unique_flatten_dict(base_extras), @@ -63,6 +72,8 @@ def unique_flatten_dict(d): #kitchen sink for contributors, skips ai 'dev': unique_flatten_dict(base_extras_light) + unique_flatten_dict(dev_extras), + #for people data synthesizer + 'data': unique_flatten_dict(base_extras_data), } setup( From d5ea8a0b86b43851c80da2ac2578740b1d36d462 Mon Sep 17 00:00:00 2001 From: webcoderz <19884161+webcoderz@users.noreply.github.com> Date: Mon, 8 Apr 2024 11:18:28 -0400 Subject: [PATCH 6/9] git mv and fix setup.py --- .../scripts/generator/{PersonGen.py => PersonGenerator.py} | 0 setup.py | 6 ++---- 2 files changed, 2 insertions(+), 4 deletions(-) rename 
demos/data/scripts/generator/{PersonGen.py => PersonGenerator.py} (100%) diff --git a/demos/data/scripts/generator/PersonGen.py b/demos/data/scripts/generator/PersonGenerator.py similarity index 100% rename from demos/data/scripts/generator/PersonGen.py rename to demos/data/scripts/generator/PersonGenerator.py diff --git a/setup.py b/setup.py index 425f68eb4d..5f4eea678d 100755 --- a/setup.py +++ b/setup.py @@ -44,12 +44,10 @@ def unique_flatten_dict(d): } base_extras_data = { - 'names-dataset': ['names-dataset'], - 'Faker': ['Faker'], - 'random-address': ['random-address'], - 'phone-gen': ['phone-gen'], + 'data-gen': ['names-dataset', 'faker', 'random-address','phone-gen'] } + base_extras_heavy = { 'umap-learn': ['umap-learn', 'dirty-cat==0.2.0', 'scikit-learn>=1.0'], } From 3eb89fb69903200d81a9ff0adbeb5c922fdad1ba Mon Sep 17 00:00:00 2001 From: webcoderz <19884161+webcoderz@users.noreply.github.com> Date: Mon, 8 Apr 2024 13:47:34 -0400 Subject: [PATCH 7/9] minor fixes --- demos/data/scripts/generator/PersonGenerator.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/demos/data/scripts/generator/PersonGenerator.py b/demos/data/scripts/generator/PersonGenerator.py index d91def8ad7..11a86d1ffa 100644 --- a/demos/data/scripts/generator/PersonGenerator.py +++ b/demos/data/scripts/generator/PersonGenerator.py @@ -88,22 +88,17 @@ def generate_people( } record["email_address"] = record["first_name"] + record["last_name"] + str(self.random.randint(0, 999)) + self.random.choice(self.domains) records.append(record) - + df = pd.DataFrame(records) return df def get_address(self) -> dict: - # Placeholder for your existing `self.address()` method return real_random_address.RandomAddress() def get_address_by_state(self, state: str) -> dict: - # Placeholder for generating an address by state - # Implement actual functionality here return real_random_address.real_random_address_by_state(state) - def get_address_by_postal_code(self, postal_code: str) -> 
dict: - # Placeholder for generating an address by postal code return real_random_address.real_random_address_by_postal_code(postal_code) def generate_addresses( From 9ab127cefc14cc55c0e4057c9fa4260e6c30e23e Mon Sep 17 00:00:00 2001 From: webcoderz <19884161+webcoderz@users.noreply.github.com> Date: Wed, 10 Apr 2024 10:16:50 -0400 Subject: [PATCH 8/9] fix real random address import --- demos/data/scripts/generator/PersonGenerator.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/demos/data/scripts/generator/PersonGenerator.py b/demos/data/scripts/generator/PersonGenerator.py index 11a86d1ffa..26bf21c419 100644 --- a/demos/data/scripts/generator/PersonGenerator.py +++ b/demos/data/scripts/generator/PersonGenerator.py @@ -1,7 +1,7 @@ import pandas as pd import random from faker import Faker -from random_address import real_random_address +import random_address from phone_gen import PhoneNumber from datetime import timedelta from datetime import datetime @@ -51,6 +51,7 @@ def __init__( self.random = random.Random(self.seed) self.phone = PhoneNumber(self.country) self.names = NameDataset() + self.address = random_address self.first_names = self.names.get_top_names(n=self.people_num, country_alpha2=self.country)[self.country] self.last_names = self.names.get_top_names(n=self.people_num, country_alpha2=self.country, use_first_names=False)[self.country] @@ -93,13 +94,13 @@ def generate_people( return df def get_address(self) -> dict: - return real_random_address.RandomAddress() + return self.address.real_random_address() def get_address_by_state(self, state: str) -> dict: - return real_random_address.real_random_address_by_state(state) + return self.address.real_random_address_by_state(state) def get_address_by_postal_code(self, postal_code: str) -> dict: - return real_random_address.real_random_address_by_postal_code(postal_code) + return self.address.real_random_address_by_postal_code(postal_code) def generate_addresses( self, From 
d0483f6bb040e3abbaeaff70125ee68585be25a1 Mon Sep 17 00:00:00 2001 From: webcoderz <19884161+webcoderz@users.noreply.github.com> Date: Mon, 17 Jun 2024 10:55:16 -0400 Subject: [PATCH 9/9] updating the crime network generator seperated the profile generation out into a seperate module that is now a faker factory --- .../generator/CrimeNetworkGenerator.py | 698 ++++++++++++++++++ .../data/scripts/generator/PersonGenerator.py | 434 ----------- .../scripts/generator/ProfileGenerator.py | 89 +++ setup.py | 2 +- 4 files changed, 788 insertions(+), 435 deletions(-) create mode 100644 demos/data/scripts/generator/CrimeNetworkGenerator.py delete mode 100644 demos/data/scripts/generator/PersonGenerator.py create mode 100644 demos/data/scripts/generator/ProfileGenerator.py diff --git a/demos/data/scripts/generator/CrimeNetworkGenerator.py b/demos/data/scripts/generator/CrimeNetworkGenerator.py new file mode 100644 index 0000000000..aaa06a9130 --- /dev/null +++ b/demos/data/scripts/generator/CrimeNetworkGenerator.py @@ -0,0 +1,698 @@ +import pandas as pd +import numpy as np +from sklearn.datasets import make_blobs +import factory.random +from datetime import datetime, timedelta +from ProfileGenerator import ProfileFactory +from scipy.spatial import cKDTree +from itertools import count +import graphistry + + +class PersonNetworkGenerator: + def __init__( + self, + n_kingpins: int = 4, + dealers_per_kingpin: int = 5, + users_per_dealer: int = 3, + dealer_normal_connections: int = 4, + kingpin_normal_connections: int = 3, + within_group_connections: int = 4, + random_connections: int = 3, + max_calls_per_edge: int = 11, + affiliations: list = ['Gang Alpha', 'Cartel Beta', 'Gang Gamma', 'Cartel Delta'], + crimes: list = [ + "Armed Robbery", + "Burglary", + "Drug Trafficking", + "Vandalism", + "Assault", + "Money Laundering", + "Fraud", + "Homicide", + ], + max_crimes_per_case: int = 3, + max_cases_per_person: int = 3, + n_normal: int = 1000, + postal_code: int = None, + state: str 
= None, + call_start_date: str = "2022-1-1", + call_end_date: str = "2023-12-31", + max_num_whereabouts: int = 4, + leader_to_leader_call_chance: float = 0.05, + shared_case_percentage: float = 0.3, + ): + + self.n_kingpins = n_kingpins + self.dealers_per_kingpin = dealers_per_kingpin + self.users_per_dealer = users_per_dealer + self.dealer_normal_connections = dealer_normal_connections + self.kingpin_normal_connections = kingpin_normal_connections + self.within_group_connections = within_group_connections + self.random_connections = random_connections + self.n_normal = n_normal + self.node_df = None + self.edge_df = None + self.labels = None + self.seed = 42 + np.random.seed(self.seed) + factory.random.reseed_random(self.seed) + self.postal_code = postal_code + self.state = state + self.affiliations = affiliations + self.crimes = crimes + self.max_crimes_per_case = max_crimes_per_case + self.max_cases_per_person = max_cases_per_person + self.max_calls_per_edge = max_calls_per_edge + self.call_start_date = call_start_date + self.call_end_date = call_end_date + self.max_num_whereabouts = max_num_whereabouts + self.leader_to_leader_call_chance = leader_to_leader_call_chance + self.shared_case_percentage = shared_case_percentage + + #NETWORK GENERATION + def generate_network(self): + # Generate clusters for kingpins, dealers, and users + X_kingpins, _ = make_blobs( + n_samples=self.n_kingpins, + centers=self.n_kingpins, + cluster_std=1.0, + random_state=self.seed + ) + + X_dealers, _ = make_blobs( + n_samples=self.dealers_per_kingpin * self.n_kingpins, + centers=X_kingpins, + cluster_std=2.5, + random_state=self.seed + ) + + X_users, _ = make_blobs( + n_samples=self.users_per_dealer * self.dealers_per_kingpin * self.n_kingpins, + centers=X_dealers, + cluster_std=3.5, + random_state=self.seed + ) + + X_normal = np.random.rand(self.n_normal, 2) * 100 # Normal people data + + # Combine all data + points = np.vstack([X_kingpins, X_dealers, X_users, X_normal]) + 
self.labels = ['kingpin']*self.n_kingpins + \ + ['dealer']*self.dealers_per_kingpin*self.n_kingpins + \ + ['user']*self.dealers_per_kingpin*self.n_kingpins*self.users_per_dealer + \ + ['normal']*self.n_normal + + # Create DataFrame for nodes + self.node_df = pd.DataFrame(points, columns=['x', 'y']) + self.node_df['node_id'] = range(len(self.node_df)) + self.node_df['type'] = self.labels + + # Assign personal details + self.assign_personal_details(self.postal_code, self.state, self.max_num_whereabouts) + + # assign affiliations + self.assign_affiliations() + + # Generate criminal records + self.generate_and_assign_criminal_records() + + # Generate edges + self.generate_edges() + + # Generate call logs + self.generate_and_assign_call_logs(self.call_start_date, self.call_end_date) + + def calculate_nearest_kingpin(self): + # Extract coordinates for kingpins and dealers + kingpin_coords = self.node_df[self.node_df['type'] == 'kingpin'][['x', 'y']].to_numpy() + dealer_coords = self.node_df[self.node_df['type'] == 'dealer'][['x', 'y']].to_numpy() + + # Find nearest kingpin index for each dealer + nearest_kingpin_indices = self.find_nearest_kingpin_index(dealer_coords, kingpin_coords) + + # Map nearest kingpin indices back to the original DataFrame indices of kingpins + kingpin_df_indices = self.node_df[self.node_df['type'] == 'kingpin'].index.to_numpy() + mapped_kingpin_indices = kingpin_df_indices[nearest_kingpin_indices] + + # Assign the mapped kingpin indices to dealers in the DataFrame + self.node_df.loc[self.node_df['type'] == 'dealer', 'nearest_kingpin_index'] = mapped_kingpin_indices + + def find_nearest_kingpin_index( + self, + dealer_coords: np.array, + kingpin_coords: np.array + ) -> np.array: + # Create a KD-tree for kingpin locations + tree = cKDTree(kingpin_coords) + + # Query the tree for the nearest kingpin to each dealer + # 'query' returns a tuple where the first element is the distance + # and the second element is the index of the nearest kingpin in 
the tree + _, nearest_kingpin_indices = tree.query(dealer_coords, k=1) + + return nearest_kingpin_indices + + def ensure_kingpin_dealer_connectivity(self) -> list: + edge_list = [] + kingpins = self.node_df[self.node_df['type'] == 'kingpin'] + for kingpin_index in kingpins.index: + affiliated_dealers = self.node_df[(self.node_df['type'] == 'dealer') & (self.node_df['affiliation'] == self.node_df.at[kingpin_index, 'affiliation'])].index + # Ensure each kingpin has connections to dealers + if not affiliated_dealers.empty: + selected_dealers = np.random.choice(affiliated_dealers, size=min(3, len(affiliated_dealers)), replace=False) + for dealer_index in selected_dealers: + edge_list.append((kingpin_index, dealer_index)) + return edge_list + + def connect_dealers_to_users(self) -> list: + edge_list = [] + dealers = self.node_df[self.node_df['type'] == 'dealer'] + users = self.node_df[self.node_df['type'] == 'user'].index + for dealer_index in dealers.index: + # Select a random number of users to connect with each dealer + num_connections = self.users_per_dealer # For example, each dealer connects with 2 to 4 users + selected_users = np.random.choice(users, size=num_connections, replace=False) + for user_index in selected_users: + edge_list.append((dealer_index, user_index)) + return edge_list + + def connect_within_group(self) -> list: + # Exclude kingpins and normal individuals for within-group connections + group_nodes = self.node_df[~self.node_df['type'].isin(['kingpin', 'normal'])] + + # Group by affiliation and type + grouped = group_nodes.groupby(['affiliation', 'type']) + + # Initialize an empty list to store edges + edge_list = [] + + # Iterate over each group + for name, group in grouped: + # Generate connections for each node in the group + for node_index in group.index: + # Identify potential connections within the same affiliation and type + potential_connections = group.index[group.index != node_index] + # Randomly select a subset for connections + 
num_connections = np.random.randint(1, self.within_group_connections) # Adjust numbers as needed + if not potential_connections.empty: + selected_connections = np.random.choice(potential_connections, size=min(len(potential_connections), num_connections), replace=False) + # Add connections to the edge list + edge_list.extend([(node_index, connection) for connection in selected_connections]) + + return edge_list + + def connect_randomly(self) -> list: + # Decide randomly if a node should form random connections + nodes_to_connect = self.node_df.index[np.random.rand(len(self.node_df)) < 0.1] + + # Function to generate random connections for a node + def generate_random_connections(node): + # Exclude self-connections + potential_connections = self.node_df.index[self.node_df.index != node] + num_connections = np.random.randint(1, self.random_connections) # Adjust numbers as needed + selected_connections = np.random.choice(potential_connections, size=min(len(potential_connections), num_connections), replace=False) + return [(node, connection) for connection in selected_connections] + + # Generate random connections for each selected node + edge_list = [edge for node in nodes_to_connect for edge in generate_random_connections(node)] + + return edge_list + + def connect_to_normals(self) -> list: + # Define which roles should have connections to normal individuals + roles_with_normal_connections = ['kingpin', 'dealer'] + + # Filter the DataFrame for normal individuals + normal_people = self.node_df[self.node_df['type'] == 'normal'].index + + # Filter the DataFrame for nodes that should have connections to normal individuals + nodes_to_connect = self.node_df[self.node_df['type'].isin(roles_with_normal_connections)] + + # Generate connections for each node + connections = nodes_to_connect.apply(lambda row: self.generate_normal_connections(row, normal_people), axis=1) + + # Flatten the list of connections + edge_list = [item for sublist in connections for item in sublist] + + 
return edge_list + + def generate_normal_connections( + self, + node_row: pd.DataFrame, + normal_people: pd.DataFrame.index + ) -> list: + + # Determine the number of normal connections (e.g., 1-3 for kingpins, 1-2 for dealers) + if node_row['type'] == 'kingpin': + num_connections = np.random.randint(1, self.kingpin_normal_connections) # Kingpins have 1 to 3 normal connections + else: # Dealers + num_connections = np.random.randint(1, self.dealer_normal_connections) # Dealers have 1 to 10 normal connections + + # Select random normal individuals to connect with + selected_normals = np.random.choice(normal_people, size=num_connections, replace=False) + + # Return a list of connections for the node + return [(node_row.name, normal_index) for normal_index in selected_normals] + + def generate_edges(self): + edge_list = [] + + # Initial connections based on affiliations and roles + # Ensure kingpin-dealer connectivity and dealer-user connections + edge_list.extend(self.ensure_kingpin_dealer_connectivity()) + + edge_list.extend(self.connect_dealers_to_users()) + + # Within-group connections + edge_list.extend(self.connect_within_group()) + + # Random connections across the network + edge_list.extend(self.connect_randomly()) + + # Connect kingpins and dealers to normal individuals + edge_list.extend(self.connect_to_normals()) + + # Convert edge list to DataFrame + self.edge_df = pd.DataFrame(edge_list, columns=['src', 'target']) + + def assign_personal_details( + self, + postal_code: str, + state: str, + max_num_whereabouts: int + ) -> None: + + details_df = self.generate_details( + num_records=len(self.node_df), + postal_code=postal_code, + state=state, + num_whereabouts=max_num_whereabouts + ) + self.node_df = pd.concat([self.node_df, details_df], axis=1) + return self.expand_whereabouts_to_columns() + + def flatten_dict(self, d: dict) -> dict: + items = [] + for key, value in d.items(): + if isinstance(value, dict): + items.extend(self.flatten_dict(value).items()) + 
else: + items.append((key, value)) + return dict(items) + + #PROFILE GENERATION + def generate_details( + self, + num_records: int, + postal_code: str, + state: str, + num_whereabouts: int + ) -> pd.DataFrame: + + return pd.DataFrame([self.flatten_dict(profile.to_dict()) for profile in ProfileFactory.create_batch(num_records, postal_code=postal_code, state=state, num_whereabouts=num_whereabouts)]) + + def expand_whereabouts_to_columns(self): + max_whereabouts = self.max_num_whereabouts + + # Create a temporary DataFrame from the 'whereabouts' series + whereabouts_df = self.node_df['whereabouts'].apply(pd.Series) + + # Iterate over the number of whereabouts + for i in range(max_whereabouts): + # Extract whereabouts details for each whereabouts + whereabouts_details_df = whereabouts_df[i].apply(pd.Series) + + # Assign address, from_date, to_date, and other details to the node DataFrame + self.node_df[f'whereabouts_{i+1}_address1'] = whereabouts_details_df['address1'] + self.node_df[f'whereabouts_{i+1}_address2'] = whereabouts_details_df['address2'] + self.node_df[f'whereabouts_{i+1}_city'] = whereabouts_details_df['city'] + self.node_df[f'whereabouts_{i+1}_state'] = whereabouts_details_df['state'] + self.node_df[f'whereabouts_{i+1}_postalCode'] = whereabouts_details_df['postalCode'] + self.node_df[f'whereabouts_{i+1}_coordinates'] = whereabouts_details_df['coordinates'] + # Flatten coordinates into lat and lng + coordinates_df = whereabouts_details_df['coordinates'].apply(pd.Series) + self.node_df[f'whereabouts_{i+1}_lat'] = coordinates_df['lat'] + self.node_df[f'whereabouts_{i+1}_lng'] = coordinates_df['lng'] + # Drop the coordinates column + self.node_df.drop(f'whereabouts_{i+1}_coordinates', axis=1, inplace=True) + + self.node_df[f'whereabouts_{i+1}_from_date'] = whereabouts_details_df['from_date'] + self.node_df[f'whereabouts_{i+1}_to_date'] = whereabouts_details_df['to_date'] + + + # Drop the original 'whereabouts' column + self.node_df.drop('whereabouts', 
axis=1, inplace=True) + # Replace NaN values with None + self.node_df = self.node_df.where(pd.notnull(self.node_df), None) + + @staticmethod + def random_datetime( + year: int, + month: int, + day: int, + hour_start: int, + hour_end: int + ) -> datetime: + + start = datetime(year, month, day, hour_start) + end = datetime(year, month, day, hour_end) + return start + timedelta( + seconds=np.random.randint(0, int((end - start).total_seconds())) + ) + + def assign_affiliations(self): + # Step 1: Assign an affiliation to each kingpin + kingpins = self.node_df[self.node_df['type'] == 'kingpin'] + + shuffled_affiliations = np.random.choice( + self.affiliations, + size=len(self.affiliations), + replace=False + ).tolist() + + for i, index in enumerate(kingpins.index): + if i < len(shuffled_affiliations): + # Assign a unique affiliation to each kingpin + self.node_df.at[index, 'affiliation'] = shuffled_affiliations[i] + else: + # If there are more kingpins than affiliations, assign random affiliations to the remaining kingpins + self.node_df.at[index, 'affiliation'] = np.random.choice(self.affiliations) + + # Step 2: Calculate nearest kingpin for dealers and assign affiliations + self.calculate_nearest_kingpin() + # Ensure dealers inherit their kingpin's affiliation + self.node_df.loc[self.node_df['type'] == 'dealer', 'affiliation'] = self.node_df.loc[self.node_df['type'] == 'dealer', 'nearest_kingpin_index'].map(lambda x: self.node_df.at[x, 'affiliation']) + + # Step 3: Assign 'None' to users and normal individuals + self.node_df.loc[self.node_df['type'].isin(['user', 'normal']), 'affiliation'] = 'None' + + def generate_and_assign_criminal_records(self): + unique_case_number = count(start=1000, step=1) # Unique case number generator + gang_related_cases = {} # To track gang-related case numbers and crimes + + # Generate number of cases for each person + self.node_df['num_cases'] = np.random.randint( + 0, + self.max_cases_per_person + 1, + size=len(self.node_df) + ) + + # 
Generate cases for each person + self.node_df['cases'] = self.node_df.apply( + lambda row: [ + self.generate_case( + row, + gang_related_cases, + unique_case_number + ) + for _ in range(row['num_cases']) + ], + axis=1 + ) + + # Drop the 'num_cases' column as it's no longer needed + self.node_df.drop('num_cases', axis=1, inplace=True) + return self.expand_cases_to_columns() + + def generate_case( + self, + person: pd.DataFrame, + gang_related_cases: dict, + unique_case_number: int + ) -> dict: + # Adjusted logic for determining shared or unique cases + if person['affiliation'] != 'None' and gang_related_cases.get(person['affiliation']) and np.random.random() < self.shared_case_percentage: + shared_case = np.random.choice(gang_related_cases[person['affiliation']]) + return shared_case + else: + case_num = next(unique_case_number) + crimes_in_case = np.random.choice( + self.crimes, + np.random.randint(1, 4), + replace=False + ).tolist() + + new_case = {"case_number": case_num, "crimes": crimes_in_case} + + if person['affiliation'] != 'None': + gang_related_cases.setdefault( + person['affiliation'], + [] + ).append(new_case) + + return new_case + + def expand_cases_to_columns(self): + max_crimes_per_case = self.max_crimes_per_case # Adjust based on your dataset + + # Create a temporary DataFrame from the 'cases' series + cases_df = self.node_df['cases'].apply(pd.Series) + + # Iterate over the number of cases + for i in range(max_crimes_per_case): + # Extract case details for each case + case_details_df = cases_df[i].apply(pd.Series) + + # Assign case number and crimes to the node DataFrame + self.node_df[f'case_number_{i+1}'] = case_details_df['case_number'].astype('Int64') + self.node_df[f'case_number_{i+1}'] = self.node_df[f'case_number_{i+1}'].astype('object') + + # Extract crimes for each case and assign to the node DataFrame + crimes_df = case_details_df['crimes'].apply(pd.Series) + for j in range(max_crimes_per_case): + self.node_df[f'crime_{i+1}_{j+1}'] = 
crimes_df[j] + + # Drop the original 'cases' column + self.node_df.drop('cases', axis=1, inplace=True) + # Replace NaN values with None + self.node_df = self.node_df.where(pd.notnull(self.node_df), None) + + #CALL LOG GENERATION + def generate_phone_numbers(self): + # Assuming self.node_df exists and has been populated + self.teledict = self.node_df['phone'].to_dict() + + def generate_and_assign_call_logs(self, start_date, end_date): + # Parse date strings + start_date = datetime.strptime(start_date, '%Y-%m-%d') \ + if isinstance(start_date, str) else start_date + + end_date = datetime.strptime(end_date, '%Y-%m-%d') \ + if isinstance(end_date, str) else end_date + + # Ensure phone numbers are generated + if not hasattr(self, 'teledict'): + self.generate_phone_numbers() + + # Define a function to generate call logs for a given edge + def generate_call_logs(edge: dict) -> list: + # Check if the edge exists in self.edge_df + if edge['src'] not in self.node_df.index or edge['target'] not in self.node_df.index: + # If the edge doesn't exist, manually set the caller and callee types to 'kingpin' + caller_type = 'kingpin' + callee_type = 'kingpin' + else: + # If the edge does exist, get the caller and callee types from self.node_df + caller_type = self.node_df.loc[edge['src'], 'type'] + callee_type = self.node_df.loc[edge['target'], 'type'] + + # Determine the number of calls for this edge (e.g., 1-10) + num_calls = np.random.randint(1, self.max_calls_per_edge) + + # Assign caller and callee phone numbers + caller = self.teledict[edge['src']] + callee = self.teledict[edge['target']] + # Determine the number of calls for this edge (e.g., 1-10) + + # Check if the call is inter-gang (caller is a kingpin and callee is a dealer from diff gang) + if caller_type == 'kingpin' and callee_type == 'dealer' and self.node_df.loc[edge['src'], 'affiliation'] != self.node_df.loc[edge['target'], 'affiliation']: + call_type = 'inter-gang' + # Check if the call is inter-gang (both nodes are 
kingpins from different gangs) + elif caller_type == 'kingpin' and callee_type == 'kingpin' and self.node_df.loc[edge['src'], 'affiliation'] != self.node_df.loc[edge['target'], 'affiliation']: + call_type = 'inter-gang' + # Check if the call is intra-gang (caller is a kingpin and callee is a dealer from the same gang) + elif caller_type == 'kingpin' and callee_type == 'dealer' and self.node_df.loc[edge['src'], 'affiliation'] == self.node_df.loc[edge['target'], 'affiliation']: + call_type = 'intra-gang' + #dealer to dealer intra-gang + elif caller_type == 'dealer' and callee_type == 'dealer' and self.node_df.loc[edge['src'], 'affiliation'] == self.node_df.loc[edge['target'], 'affiliation']: + call_type = 'intra-gang' + # All other calls are non-affiliated + else: + call_type = 'non-affiliated' + + # Return a list of call logs for this edge + return [{ + 'src': edge['src'], + 'target': edge['target'], + 'caller': caller, + 'callee': callee, + 'call_time': self.random_datetime( + year=start_date.year + np.random.randint(0, (end_date - start_date).days // 365), + month=np.random.randint(1, 13), + day=np.random.randint(1, 29), + hour_start=0 if caller_type in ['user', 'normal'] else 8, + hour_end=23 if caller_type in ['user', 'normal'] else 22 + ).strftime('%Y-%m-%d %H:%M:%S'), + 'duration_minutes': np.random.randint(5, 61) if caller_type in ['user', 'normal'] else np.random.randint(1, 16), + 'call_type': call_type + } for _ in range(num_calls)] + + # Generate call logs for each edge + call_logs = self.edge_df.apply(generate_call_logs, axis=1).tolist() + + # Generate inter-gang calls between kingpins + kingpins = self.node_df[self.node_df['type'] == 'kingpin'] + kingpin_calls = [] + + for i in range(len(kingpins)): + for j in range(i + 1, len(kingpins)): + if kingpins.iloc[i]['affiliation'] != kingpins.iloc[j]['affiliation'] and np.random.random() < self.leader_to_leader_call_chance: # 5% chance of a call + edge = {'src': kingpins.index[i], 'target': kingpins.index[j]} 
# Use index here + kingpin_calls.append(generate_call_logs(edge)) + + kg_calls = pd.DataFrame(kingpin_calls) + call_logs_df = pd.DataFrame(call_logs) + # Flatten the DataFrame + flattened_df = pd.json_normalize( + call_logs_df.apply(lambda x: x.tolist(), axis=1) + .explode() + .dropna() + .tolist() + ) + + flattened_king_df = pd.json_normalize( + kg_calls.apply(lambda x: x.tolist(), axis=1) + .explode() + .dropna() + .tolist() + ) + + # Drop rows and columns that are entirely NaN + flattened_df = flattened_df \ + .dropna(axis=0, how='all') \ + .dropna(axis=1, how='all') + + flattened_king_df = flattened_king_df \ + .dropna(axis=0, how='all') \ + .dropna(axis=1, how='all') + + # Assign the flattened DataFrame to self.edge_df + self.edge_df = pd.concat([flattened_king_df, flattened_df]) + + def to_graph( + self, + size_dict: dict = None, + edge_influence: int = 7, + icon_mapping: dict = None, + color_mapping: dict = None + ) -> graphistry.plotter.Plotter: + + ndf = self.node_df.copy() + edf = self.edge_df.copy() + + edge_counts = edf.groupby(['src', 'target', 'call_type']) \ + .size() \ + .reset_index(name='weight') + + # Default size_dict if none is provided + if size_dict is None: + size_dict = {'kingpin': 200, 'dealer': 75, 'user': 50, 'normal': 25} + + ndf['size'] = ndf['type'].map(size_dict) + + # Default icon_mapping if none is provided + if icon_mapping is None: + icon_mapping = { + 'kingpin': 'user-o', + 'dealer': 'user-md', + 'user': 'users', + 'normal': 'universal-access', + } + + # Default color_mapping if none is provided + if color_mapping is None: + color_mapping = { + 'non-affiliated': 'blue', + 'intra-gang': 'red', + 'inter-gang': 'orange' + } + + g = ( + graphistry.nodes(ndf, 'node_id') + .edges(edge_counts, 'src', 'target') + .bind(point_title='type', point_size='size') + .bind(edge_weight="weight", edge_color="call_type") + .settings(url_params={'edgeInfluence': edge_influence}) + .encode_point_icon('type', categorical_mapping=icon_mapping) + 
.encode_edge_color( + 'call_type', + categorical_mapping=color_mapping, + default_mapping='#CCC' + ) + ) + + return g + + def get_dealer_to_user_edges_and_nodes( + self, + affiliated_nodes: pd.DataFrame + ) -> tuple: + # Filter the node DataFrame to only include dealers + affiliated_dealers = affiliated_nodes[affiliated_nodes['type'] == 'dealer'] + + # Join the edges and nodes dataframes on the 'target' column + edges_with_node_types = self.edge_df.merge(self.node_df[['node_id', 'type']], left_on='target', right_on='node_id', how='left') + + # Filter the joined dataframe to only include edges from dealers to users + dealer_to_user_edges_df = edges_with_node_types[(edges_with_node_types['src'].isin(affiliated_dealers['node_id'])) & (edges_with_node_types['type'] == 'user')] + + # Create the dealer to user edges + dealer_to_user_edges = dealer_to_user_edges_df[['src', 'target']].copy() + dealer_to_user_edges['role'] = 'user' + dealer_to_user_edges['affiliation'] = dealer_to_user_edges['src'].map(affiliated_dealers['affiliation']) + + # Get the user nodes + user_nodes = self.node_df[self.node_df['node_id'].isin(dealer_to_user_edges['target'])] + + return dealer_to_user_edges, user_nodes + + def to_tree(self, affiliation: str) -> graphistry.plotter.Plotter: + # Filter the node DataFrame by the specified affiliation + affiliated_nodes = self.node_df[self.node_df['affiliation'] == affiliation].copy() + affiliated_nodes.loc[:, "node_label"] = affiliated_nodes["first_name"] + " " + affiliated_nodes["last_name"] + + dealer_to_user_edges, user_nodes = self.get_dealer_to_user_edges_and_nodes(affiliated_nodes) + + user_nodes = pd.DataFrame(user_nodes) + user_nodes.loc[:, "node_label"] = user_nodes["first_name"] + " " + user_nodes["last_name"] + + # Get the kingpin node + kingpin_node = affiliated_nodes[affiliated_nodes['type'] == 'kingpin']['node_id'].values[0] + + # Add dealer nodes and edges to the dataframes based on the affiliations + dealer_nodes = 
affiliated_nodes[affiliated_nodes['type'] == 'dealer'] + dealer_edges = pd.DataFrame({ + 'src': kingpin_node, + 'target': dealer_nodes['node_id'], + 'role': dealer_nodes['type'], + 'affiliation': dealer_nodes['affiliation'] + }) + + # Add dealer to user edges to the new_edges DataFrame + new_edges = pd.concat([dealer_edges, dealer_to_user_edges]) + + # Add user nodes to the new_nodes DataFrame + new_nodes = pd.concat([affiliated_nodes, user_nodes]) + + g = graphistry.bind( + source='src', + destination='target', + node='node_id', + point_title='node_label' + ).edges(new_edges).nodes(new_nodes) + g = g.encode_point_color('type', categorical_mapping={'kingpin': 'red', 'dealer': 'blue', 'user': 'green'}, default_mapping='gray') + g = g.encode_point_icon('type', categorical_mapping={'kingpin': 'user-o', 'dealer': 'user-md', 'user': 'users'}) + g = g.settings(url_params={'play': 0, "edgeCurvature": 0.0}) + g = g.tree_layout(width=100, height=50) + return g \ No newline at end of file diff --git a/demos/data/scripts/generator/PersonGenerator.py b/demos/data/scripts/generator/PersonGenerator.py deleted file mode 100644 index 26bf21c419..0000000000 --- a/demos/data/scripts/generator/PersonGenerator.py +++ /dev/null @@ -1,434 +0,0 @@ -import pandas as pd -import random -from faker import Faker -import random_address -from phone_gen import PhoneNumber -from datetime import timedelta -from datetime import datetime -import numpy as np -from itertools import count -from names_dataset import NameDataset - - -class PersonGenerator: - - def __init__( - self, - seed: int = 0, - country: str = 'US', - people_amt: int = 100, - affiliations: list = ['Gang Alpha', 'Cartel Beta', 'Gang Gamma', 'Cartel Delta'], - crimes: list = [ - "Armed Robbery", - "Burglary", - "Drug Trafficking", - "Vandalism", - "Assault", - "Money Laundering", - "Fraud", - "Homicide", - ] - ): - - """ - PersonGenerator: - A class to generate synthetic person records; - including basic information, addresses, call 
logs, affiliations, and criminal records. - - :param seed: Seed for random number generation. - :param country: Country code for phone number generation. - :param people_amt: Number of people to generate. - :param affiliations: List of affiliations to generate. - :param crimes: List of crimes to generate. - """ - - self.seed = seed - self.country = country - self.people_num = people_amt - Faker.seed(self.seed) - np.random.seed(self.seed) - self.fake = Faker() - self.random = random.Random(self.seed) - self.phone = PhoneNumber(self.country) - self.names = NameDataset() - self.address = random_address - self.first_names = self.names.get_top_names(n=self.people_num, country_alpha2=self.country)[self.country] - self.last_names = self.names.get_top_names(n=self.people_num, country_alpha2=self.country, use_first_names=False)[self.country] - - self.domains = pd.read_csv("domains.txt", header=None)[0].to_list() - self.affiliations = affiliations - self.crimes = crimes - - def generate_people( - self, - num_records: int = 100, - min_age: int = 15, - max_age: int = 85 - ) -> pd.DataFrame: - - """ - Generate a set of people records with basic information. - :param num_records: Number of records to generate. - :param min_age: Minimum age for date of birth generation. - :param max_age: Maximum age for date of birth generation. - :return: DataFrame of people records. 
- """ - records = [] - for _ in range(num_records): - gender = ["M", "F"] - sex = self.random.choice(gender) - record = { - "first_name": self.random.choice(self.first_names[sex]), - "last_name": self.random.choice(self.last_names), - "phone_number": self.phone.get_number(full=False), - "sex": sex, - "DOB": self.fake.date_of_birth( - minimum_age=min_age, - maximum_age=max_age - ), - } - record["email_address"] = record["first_name"] + record["last_name"] + str(self.random.randint(0, 999)) + self.random.choice(self.domains) - records.append(record) - - df = pd.DataFrame(records) - return df - - def get_address(self) -> dict: - return self.address.real_random_address() - - def get_address_by_state(self, state: str) -> dict: - return self.address.real_random_address_by_state(state) - - def get_address_by_postal_code(self, postal_code: str) -> dict: - return self.address.real_random_address_by_postal_code(postal_code) - - def generate_addresses( - self, - num_records: int = 100, - start_date: str = "-30y", - end_date: str = "today", - state: str = None, - postal_code: str = None - ) -> pd.DataFrame: - """ - Generate addresses for a set of people, simulating a history of addresses. - :param num_records: Number of addresses to generate. - :param start_date: Start date for address history. - :param end_date: End date for address history. - :param state: Optional state to generate addresses for. - :param postal_code: Optional postal code to generate addresses for. - :return: DataFrame of addresses. - :raises ValueError: If both state and postal_code are provided. - """ - - if state and postal_code: - raise ValueError("Cannot specify both state and postal code. 
Please choose one.") - - records = [] - for _ in range(num_records): - if state: - address = self.get_address_by_state(state) - elif postal_code: - address = self.get_address_by_postal_code(postal_code) - else: - address = self.get_address() - - record = { - "address1": address.get('address1', ''), - "address2": address.get('address2', ''), - "city": address.get("city", "Unknown City"), - "date": self.fake.date_between(start_date=start_date, end_date=end_date), - "state": address.get("state", "Unknown State"), - "zip": address.get("postalCode", "Unknown PostalCode"), - "lat": address.get("coordinates", {}).get("lat", 0.0), - "lon": address.get("coordinates", {}).get("lng", 0.0) - } - records.append(record) - df = pd.DataFrame(records) - return df - - def generate_call_logs( - self, - people_df: pd.DataFrame, - num_logs: int = 500, - start_date: str = '-1y' - ) -> pd.DataFrame: - """ - Generate call logs for a set of people, simulating everyday calls. - :param people_df: DataFrame of people with affiliations. - :param num_logs: Number of call logs to generate. - :param start_date: Start date for call logs. - :return: DataFrame of call logs. 
- """ - call_logs = [] - phone_numbers = people_df['phone_number'].tolist() - - for _ in range(num_logs): - caller, callee = self.random.sample(phone_numbers, 2) # Ensure caller and callee are different - call_date = self.fake.date_time_between(start_date=start_date) - call_time = call_date + timedelta(hours=self.random.randint(0, 23), minutes=self.random.randint(0, 59), seconds=self.random.randint(0, 59)) - duration = self.random.randint(1, 3600) # Call duration in seconds, from 1 sec to 1 hour - - call_logs.append({ - "caller": caller, - "callee": callee, - "call_date": call_date.strftime('%Y-%m-%d'), - "call_time": call_time.strftime('%H:%M:%S'), - "duration_sec": duration - }) - - return pd.DataFrame(call_logs) - - def generate_non_affiliated_call_logs( - self, - people_df: pd.DataFrame, - call_logs_df: pd.DataFrame, - num_calls: int = 500, - start_date: str = '-1y' - ) -> pd.DataFrame: - """ - Generate call logs for non-affiliated individuals, simulating everyday calls. - - :param people_df: DataFrame of people with affiliations. - :param call_logs_df: DataFrame of call logs to append to. - :param num_calls: Number of calls to generate among non-affiliated individuals. - :return: Updated DataFrame with non-affiliated call logs. - """ - # Filter for non-affiliated individuals - non_affiliated_people = people_df[people_df['affiliation'] == 'None'] - - # Generate call logs - for _ in range(num_calls): - if len(non_affiliated_people) > 1: - caller, callee = non_affiliated_people.sample(n=2, replace=False)['phone_number'].values - self.add_call_log(call_logs_df, caller, callee, start_date) - - return call_logs_df - - def generate_affiliated_call_logs( - self, - people_df: pd.DataFrame, - call_logs_df: pd.DataFrame, - num_affiliated_calls: int = 100, - leader_call_percentage: float = 0.05, - start_date: str = '-1y' - ) -> pd.DataFrame: - """ - Generate call logs with a focus on gang affiliations, including both intra-gang and inter-gang communications. 
def add_call_log(
    self,
    call_logs_df: pd.DataFrame,
    caller: str,
    callee: str,
    start_date: str,
) -> pd.DataFrame:
    """
    Append a single call-log entry to ``call_logs_df`` in place.

    Bug fix: the original built a new DataFrame with ``pd.concat`` and
    returned it, but the call sites in the affiliated-call generator invoked
    this method without assigning the result, so every generated call was
    silently discarded.  The entry is now written into ``call_logs_df``
    itself; the frame is still returned for backward compatibility.

    :param call_logs_df: DataFrame the new entry is appended to (mutated).
    :param caller: Phone number of the calling party.
    :param callee: Phone number of the called party.
    :param start_date: Earliest datetime (passed to Faker) for the call.
    :return: The same DataFrame, with one extra row.
    """
    call_date = self.fake.date_time_between(start_date=start_date)
    # NOTE(review): adding random hours/minutes can roll call_time past
    # midnight, so the formatted time may not belong to call_date's day —
    # kept as in the original to preserve the distribution; confirm intent.
    call_time = call_date + timedelta(
        hours=self.random.randint(0, 23),
        minutes=self.random.randint(0, 59),
        seconds=self.random.randint(0, 59),
    )
    duration = self.random.randint(1, 3600)  # seconds: 1 sec .. 1 hour

    new_row = {
        "caller": caller,
        "callee": callee,
        "call_date": call_date.strftime('%Y-%m-%d'),
        "call_time": call_time.strftime('%H:%M:%S'),
        "duration_sec": duration,
    }
    # Per-column .loc assignment enlarges the frame in place and also
    # creates the columns when the frame starts out without them.
    row_idx = len(call_logs_df)
    for column, value in new_row.items():
        call_logs_df.loc[row_idx, column] = value
    return call_logs_df


def generate_affiliations(
    self,
    people_df: pd.DataFrame,
    percentage_affiliated: float = 0.1,
    lambda_param: float = 1.5,
) -> pd.DataFrame:
    """
    Generate affiliations for a subset of the provided DataFrame of people.

    :param people_df: DataFrame of people (mutated: gains 'affiliation').
    :param percentage_affiliated: Approximate fraction of people to affiliate.
    :param lambda_param: Parameter of the exponential draw controlling how
        unevenly members are spread across groups.
    :return: Updated DataFrame with an 'affiliation' column.
    """
    num_people = len(people_df)
    num_affiliated = int(num_people * percentage_affiliated)

    # Draw a raw (unnormalised) size for every group.
    # NOTE(review): np.random.exponential takes a *scale* (= 1/lambda) and
    # bypasses self.random / the instance seed — confirm whether that is
    # intentional before relying on reproducibility.
    raw_sizes = np.random.exponential(lambda_param, len(self.affiliations))
    group_sizes = np.round(
        (raw_sizes / raw_sizes.sum()) * num_affiliated
    ).astype(int)

    # Rounding can make the total drift off target; nudge the largest /
    # smallest group until the counts add up exactly.
    while group_sizes.sum() != num_affiliated:
        if group_sizes.sum() > num_affiliated:
            group_sizes[np.argmax(group_sizes)] -= 1
        else:
            group_sizes[np.argmin(group_sizes)] += 1

    # Assign affiliations to randomly selected, not-yet-affiliated people.
    # (Loop variable renamed from 'count', which shadowed the module-level
    # itertools.count import.)
    people_df['affiliation'] = 'None'
    already_selected: set = set()
    for group_size, affiliation in zip(group_sizes, self.affiliations):
        eligible_indices = [
            i for i in range(num_people) if i not in already_selected
        ]
        selected_indices = self.random.sample(eligible_indices, int(group_size))
        people_df.loc[selected_indices, 'affiliation'] = affiliation
        already_selected.update(selected_indices)

    return people_df


def assign_whereabouts_to_people(
    self,
    people_df: pd.DataFrame,
    addresses_df: pd.DataFrame,
    percent_cohabitating: float = 0.2,
) -> pd.DataFrame:
    """
    Assign addresses to people, letting some gang-affiliated individuals
    share an address.

    :param people_df: DataFrame of people with an 'affiliation' column.
    :param addresses_df: DataFrame of addresses; assumed to carry the
        columns address1/address2/city/state/zip/lat/lon/date — TODO confirm
        against the address generator.
    :param percent_cohabitating: Fraction of each gang that shares addresses.
    :return: New DataFrame with address columns attached.
    :raises ValueError: If there are fewer addresses than people.
    """
    if len(addresses_df) < len(people_df):
        raise ValueError("Not enough addresses to assign to each person uniquely.")

    # Everyone starts out with their own randomly sampled address.
    people_df = pd.concat(
        [people_df, addresses_df.sample(len(people_df)).reset_index(drop=True)],
        axis=1,
    )

    address_cols = ['address1', 'address2', 'city', 'state', 'zip', 'lat', 'lon', 'date']
    affiliated_groups = people_df[people_df['affiliation'] != 'None']['affiliation'].unique()

    for gang in affiliated_groups:
        gang_members = people_df[people_df['affiliation'] == gang]
        # Roughly percent_cohabitating of the gang ends up sharing a roof.
        num_addresses = int(len(gang_members) * percent_cohabitating)
        shared_addresses = addresses_df.sample(num_addresses)

        for _, address in shared_addresses.iterrows():
            if gang_members.empty:
                # Guard: all members already placed; nothing left to group.
                break
            # At least 2 members per shared address when possible.
            members_to_live_together = gang_members.sample(
                n=2 if len(gang_members) > 1 else 1
            )
            for _, member in members_to_live_together.iterrows():
                people_df.loc[member.name, address_cols] = address[address_cols]
            # Drop placed members so they are not grouped twice.
            gang_members = gang_members.drop(members_to_live_together.index)

    return people_df


def expand_cases_to_columns(self, people_df: pd.DataFrame) -> pd.DataFrame:
    """
    Expand the 'cases' column into flat case_number_i / crime_i_j columns.

    :param people_df: DataFrame of people with a 'cases' column, where each
        entry is a list of {'case_number': int, 'crimes': [str, ...]} dicts.
    :return: Updated DataFrame with the expanded case detail columns.
    """
    # Caps both the number of cases expanded per person and the number of
    # crimes expanded per case; adjust to fit the dataset.
    max_crimes_per_case = 3
    for i in range(max_crimes_per_case):
        people_df[f'case_number_{i+1}'] = None
        for j in range(max_crimes_per_case):
            people_df[f'crime_{i+1}_{j+1}'] = None

    for index, row in people_df.iterrows():
        # Tolerate NaN / non-list entries from partial merges.
        cases = row['cases'] if isinstance(row['cases'], list) else []
        for i, case in enumerate(cases[:max_crimes_per_case]):
            people_df.at[index, f'case_number_{i+1}'] = case['case_number']
            for j, crime in enumerate(case['crimes'][:max_crimes_per_case]):
                people_df.at[index, f'crime_{i+1}_{j+1}'] = crime

    # The raw 'cases' column is intentionally kept for downstream use.
    return people_df


def generate_and_assign_criminal_records(
    self,
    people_df: pd.DataFrame,
    max_cases_per_person: int = 3,
) -> pd.DataFrame:
    """
    Generate criminal records for a subset of the provided people, letting
    gang members occasionally share a case with fellow members.

    :param people_df: DataFrame of people with an 'affiliation' column.
    :param max_cases_per_person: Maximum number of cases per person.
    :return: Updated DataFrame with criminal records expanded to columns.
    """
    unique_case_number = count(start=1000, step=1)  # unique case numbers
    criminal_records = []          # one {'person_id', 'cases'} dict per person
    gang_related_cases = {}        # affiliation -> cases open for sharing

    for index, person in people_df.iterrows():
        num_cases = self.random.randint(0, max_cases_per_person)
        records_for_person = {"person_id": index, "cases": []}
        affiliation = person['affiliation']

        for _ in range(num_cases):
            # ~30% of the time a gang member re-uses an existing gang case.
            if (
                affiliation != 'None'
                and gang_related_cases.get(affiliation)
                and self.random.random() < 0.3
            ):
                shared_case = self.random.choice(gang_related_cases[affiliation])
                records_for_person["cases"].append(shared_case)
            else:
                # New case with 1..3 distinct crimes.
                case_num = next(unique_case_number)
                crimes_in_case = self.random.sample(
                    self.crimes,
                    self.random.randint(1, min(3, len(self.crimes))),
                )
                new_case = {"case_number": case_num, "crimes": crimes_in_case}
                records_for_person["cases"].append(new_case)
                # Gang cases become candidates for sharing later.
                if affiliation != 'None':
                    gang_related_cases.setdefault(affiliation, []).append(new_case)

        criminal_records.append(records_for_person)

    # Merge the generated records back onto the people frame by index.
    criminal_records_df = pd.DataFrame(criminal_records)
    people_df = pd.merge(
        people_df, criminal_records_df,
        how='left', left_index=True, right_on='person_id',
    )
    people_df.drop('person_id', axis=1, inplace=True)

    # People with no record get an empty list instead of NaN.
    people_df['cases'] = people_df['cases'].apply(
        lambda x: x if isinstance(x, list) and len(x) > 0 else []
    )

    return self.expand_cases_to_columns(people_df)
import factory
from datetime import datetime, timedelta
from functools import lru_cache
from pathlib import Path

import pandas as pd
import numpy as np
import random_address


@lru_cache(maxsize=1)
def _load_domains() -> tuple:
    """Load the candidate e-mail domains once and cache them.

    Bug fix: the original re-read ``domains.txt`` from the *current working
    directory* for every generated profile.  The file is now resolved next
    to this script and parsed a single time.
    """
    path = Path(__file__).with_name('domains.txt')
    return tuple(pd.read_csv(path, header=None)[0].to_list())


class Profile:
    """Plain container for one synthetic person produced by ProfileFactory."""

    def __init__(
        self,
        firstname,
        lastname,
        phone_number,
        username,
        email,
        address,
        dob,
        whereabouts,
        num_whereabouts=None,
        postal_code=None,
        state=None,
        rand_num=None
    ):
        # Optional generation knobs (state/postal_code constrain addresses;
        # rand_num disambiguates usernames/e-mails).
        self.postal_code = postal_code
        self.state = state
        self.username = username
        self.email = email
        self.firstname = firstname
        self.lastname = lastname
        self.phone_number = phone_number
        self.address = address
        self.DOB = dob
        self.whereabouts = whereabouts
        self.rand_num = rand_num
        self.num_whereabouts = num_whereabouts

    def to_dict(self):
        """Return the public fields as a plain dict (e.g. for a DataFrame row)."""
        return {"first_name": self.firstname,
                "last_name": self.lastname,
                "user_name": self.username,
                "DOB": self.DOB,
                "email": self.email,
                "phone": self.phone_number,
                "address": self.address,
                "whereabouts": self.whereabouts
                }

    def __str__(self):
        return str(self.__dict__)


# Factory that fabricates randomized Profile instances.
class ProfileFactory(factory.Factory):
    class Meta:
        model = Profile

    # Optional parameters for address generation.
    state = None
    postal_code = None
    num_whereabouts = None
    rand_num = factory.LazyFunction(lambda: str(np.random.randint(0, 999)))
    username = factory.LazyAttribute(
        lambda obj: f"{obj.firstname}.{obj.lastname}{obj.rand_num}".lower())
    # NOTE(review): rand_num is prepended to the *domain* here, yielding
    # addresses like john.doe@123example.com — looks odd but kept as-is;
    # confirm whether it was meant to go in the local part instead.
    email = factory.LazyAttribute(
        lambda obj: f"{obj.firstname}.{obj.lastname}@"
                    f"{str(obj.rand_num) + np.random.choice(_load_domains())}".lower())
    # Uniform age between roughly 15 and 85 years.
    dob = factory.LazyFunction(
        lambda: (datetime.today()
                 - timedelta(days=np.random.randint(15 * 365, 85 * 365))
                 ).strftime('%m-%d-%Y'))
    firstname = factory.Faker('first_name')
    lastname = factory.Faker('last_name')
    phone_number = factory.Faker('basic_phone_number', locale="en_US")
    # Current address: occupied from some point in the last year until today.
    address = factory.LazyAttribute(
        lambda obj: ProfileFactory.generate_address(
            state=obj.state,
            postal_code=obj.postal_code,
            from_date=(datetime.today()
                       - timedelta(days=np.random.randint(0, 365))
                       ).strftime('%m-%d-%Y'),
            to_date=datetime.today().strftime('%m-%d-%Y')))
    # Past addresses, 1-5 years back.  Bug fix: num_whereabouts defaults to
    # None, and range(None) raised TypeError on every ProfileFactory() call
    # that did not set it explicitly; 'or 0' makes the default mean "none".
    whereabouts = factory.LazyAttribute(
        lambda obj: [ProfileFactory.generate_address(
                         state=obj.state,
                         postal_code=obj.postal_code,
                         from_date=(datetime.today()
                                    - timedelta(days=np.random.randint(365, 365 * 5))
                                    ).strftime('%m-%d-%Y'),
                         to_date=(datetime.today()
                                  - timedelta(days=np.random.randint(0, 365))
                                  ).strftime('%m-%d-%Y'))
                     for _ in range(obj.num_whereabouts or 0)])

    @staticmethod
    def generate_address(state=None, postal_code=None, from_date=None, to_date=None) -> dict:
        """
        Generate a random real-world address, optionally constrained to a
        state or a postal code (but not both), and stamp it with an
        occupancy interval.

        :param state: Two-letter state code to constrain the address to.
        :param postal_code: Postal code to constrain the address to.
        :param from_date: Start of occupancy (stored on the address dict).
        :param to_date: End of occupancy (stored on the address dict).
        :return: Address dict with 'from_date'/'to_date' keys added.
        :raises ValueError: If both state and postal_code are given.
        """
        if state and postal_code:
            raise ValueError("Cannot specify both state and postal code. Please choose one.")
        elif state:
            address = random_address.real_random_address_by_state(state)
        elif postal_code:
            address = random_address.real_random_address_by_postal_code(postal_code)
        else:
            address = random_address.real_random_address()

        # Attach the occupancy window to the address record.
        address['from_date'] = from_date
        address['to_date'] = to_date

        return address