From aa377aca70fc89f11bf8c12932ac865dd01f83c7 Mon Sep 17 00:00:00 2001 From: webcoderz <19884161+webcoderz@users.noreply.github.com> Date: Tue, 26 Mar 2024 12:52:27 -0400 Subject: [PATCH 1/9] had to edit the gitignore to get the generator and domains.txt to commit --- .gitignore | 22 +- demos/data/scripts/generator/PersonGen.py | 355 ++++++++++++++++++++++ demos/data/scripts/generator/domains.txt | 98 ++++++ 3 files changed, 474 insertions(+), 1 deletion(-) create mode 100644 demos/data/scripts/generator/PersonGen.py create mode 100644 demos/data/scripts/generator/domains.txt diff --git a/.gitignore b/.gitignore index f8a1ee9544..7fe7d02743 100644 --- a/.gitignore +++ b/.gitignore @@ -25,7 +25,27 @@ lib64/ parts/ sdist/ var/ -data/ +#data/ +data/benchmarking/ +data/img/ +data/appearances.txt +data/characters.txt +data/comics.txt +data/facebook_combined.txt +data/honeypot.csv +data/lesmiserables.csv +data/samplegraph.json +data/transactions.csv +data/twitterDemo.csv +data/demos_by_use_case/ +data/demos_databases_apis +data/gfql/ +data/more_examples/ +data/talks/ +data/for_analysis.ipynb +data/for_developers.ipynb +data/upload_csv_miniapp.ipynb + *.egg-info/ .installed.cfg *.egg diff --git a/demos/data/scripts/generator/PersonGen.py b/demos/data/scripts/generator/PersonGen.py new file mode 100644 index 0000000000..0a676827a2 --- /dev/null +++ b/demos/data/scripts/generator/PersonGen.py @@ -0,0 +1,355 @@ +import pandas as pd +import random +from faker import Faker +from random_address import real_random_address +from phone_gen import PhoneNumber +from datetime import timedelta +from datetime import datetime +import numpy as np +from itertools import count +from names_dataset import NameDataset + + +class PersonGenerator: + + def __init__( + self, + seed: int = 0, + country: str = 'US', + people_amt: int = 100, + affiliations: list = ['Gang Alpha', 'Cartel Beta', 'Gang Gamma', 'Cartel Delta'], + crimes: list = [ + "Armed Robbery", + "Burglary", + "Drug Trafficking", + 
"Vandalism", + "Assault", + "Money Laundering", + "Fraud", + "Homicide", + ] + ): + + self.seed = seed + self.country = country + self.people_num = people_amt + Faker.seed(self.seed) + np.random.seed(self.seed) + self.fake = Faker() + self.random = random.Random(self.seed) + self.phone = PhoneNumber(self.country) + self.names = NameDataset() + self.first_names = self.names.get_top_names(n=self.people_num, country_alpha2=self.country)[self.country] + self.last_names = self.names.get_top_names(n=self.people_num, country_alpha2=self.country, use_first_names=False)[self.country] + self.address = real_random_address.RandomAddress() + self.domains = pd.read_csv("domains.txt", header=None)[0].to_list() + self.affiliations = affiliations + self.crimes = crimes + + def generate_people( + self, + num_records: int = 100, + min_age: int = 15, + max_age: int = 85 + ) -> pd.DataFrame: + + records = [] + for _ in range(num_records): + gender = ["M", "F"] + sex = self.random.choice(gender) + record = { + "first_name": self.random.choice(self.first_names[sex]), + "last_name": self.random.choice(self.last_names), + "phone_number": self.phone.get_number(full=False), + "sex": sex, + "DOB": self.fake.date_of_birth( + minimum_age=min_age, + maximum_age=max_age + ), + } + record["email_address"] = record["first_name"] + record["last_name"] + str(self.random.randint(0, 999)) + self.random.choice(self.domains) + records.append(record) + + df = pd.DataFrame(records) + return df + + def generate_addresses( + self, + num_records: int = 100, + start_date: str = "-30y", + end_date: str = "today" + ) -> pd.DataFrame: + + records = [] + for _ in range(num_records): + address = self.address() + + record = { + "address1": address.get('address1', ''), + "address2": address.get('address2', ''), + "city": address.get("city", "Unknown City"), + "date": self.fake.date_between(start_date=start_date, end_date=end_date), + "state": address.get("state", "Unknown State"), + "zip": address.get("postalCode", 
"Unknown PostalCode"), + "lat": address.get("coordinates", {}).get("lat", 0.0), + "lon": address.get("coordinates", {}).get("lng", 0.0) + } + records.append(record) + df = pd.DataFrame(records) + return df + + def generate_call_logs( + self, + people_df: pd.DataFrame, + num_logs: int = 500, + start_date: str = '-1y' + ) -> pd.DataFrame: + + call_logs = [] + phone_numbers = people_df['phone_number'].tolist() + + for _ in range(num_logs): + caller, callee = self.random.sample(phone_numbers, 2) # Ensure caller and callee are different + call_date = self.fake.date_time_between(start_date=start_date) + call_time = call_date + timedelta(hours=self.random.randint(0, 23), minutes=self.random.randint(0, 59), seconds=self.random.randint(0, 59)) + duration = self.random.randint(1, 3600) # Call duration in seconds, from 1 sec to 1 hour + + call_logs.append({ + "caller": caller, + "callee": callee, + "call_date": call_date.strftime('%Y-%m-%d'), + "call_time": call_time.strftime('%H:%M:%S'), + "duration_sec": duration + }) + + return pd.DataFrame(call_logs) + + def generate_non_affiliated_call_logs( + self, + people_df: pd.DataFrame, + call_logs_df: pd.DataFrame, + num_calls: int = 500, + start_date: str = '-1y' + ) -> pd.DataFrame: + """ + Generate call logs for non-affiliated individuals, simulating everyday calls. + + :param people_df: DataFrame of people with affiliations. + :param call_logs_df: DataFrame of call logs to append to. + :param num_calls: Number of calls to generate among non-affiliated individuals. + :return: Updated DataFrame with non-affiliated call logs. 
+ """ + # Filter for non-affiliated individuals + non_affiliated_people = people_df[people_df['affiliation'] == 'None'] + + # Generate call logs + for _ in range(num_calls): + if len(non_affiliated_people) > 1: + caller, callee = non_affiliated_people.sample(n=2, replace=False)['phone_number'].values + self.add_call_log(call_logs_df, caller, callee, start_date) + + return call_logs_df + + def generate_affiliated_call_logs( + self, + people_df: pd.DataFrame, + call_logs_df: pd.DataFrame, + num_affiliated_calls: int = 100, + leader_call_percentage: float = 0.05, + start_date: str = '-1y' + ) -> pd.DataFrame: + """ + Generate call logs with a focus on gang affiliations, including both intra-gang and inter-gang communications. + + :param people_df: DataFrame of people with affiliations. + :param call_logs_df: Existing DataFrame of call logs to append to. + :param num_affiliated_calls: Number of additional affiliated calls to generate. + :param leader_call_percentage: Percentage of calls that should be between gang leaders (inter-gang calls). + :return: Updated DataFrame with affiliated call logs. 
+ """ + affiliated_people = people_df[people_df['affiliation'] != 'None'] + affiliated_groups = affiliated_people['affiliation'].unique() + + leader_calls = int(num_affiliated_calls * leader_call_percentage) + gang_calls = num_affiliated_calls - leader_calls + + # Generate intra-gang calls + for _ in range(gang_calls): + gang = self.random.choice(affiliated_groups) + gang_members = affiliated_people[affiliated_people['affiliation'] == gang] + + if len(gang_members) > 1: + caller, callee = gang_members.sample(n=2, replace=False)['phone_number'].values + self.add_call_log(call_logs_df, caller, callee, start_date) + + # Generate inter-gang calls (leader calls) + for _ in range(leader_calls): + gangs = self.random.sample(list(affiliated_groups), 2) + for gang in gangs: + gang_leader = affiliated_people[affiliated_people['affiliation'] == gang].sample(n=1)['phone_number'].values[0] + if gang == gangs[0]: + caller = gang_leader + else: + callee = gang_leader + self.add_call_log(call_logs_df, caller, callee, start_date) + + return call_logs_df + + def add_call_log( + self, + call_logs_df: pd.DataFrame, + caller: str, + callee: str, + start_date: str + ) -> pd.DataFrame: + """ + Helper function to add a call log entry to the DataFrame. 
+ """ + call_date = self.fake.date_time_between(start_date=start_date) + call_time = call_date + timedelta(hours=self.random.randint(0, 23), minutes=self.random.randint(0, 59), seconds=self.random.randint(0, 59)) + duration = self.random.randint(1, 3600) # Duration in seconds, from 1 sec to 1 hour + + new_entry = pd.DataFrame([{ + "caller": caller, + "callee": callee, + "call_date": call_date.strftime('%Y-%m-%d'), + "call_time": call_time.strftime('%H:%M:%S'), + "duration_sec": duration + }]) + + return pd.concat([call_logs_df, new_entry], ignore_index=True) + + def generate_affiliations( + self, + people_df: pd.DataFrame, + percentage_affiliated: float = 0.1, + lambda_param: float = 1.5 + ) -> pd.DataFrame: + """ + Generate affiliations for a subset of the provided DataFrame of people. + + :param people_df: DataFrame of people. + :param percentage_affiliated: Approximate percentage of people to have affiliations. + :param lambda_param: Lambda parameter for the exponential distribution, controlling affiliation spread. + :return: Updated DataFrame with an 'affiliation' column. 
+ """ + num_people = len(people_df) + num_affiliated = int(num_people * percentage_affiliated) + + # Determine number of people affiliated with each group, ensuring sum equals num_affiliated + affiliation_counts = np.random.exponential(lambda_param, len(self.affiliations)) + affiliation_counts = np.round((affiliation_counts / affiliation_counts.sum()) * num_affiliated).astype(int) + + # Adjust in case rounding errors cause a mismatch in total counts + while affiliation_counts.sum() != num_affiliated: + if affiliation_counts.sum() > num_affiliated: + affiliation_counts[np.argmax(affiliation_counts)] -= 1 + else: + affiliation_counts[np.argmin(affiliation_counts)] += 1 + # Assign affiliations to randomly selected people + people_df['affiliation'] = 'None' + already_selected = set() + for count, affiliation in zip(affiliation_counts, self.affiliations): + eligible_indices = [i for i in range(num_people) if i not in already_selected] + selected_indices = self.random.sample(eligible_indices, count) + people_df.loc[selected_indices, 'affiliation'] = affiliation + already_selected.update(selected_indices) + + return people_df + + def assign_whereabouts_to_people( + self, + people_df: pd.DataFrame, + addresses_df: pd.DataFrame, + percent_cohabitating: float = 0.2 + ) -> pd.DataFrame: + # Initially, each person gets a unique address by default (if enough addresses) + if len(addresses_df) >= len(people_df): + people_df = pd.concat([people_df, addresses_df.sample(len(people_df)).reset_index(drop=True)], axis=1) + else: + raise ValueError("Not enough addresses to assign to each person uniquely.") + + # Identify gang-affiliated individuals for potential cohabitation + affiliated_groups = people_df[people_df['affiliation'] != 'None']['affiliation'].unique() + + for gang in affiliated_groups: + gang_members = people_df[people_df['affiliation'] == gang] + # Decide on how many addresses to group gang members at (e.g., 20% of gang members share addresses) + num_addresses = 
int(len(gang_members) * percent_cohabitating) + shared_addresses = addresses_df.sample(num_addresses) + + for idx, address in shared_addresses.iterrows(): + # Randomly select gang members to live together + members_to_live_together = gang_members.sample(n=2 if len(gang_members) > 1 else 1) # At least 2 if possible + for _, member in members_to_live_together.iterrows(): + people_df.loc[member.name, ['address1', 'address2', 'city', 'state', 'zip', 'lat', 'lon', 'date']] = address[['address1', 'address2', 'city', 'state', 'zip', 'lat', 'lon', 'date']] + + # Remove the selected members to avoid reselection + gang_members = gang_members.drop(members_to_live_together.index) + + return people_df + + def expand_cases_to_columns(self, people_df: pd.DataFrame) -> pd.DataFrame: + # Create columns for case details + max_crimes_per_case = 3 # Adjust based on your dataset + for i in range(max_crimes_per_case): + people_df[f'case_number_{i+1}'] = None + for j in range(max_crimes_per_case): + people_df[f'crime_{i+1}_{j+1}'] = None + + for index, row in people_df.iterrows(): + for i, case in enumerate(row['cases']): + if i < max_crimes_per_case: + people_df.at[index, f'case_number_{i+1}'] = case['case_number'] + for j, crime in enumerate(case['crimes']): + if j < max_crimes_per_case: + people_df.at[index, f'crime_{i+1}_{j+1}'] = crime + + # Drop the original 'cases' column if no longer needed + # people_df.drop('cases', axis=1, inplace=True) + + return people_df + + def generate_and_assign_criminal_records( + self, + people_df: pd.DataFrame, + max_cases_per_person: int = 3 + ) -> pd.DataFrame: + unique_case_number = count(start=1000, step=1) # Unique case number generator + criminal_records = [] # To collect criminal record entries + gang_related_cases = {} # To track gang-related case numbers and crimes + + for index, person in people_df.iterrows(): + num_cases = self.random.randint(0, max_cases_per_person) # Decide how many cases, if any + records_for_person = {"person_id": 
index, "cases": []} + + for _ in range(num_cases): + # Determine if this case is shared (for gang members) or unique + if person['affiliation'] != 'None' and gang_related_cases.get(person['affiliation']) and self.random.random() < 0.3: + # Share an existing case + shared_case = self.random.choice(gang_related_cases[person['affiliation']]) + records_for_person["cases"].append(shared_case) + else: + # Create a new case with 1 or more crimes + case_num = next(unique_case_number) + crimes_in_case = self.random.sample(self.crimes, self.random.randint(1, min(3, len(self.crimes)))) # Up to 3 crimes per case, adjust as needed + new_case = {"case_number": case_num, "crimes": crimes_in_case} + records_for_person["cases"].append(new_case) + + # If gang-affiliated, add this case to the gang's record for potential sharing + if person['affiliation'] != 'None': + if person['affiliation'] not in gang_related_cases: + gang_related_cases[person['affiliation']] = [] + gang_related_cases[person['affiliation']].append(new_case) + + criminal_records.append(records_for_person) + + # Convert to DataFrame and merge + criminal_records_df = pd.DataFrame(criminal_records) + people_df = pd.merge(people_df, criminal_records_df, how='left', left_index=True, right_on='person_id') + people_df.drop('person_id', axis=1, inplace=True) + + # Handle individuals with no criminal records + people_df['cases'] = people_df['cases'].apply(lambda x: x if isinstance(x, list) and len(x) > 0 else []) + + return self.expand_cases_to_columns(people_df) \ No newline at end of file diff --git a/demos/data/scripts/generator/domains.txt b/demos/data/scripts/generator/domains.txt new file mode 100644 index 0000000000..b728c28628 --- /dev/null +++ b/demos/data/scripts/generator/domains.txt @@ -0,0 +1,98 @@ +@gmail.com +@yahoo.com +@hotmail.com +@aol.com +@hotmail.co.uk +@hotmail.fr +@msn.com +@yahoo.fr +@wanadoo.fr +@orange.fr +@comcast.net +@yahoo.co.uk +@yahoo.com.br +@yahoo.co.i +@live.com +@rediffmail.com +@free.fr 
+@gmx.de +@web.de +@yandex.ru +@ymail.com +@libero.it +@outlook.com +@uol.com.br +@bol.com.br +@mail.ru +@cox.net +@hotmail.it +@sbcglobal.net +@sfr.fr +@live.fr +@verizon.net +@live.co.uk +@googlemail.co +@yahoo.eu +@ig.com.br +@live.nl +@bigpond.com +@terra.com.br +@yahoo.itdomains +@alice.it +@rocketmail.com +@att.net +@laposte.net +@facebook.com +@bellsouth.net +@yahoo.in +@hotmail.es +@charter.net +@yahoo.ca +@yahoo.com.au +@rambler.ru +@hotmail.de +@tiscali.i +@shaw.co +@yahoo.co.jp +@sky.co +@earthlink.net +@optonline.net +@freenet.de +@t-online.de +@aliceadsl.fr +@virgilio.it +@home.nl +@qq.com +@telenet.be +@me.com +@yahoo.com.ar +@tiscali.co.uk +@yahoo.com.mx +@voila.fr +@gmx.net +@mail.com +@planet.nl +@tin.it +@live.it +@ntlworld.com +@arcor.de +@yahoo.co.id +@frontiernet.net +@hetnet.nl +@live.com.au +@yahoo.com.sg +@zonnet.nl +@club-internet.fr +@juno.com +@optusnet.com.au +@blueyonder.co.uk +@bluewin.ch +@skynet.be +@sympatico.ca +@windstream.net +@mac.com +@centurytel.net +@chello.nl +@live.ca +@aim.com +@bigpond.net.au From bec3343c6355a891c344dfd5c419220a024dc35a Mon Sep 17 00:00:00 2001 From: webcoderz <19884161+webcoderz@users.noreply.github.com> Date: Tue, 26 Mar 2024 13:02:06 -0400 Subject: [PATCH 2/9] adding doctrings --- demos/data/scripts/generator/PersonGen.py | 50 +++++++++++++++++++++-- 1 file changed, 47 insertions(+), 3 deletions(-) diff --git a/demos/data/scripts/generator/PersonGen.py b/demos/data/scripts/generator/PersonGen.py index 0a676827a2..34ec25f3c0 100644 --- a/demos/data/scripts/generator/PersonGen.py +++ b/demos/data/scripts/generator/PersonGen.py @@ -53,6 +53,13 @@ def generate_people( max_age: int = 85 ) -> pd.DataFrame: + """ + Generate a set of people records with basic information. + :param num_records: Number of records to generate. + :param min_age: Minimum age for date of birth generation. + :param max_age: Maximum age for date of birth generation. + :return: DataFrame of people records. 
+ """ records = [] for _ in range(num_records): gender = ["M", "F"] @@ -80,6 +87,13 @@ def generate_addresses( end_date: str = "today" ) -> pd.DataFrame: + """ + Generate addresses for a set of people, simulating a history of addresses. + :param num_records: Number of addresses to generate. + :param start_date: Start date for address history. + :param end_date: End date for address history. + :return: DataFrame of addresses. + """ records = [] for _ in range(num_records): address = self.address() @@ -104,7 +118,13 @@ def generate_call_logs( num_logs: int = 500, start_date: str = '-1y' ) -> pd.DataFrame: - + """ + Generate call logs for a set of people, simulating everyday calls. + :param people_df: DataFrame of people with affiliations. + :param num_logs: Number of call logs to generate. + :param start_date: Start date for call logs. + :return: DataFrame of call logs. + """ call_logs = [] phone_numbers = people_df['phone_number'].tolist() @@ -263,6 +283,15 @@ def assign_whereabouts_to_people( addresses_df: pd.DataFrame, percent_cohabitating: float = 0.2 ) -> pd.DataFrame: + """ + Assign addresses to people, ensuring that gang-affiliated individuals may share addresses. + + :param people_df: DataFrame of people. + :param addresses_df: DataFrame of addresses. + :param percent_cohabitating: Percentage of gang-affiliated individuals who share addresses. + :return: Updated DataFrame with address details. + """ + # Initially, each person gets a unique address by default (if enough addresses) if len(addresses_df) >= len(people_df): people_df = pd.concat([people_df, addresses_df.sample(len(people_df)).reset_index(drop=True)], axis=1) @@ -288,8 +317,15 @@ def assign_whereabouts_to_people( gang_members = gang_members.drop(members_to_live_together.index) return people_df - + def expand_cases_to_columns(self, people_df: pd.DataFrame) -> pd.DataFrame: + """ + Helper function to expand the 'cases' column into multiple columns for case details. 
+ + :param people_df: DataFrame of people with 'cases' column. + :return: Updated DataFrame with expanded case details. + """ + # Create columns for case details max_crimes_per_case = 3 # Adjust based on your dataset for i in range(max_crimes_per_case): @@ -307,7 +343,7 @@ def expand_cases_to_columns(self, people_df: pd.DataFrame) -> pd.DataFrame: # Drop the original 'cases' column if no longer needed # people_df.drop('cases', axis=1, inplace=True) - + return people_df def generate_and_assign_criminal_records( @@ -315,6 +351,14 @@ def generate_and_assign_criminal_records( people_df: pd.DataFrame, max_cases_per_person: int = 3 ) -> pd.DataFrame: + + """ + Generate criminal records for a subset of the provided DataFrame of people. + + :param people_df: DataFrame of people with affiliations. + :param max_cases_per_person: Maximum number of cases to generate for each person. + :return: Updated DataFrame with criminal records. + """ unique_case_number = count(start=1000, step=1) # Unique case number generator criminal_records = [] # To collect criminal record entries gang_related_cases = {} # To track gang-related case numbers and crimes From 2f564bee64f0c57e2c5818d02bf3db7ea17c0609 Mon Sep 17 00:00:00 2001 From: webcoderz <19884161+webcoderz@users.noreply.github.com> Date: Tue, 26 Mar 2024 13:19:16 -0400 Subject: [PATCH 3/9] adding state and zipcode address generation --- demos/data/scripts/generator/PersonGen.py | 35 ++++++++++++++++++++--- 1 file changed, 31 insertions(+), 4 deletions(-) diff --git a/demos/data/scripts/generator/PersonGen.py b/demos/data/scripts/generator/PersonGen.py index 34ec25f3c0..d7227b09e3 100644 --- a/demos/data/scripts/generator/PersonGen.py +++ b/demos/data/scripts/generator/PersonGen.py @@ -41,7 +41,7 @@ def __init__( self.names = NameDataset() self.first_names = self.names.get_top_names(n=self.people_num, country_alpha2=self.country)[self.country] self.last_names = self.names.get_top_names(n=self.people_num, country_alpha2=self.country, 
use_first_names=False)[self.country] - self.address = real_random_address.RandomAddress() + self.domains = pd.read_csv("domains.txt", header=None)[0].to_list() self.affiliations = affiliations self.crimes = crimes @@ -80,23 +80,50 @@ def generate_people( df = pd.DataFrame(records) return df + def get_address(self) -> dict: + # Placeholder for your existing `self.address()` method + return real_random_address.RandomAddress() + + def get_address_by_state(self, state: str) -> dict: + # Placeholder for generating an address by state + # Implement actual functionality here + return real_random_address.real_random_address_by_state(state) + + + def get_address_by_postal_code(self, postal_code: str) -> dict: + # Placeholder for generating an address by postal code + return real_random_address.real_random_address_by_postal_code(postal_code) + def generate_addresses( self, num_records: int = 100, start_date: str = "-30y", - end_date: str = "today" + end_date: str = "today", + state: str = None, + postal_code: str = None ) -> pd.DataFrame: - """ Generate addresses for a set of people, simulating a history of addresses. :param num_records: Number of addresses to generate. :param start_date: Start date for address history. :param end_date: End date for address history. + :param state: Optional state to generate addresses for. + :param postal_code: Optional postal code to generate addresses for. :return: DataFrame of addresses. + :raises ValueError: If both state and postal_code are provided. """ + + if state and postal_code: + raise ValueError("Cannot specify both state and postal code. 
Please choose one.") + records = [] for _ in range(num_records): - address = self.address() + if state: + address = self.get_address_by_state(state) + elif postal_code: + address = self.get_address_by_postal_code(postal_code) + else: + address = self.get_address() record = { "address1": address.get('address1', ''), From 9919a12db26326e61c3221185d3f6c23fff3ef59 Mon Sep 17 00:00:00 2001 From: webcoderz <19884161+webcoderz@users.noreply.github.com> Date: Tue, 26 Mar 2024 13:25:26 -0400 Subject: [PATCH 4/9] adding class doctring --- demos/data/scripts/generator/PersonGen.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/demos/data/scripts/generator/PersonGen.py b/demos/data/scripts/generator/PersonGen.py index d7227b09e3..d91def8ad7 100644 --- a/demos/data/scripts/generator/PersonGen.py +++ b/demos/data/scripts/generator/PersonGen.py @@ -30,6 +30,18 @@ def __init__( ] ): + """ + PersonGenerator: + A class to generate synthetic person records; + including basic information, addresses, call logs, affiliations, and criminal records. + + :param seed: Seed for random number generation. + :param country: Country code for phone number generation. + :param people_amt: Number of people to generate. + :param affiliations: List of affiliations to generate. + :param crimes: List of crimes to generate. 
+ """ + self.seed = seed self.country = country self.people_num = people_amt From e180e9d40004af43301e015d450f876cd4c9b8ff Mon Sep 17 00:00:00 2001 From: webcoderz <19884161+webcoderz@users.noreply.github.com> Date: Tue, 26 Mar 2024 13:50:49 -0400 Subject: [PATCH 5/9] adding data synth libs to setup.py --- setup.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index c81db1b09c..425f68eb4d 100755 --- a/setup.py +++ b/setup.py @@ -43,19 +43,28 @@ def unique_flatten_dict(d): 'jupyter': ['ipython'], } +base_extras_data = { + 'names-dataset': ['names-dataset'], + 'Faker': ['Faker'], + 'random-address': ['random-address'], + 'phone-gen': ['phone-gen'], +} + base_extras_heavy = { 'umap-learn': ['umap-learn', 'dirty-cat==0.2.0', 'scikit-learn>=1.0'], } # https://github.com/facebookresearch/faiss/issues/1589 for faiss-cpu 1.6.1, #'setuptools==67.4.0' removed base_extras_heavy['ai'] = base_extras_heavy['umap-learn'] + ['scipy', 'dgl', 'torch<2', 'sentence-transformers', 'faiss-cpu', 'joblib'] -base_extras = {**base_extras_light, **base_extras_heavy} + +base_extras = {**base_extras_light, **base_extras_heavy, **base_extras_data} extras_require = { **base_extras_light, **base_extras_heavy, **dev_extras, + **base_extras_data, #kitchen sink for users -- not recommended 'all': unique_flatten_dict(base_extras), @@ -63,6 +72,8 @@ def unique_flatten_dict(d): #kitchen sink for contributors, skips ai 'dev': unique_flatten_dict(base_extras_light) + unique_flatten_dict(dev_extras), + #for people data synthesizer + 'data': unique_flatten_dict(base_extras_data), } setup( From d5ea8a0b86b43851c80da2ac2578740b1d36d462 Mon Sep 17 00:00:00 2001 From: webcoderz <19884161+webcoderz@users.noreply.github.com> Date: Mon, 8 Apr 2024 11:18:28 -0400 Subject: [PATCH 6/9] git mv and fix setup.py --- .../scripts/generator/{PersonGen.py => PersonGenerator.py} | 0 setup.py | 6 ++---- 2 files changed, 2 insertions(+), 4 deletions(-) rename 
demos/data/scripts/generator/{PersonGen.py => PersonGenerator.py} (100%) diff --git a/demos/data/scripts/generator/PersonGen.py b/demos/data/scripts/generator/PersonGenerator.py similarity index 100% rename from demos/data/scripts/generator/PersonGen.py rename to demos/data/scripts/generator/PersonGenerator.py diff --git a/setup.py b/setup.py index 425f68eb4d..5f4eea678d 100755 --- a/setup.py +++ b/setup.py @@ -44,12 +44,10 @@ def unique_flatten_dict(d): } base_extras_data = { - 'names-dataset': ['names-dataset'], - 'Faker': ['Faker'], - 'random-address': ['random-address'], - 'phone-gen': ['phone-gen'], + 'data-gen': ['names-dataset', 'faker', 'random-address','phone-gen'] } + base_extras_heavy = { 'umap-learn': ['umap-learn', 'dirty-cat==0.2.0', 'scikit-learn>=1.0'], } From 3eb89fb69903200d81a9ff0adbeb5c922fdad1ba Mon Sep 17 00:00:00 2001 From: webcoderz <19884161+webcoderz@users.noreply.github.com> Date: Mon, 8 Apr 2024 13:47:34 -0400 Subject: [PATCH 7/9] minor fixes --- demos/data/scripts/generator/PersonGenerator.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/demos/data/scripts/generator/PersonGenerator.py b/demos/data/scripts/generator/PersonGenerator.py index d91def8ad7..11a86d1ffa 100644 --- a/demos/data/scripts/generator/PersonGenerator.py +++ b/demos/data/scripts/generator/PersonGenerator.py @@ -88,22 +88,17 @@ def generate_people( } record["email_address"] = record["first_name"] + record["last_name"] + str(self.random.randint(0, 999)) + self.random.choice(self.domains) records.append(record) - + df = pd.DataFrame(records) return df def get_address(self) -> dict: - # Placeholder for your existing `self.address()` method return real_random_address.RandomAddress() def get_address_by_state(self, state: str) -> dict: - # Placeholder for generating an address by state - # Implement actual functionality here return real_random_address.real_random_address_by_state(state) - def get_address_by_postal_code(self, postal_code: str) -> 
dict: - # Placeholder for generating an address by postal code return real_random_address.real_random_address_by_postal_code(postal_code) def generate_addresses( From 9ab127cefc14cc55c0e4057c9fa4260e6c30e23e Mon Sep 17 00:00:00 2001 From: webcoderz <19884161+webcoderz@users.noreply.github.com> Date: Wed, 10 Apr 2024 10:16:50 -0400 Subject: [PATCH 8/9] fix real random address import --- demos/data/scripts/generator/PersonGenerator.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/demos/data/scripts/generator/PersonGenerator.py b/demos/data/scripts/generator/PersonGenerator.py index 11a86d1ffa..26bf21c419 100644 --- a/demos/data/scripts/generator/PersonGenerator.py +++ b/demos/data/scripts/generator/PersonGenerator.py @@ -1,7 +1,7 @@ import pandas as pd import random from faker import Faker -from random_address import real_random_address +import random_address from phone_gen import PhoneNumber from datetime import timedelta from datetime import datetime @@ -51,6 +51,7 @@ def __init__( self.random = random.Random(self.seed) self.phone = PhoneNumber(self.country) self.names = NameDataset() + self.address = random_address self.first_names = self.names.get_top_names(n=self.people_num, country_alpha2=self.country)[self.country] self.last_names = self.names.get_top_names(n=self.people_num, country_alpha2=self.country, use_first_names=False)[self.country] @@ -93,13 +94,13 @@ def generate_people( return df def get_address(self) -> dict: - return real_random_address.RandomAddress() + return self.address.real_random_address() def get_address_by_state(self, state: str) -> dict: - return real_random_address.real_random_address_by_state(state) + return self.address.real_random_address_by_state(state) def get_address_by_postal_code(self, postal_code: str) -> dict: - return real_random_address.real_random_address_by_postal_code(postal_code) + return self.address.real_random_address_by_postal_code(postal_code) def generate_addresses( self, From 
d0483f6bb040e3abbaeaff70125ee68585be25a1 Mon Sep 17 00:00:00 2001 From: webcoderz <19884161+webcoderz@users.noreply.github.com> Date: Mon, 17 Jun 2024 10:55:16 -0400 Subject: [PATCH 9/9] updating the crime network generator seperated the profile generation out into a seperate module that is now a faker factory --- .../generator/CrimeNetworkGenerator.py | 698 ++++++++++++++++++ .../data/scripts/generator/PersonGenerator.py | 434 ----------- .../scripts/generator/ProfileGenerator.py | 89 +++ setup.py | 2 +- 4 files changed, 788 insertions(+), 435 deletions(-) create mode 100644 demos/data/scripts/generator/CrimeNetworkGenerator.py delete mode 100644 demos/data/scripts/generator/PersonGenerator.py create mode 100644 demos/data/scripts/generator/ProfileGenerator.py diff --git a/demos/data/scripts/generator/CrimeNetworkGenerator.py b/demos/data/scripts/generator/CrimeNetworkGenerator.py new file mode 100644 index 0000000000..aaa06a9130 --- /dev/null +++ b/demos/data/scripts/generator/CrimeNetworkGenerator.py @@ -0,0 +1,698 @@ +import pandas as pd +import numpy as np +from sklearn.datasets import make_blobs +import factory.random +from datetime import datetime, timedelta +from ProfileGenerator import ProfileFactory +from scipy.spatial import cKDTree +from itertools import count +import graphistry + + +class PersonNetworkGenerator: + def __init__( + self, + n_kingpins: int = 4, + dealers_per_kingpin: int = 5, + users_per_dealer: int = 3, + dealer_normal_connections: int = 4, + kingpin_normal_connections: int = 3, + within_group_connections: int = 4, + random_connections: int = 3, + max_calls_per_edge: int = 11, + affiliations: list = ['Gang Alpha', 'Cartel Beta', 'Gang Gamma', 'Cartel Delta'], + crimes: list = [ + "Armed Robbery", + "Burglary", + "Drug Trafficking", + "Vandalism", + "Assault", + "Money Laundering", + "Fraud", + "Homicide", + ], + max_crimes_per_case: int = 3, + max_cases_per_person: int = 3, + n_normal: int = 1000, + postal_code: int = None, + state: str 
= None, + call_start_date: str = "2022-1-1", + call_end_date: str = "2023-12-31", + max_num_whereabouts: int = 4, + leader_to_leader_call_chance: float = 0.05, + shared_case_percentage: float = 0.3, + ): + + self.n_kingpins = n_kingpins + self.dealers_per_kingpin = dealers_per_kingpin + self.users_per_dealer = users_per_dealer + self.dealer_normal_connections = dealer_normal_connections + self.kingpin_normal_connections = kingpin_normal_connections + self.within_group_connections = within_group_connections + self.random_connections = random_connections + self.n_normal = n_normal + self.node_df = None + self.edge_df = None + self.labels = None + self.seed = 42 + np.random.seed(self.seed) + factory.random.reseed_random(self.seed) + self.postal_code = postal_code + self.state = state + self.affiliations = affiliations + self.crimes = crimes + self.max_crimes_per_case = max_crimes_per_case + self.max_cases_per_person = max_cases_per_person + self.max_calls_per_edge = max_calls_per_edge + self.call_start_date = call_start_date + self.call_end_date = call_end_date + self.max_num_whereabouts = max_num_whereabouts + self.leader_to_leader_call_chance = leader_to_leader_call_chance + self.shared_case_percentage = shared_case_percentage + + #NETWORK GENERATION + def generate_network(self): + # Generate clusters for kingpins, dealers, and users + X_kingpins, _ = make_blobs( + n_samples=self.n_kingpins, + centers=self.n_kingpins, + cluster_std=1.0, + random_state=self.seed + ) + + X_dealers, _ = make_blobs( + n_samples=self.dealers_per_kingpin * self.n_kingpins, + centers=X_kingpins, + cluster_std=2.5, + random_state=self.seed + ) + + X_users, _ = make_blobs( + n_samples=self.users_per_dealer * self.dealers_per_kingpin * self.n_kingpins, + centers=X_dealers, + cluster_std=3.5, + random_state=self.seed + ) + + X_normal = np.random.rand(self.n_normal, 2) * 100 # Normal people data + + # Combine all data + points = np.vstack([X_kingpins, X_dealers, X_users, X_normal]) + 
self.labels = ['kingpin']*self.n_kingpins + \ + ['dealer']*self.dealers_per_kingpin*self.n_kingpins + \ + ['user']*self.dealers_per_kingpin*self.n_kingpins*self.users_per_dealer + \ + ['normal']*self.n_normal + + # Create DataFrame for nodes + self.node_df = pd.DataFrame(points, columns=['x', 'y']) + self.node_df['node_id'] = range(len(self.node_df)) + self.node_df['type'] = self.labels + + # Assign personal details + self.assign_personal_details(self.postal_code, self.state, self.max_num_whereabouts) + + # assign affiliations + self.assign_affiliations() + + # Generate criminal records + self.generate_and_assign_criminal_records() + + # Generate edges + self.generate_edges() + + # Generate call logs + self.generate_and_assign_call_logs(self.call_start_date, self.call_end_date) + + def calculate_nearest_kingpin(self): + # Extract coordinates for kingpins and dealers + kingpin_coords = self.node_df[self.node_df['type'] == 'kingpin'][['x', 'y']].to_numpy() + dealer_coords = self.node_df[self.node_df['type'] == 'dealer'][['x', 'y']].to_numpy() + + # Find nearest kingpin index for each dealer + nearest_kingpin_indices = self.find_nearest_kingpin_index(dealer_coords, kingpin_coords) + + # Map nearest kingpin indices back to the original DataFrame indices of kingpins + kingpin_df_indices = self.node_df[self.node_df['type'] == 'kingpin'].index.to_numpy() + mapped_kingpin_indices = kingpin_df_indices[nearest_kingpin_indices] + + # Assign the mapped kingpin indices to dealers in the DataFrame + self.node_df.loc[self.node_df['type'] == 'dealer', 'nearest_kingpin_index'] = mapped_kingpin_indices + + def find_nearest_kingpin_index( + self, + dealer_coords: np.array, + kingpin_coords: np.array + ) -> np.array: + # Create a KD-tree for kingpin locations + tree = cKDTree(kingpin_coords) + + # Query the tree for the nearest kingpin to each dealer + # 'query' returns a tuple where the first element is the distance + # and the second element is the index of the nearest kingpin in 
the tree + _, nearest_kingpin_indices = tree.query(dealer_coords, k=1) + + return nearest_kingpin_indices + + def ensure_kingpin_dealer_connectivity(self) -> list: + edge_list = [] + kingpins = self.node_df[self.node_df['type'] == 'kingpin'] + for kingpin_index in kingpins.index: + affiliated_dealers = self.node_df[(self.node_df['type'] == 'dealer') & (self.node_df['affiliation'] == self.node_df.at[kingpin_index, 'affiliation'])].index + # Ensure each kingpin has connections to dealers + if not affiliated_dealers.empty: + selected_dealers = np.random.choice(affiliated_dealers, size=min(3, len(affiliated_dealers)), replace=False) + for dealer_index in selected_dealers: + edge_list.append((kingpin_index, dealer_index)) + return edge_list + + def connect_dealers_to_users(self) -> list: + edge_list = [] + dealers = self.node_df[self.node_df['type'] == 'dealer'] + users = self.node_df[self.node_df['type'] == 'user'].index + for dealer_index in dealers.index: + # Select a random number of users to connect with each dealer + num_connections = self.users_per_dealer # For example, each dealer connects with 2 to 4 users + selected_users = np.random.choice(users, size=num_connections, replace=False) + for user_index in selected_users: + edge_list.append((dealer_index, user_index)) + return edge_list + + def connect_within_group(self) -> list: + # Exclude kingpins and normal individuals for within-group connections + group_nodes = self.node_df[~self.node_df['type'].isin(['kingpin', 'normal'])] + + # Group by affiliation and type + grouped = group_nodes.groupby(['affiliation', 'type']) + + # Initialize an empty list to store edges + edge_list = [] + + # Iterate over each group + for name, group in grouped: + # Generate connections for each node in the group + for node_index in group.index: + # Identify potential connections within the same affiliation and type + potential_connections = group.index[group.index != node_index] + # Randomly select a subset for connections + 
num_connections = np.random.randint(1, self.within_group_connections) # Adjust numbers as needed + if not potential_connections.empty: + selected_connections = np.random.choice(potential_connections, size=min(len(potential_connections), num_connections), replace=False) + # Add connections to the edge list + edge_list.extend([(node_index, connection) for connection in selected_connections]) + + return edge_list + + def connect_randomly(self) -> list: + # Decide randomly if a node should form random connections + nodes_to_connect = self.node_df.index[np.random.rand(len(self.node_df)) < 0.1] + + # Function to generate random connections for a node + def generate_random_connections(node): + # Exclude self-connections + potential_connections = self.node_df.index[self.node_df.index != node] + num_connections = np.random.randint(1, self.random_connections) # Adjust numbers as needed + selected_connections = np.random.choice(potential_connections, size=min(len(potential_connections), num_connections), replace=False) + return [(node, connection) for connection in selected_connections] + + # Generate random connections for each selected node + edge_list = [edge for node in nodes_to_connect for edge in generate_random_connections(node)] + + return edge_list + + def connect_to_normals(self) -> list: + # Define which roles should have connections to normal individuals + roles_with_normal_connections = ['kingpin', 'dealer'] + + # Filter the DataFrame for normal individuals + normal_people = self.node_df[self.node_df['type'] == 'normal'].index + + # Filter the DataFrame for nodes that should have connections to normal individuals + nodes_to_connect = self.node_df[self.node_df['type'].isin(roles_with_normal_connections)] + + # Generate connections for each node + connections = nodes_to_connect.apply(lambda row: self.generate_normal_connections(row, normal_people), axis=1) + + # Flatten the list of connections + edge_list = [item for sublist in connections for item in sublist] + + 
return edge_list + + def generate_normal_connections( + self, + node_row: pd.DataFrame, + normal_people: pd.DataFrame.index + ) -> list: + + # Determine the number of normal connections (e.g., 1-3 for kingpins, 1-2 for dealers) + if node_row['type'] == 'kingpin': + num_connections = np.random.randint(1, self.kingpin_normal_connections) # Kingpins have 1 to 3 normal connections + else: # Dealers + num_connections = np.random.randint(1, self.dealer_normal_connections) # Dealers have 1 to 10 normal connections + + # Select random normal individuals to connect with + selected_normals = np.random.choice(normal_people, size=num_connections, replace=False) + + # Return a list of connections for the node + return [(node_row.name, normal_index) for normal_index in selected_normals] + + def generate_edges(self): + edge_list = [] + + # Initial connections based on affiliations and roles + # Ensure kingpin-dealer connectivity and dealer-user connections + edge_list.extend(self.ensure_kingpin_dealer_connectivity()) + + edge_list.extend(self.connect_dealers_to_users()) + + # Within-group connections + edge_list.extend(self.connect_within_group()) + + # Random connections across the network + edge_list.extend(self.connect_randomly()) + + # Connect kingpins and dealers to normal individuals + edge_list.extend(self.connect_to_normals()) + + # Convert edge list to DataFrame + self.edge_df = pd.DataFrame(edge_list, columns=['src', 'target']) + + def assign_personal_details( + self, + postal_code: str, + state: str, + max_num_whereabouts: int + ) -> None: + + details_df = self.generate_details( + num_records=len(self.node_df), + postal_code=postal_code, + state=state, + num_whereabouts=max_num_whereabouts + ) + self.node_df = pd.concat([self.node_df, details_df], axis=1) + return self.expand_whereabouts_to_columns() + + def flatten_dict(self, d: dict) -> dict: + items = [] + for key, value in d.items(): + if isinstance(value, dict): + items.extend(self.flatten_dict(value).items()) + 
else: + items.append((key, value)) + return dict(items) + + #PROFILE GENERATION + def generate_details( + self, + num_records: int, + postal_code: str, + state: str, + num_whereabouts: int + ) -> pd.DataFrame: + + return pd.DataFrame([self.flatten_dict(profile.to_dict()) for profile in ProfileFactory.create_batch(num_records, postal_code=postal_code, state=state, num_whereabouts=num_whereabouts)]) + + def expand_whereabouts_to_columns(self): + max_whereabouts = self.max_num_whereabouts + + # Create a temporary DataFrame from the 'whereabouts' series + whereabouts_df = self.node_df['whereabouts'].apply(pd.Series) + + # Iterate over the number of whereabouts + for i in range(max_whereabouts): + # Extract whereabouts details for each whereabouts + whereabouts_details_df = whereabouts_df[i].apply(pd.Series) + + # Assign address, from_date, to_date, and other details to the node DataFrame + self.node_df[f'whereabouts_{i+1}_address1'] = whereabouts_details_df['address1'] + self.node_df[f'whereabouts_{i+1}_address2'] = whereabouts_details_df['address2'] + self.node_df[f'whereabouts_{i+1}_city'] = whereabouts_details_df['city'] + self.node_df[f'whereabouts_{i+1}_state'] = whereabouts_details_df['state'] + self.node_df[f'whereabouts_{i+1}_postalCode'] = whereabouts_details_df['postalCode'] + self.node_df[f'whereabouts_{i+1}_coordinates'] = whereabouts_details_df['coordinates'] + # Flatten coordinates into lat and lng + coordinates_df = whereabouts_details_df['coordinates'].apply(pd.Series) + self.node_df[f'whereabouts_{i+1}_lat'] = coordinates_df['lat'] + self.node_df[f'whereabouts_{i+1}_lng'] = coordinates_df['lng'] + # Drop the coordinates column + self.node_df.drop(f'whereabouts_{i+1}_coordinates', axis=1, inplace=True) + + self.node_df[f'whereabouts_{i+1}_from_date'] = whereabouts_details_df['from_date'] + self.node_df[f'whereabouts_{i+1}_to_date'] = whereabouts_details_df['to_date'] + + + # Drop the original 'whereabouts' column + self.node_df.drop('whereabouts', 
axis=1, inplace=True) + # Replace NaN values with None + self.node_df = self.node_df.where(pd.notnull(self.node_df), None) + + @staticmethod + def random_datetime( + year: int, + month: int, + day: int, + hour_start: int, + hour_end: int + ) -> datetime: + + start = datetime(year, month, day, hour_start) + end = datetime(year, month, day, hour_end) + return start + timedelta( + seconds=np.random.randint(0, int((end - start).total_seconds())) + ) + + def assign_affiliations(self): + # Step 1: Assign an affiliation to each kingpin + kingpins = self.node_df[self.node_df['type'] == 'kingpin'] + + shuffled_affiliations = np.random.choice( + self.affiliations, + size=len(self.affiliations), + replace=False + ).tolist() + + for i, index in enumerate(kingpins.index): + if i < len(shuffled_affiliations): + # Assign a unique affiliation to each kingpin + self.node_df.at[index, 'affiliation'] = shuffled_affiliations[i] + else: + # If there are more kingpins than affiliations, assign random affiliations to the remaining kingpins + self.node_df.at[index, 'affiliation'] = np.random.choice(self.affiliations) + + # Step 2: Calculate nearest kingpin for dealers and assign affiliations + self.calculate_nearest_kingpin() + # Ensure dealers inherit their kingpin's affiliation + self.node_df.loc[self.node_df['type'] == 'dealer', 'affiliation'] = self.node_df.loc[self.node_df['type'] == 'dealer', 'nearest_kingpin_index'].map(lambda x: self.node_df.at[x, 'affiliation']) + + # Step 3: Assign 'None' to users and normal individuals + self.node_df.loc[self.node_df['type'].isin(['user', 'normal']), 'affiliation'] = 'None' + + def generate_and_assign_criminal_records(self): + unique_case_number = count(start=1000, step=1) # Unique case number generator + gang_related_cases = {} # To track gang-related case numbers and crimes + + # Generate number of cases for each person + self.node_df['num_cases'] = np.random.randint( + 0, + self.max_cases_per_person + 1, + size=len(self.node_df) + ) + + # 
Generate cases for each person + self.node_df['cases'] = self.node_df.apply( + lambda row: [ + self.generate_case( + row, + gang_related_cases, + unique_case_number + ) + for _ in range(row['num_cases']) + ], + axis=1 + ) + + # Drop the 'num_cases' column as it's no longer needed + self.node_df.drop('num_cases', axis=1, inplace=True) + return self.expand_cases_to_columns() + + def generate_case( + self, + person: pd.DataFrame, + gang_related_cases: dict, + unique_case_number: int + ) -> dict: + # Adjusted logic for determining shared or unique cases + if person['affiliation'] != 'None' and gang_related_cases.get(person['affiliation']) and np.random.random() < self.shared_case_percentage: + shared_case = np.random.choice(gang_related_cases[person['affiliation']]) + return shared_case + else: + case_num = next(unique_case_number) + crimes_in_case = np.random.choice( + self.crimes, + np.random.randint(1, 4), + replace=False + ).tolist() + + new_case = {"case_number": case_num, "crimes": crimes_in_case} + + if person['affiliation'] != 'None': + gang_related_cases.setdefault( + person['affiliation'], + [] + ).append(new_case) + + return new_case + + def expand_cases_to_columns(self): + max_crimes_per_case = self.max_crimes_per_case # Adjust based on your dataset + + # Create a temporary DataFrame from the 'cases' series + cases_df = self.node_df['cases'].apply(pd.Series) + + # Iterate over the number of cases + for i in range(max_crimes_per_case): + # Extract case details for each case + case_details_df = cases_df[i].apply(pd.Series) + + # Assign case number and crimes to the node DataFrame + self.node_df[f'case_number_{i+1}'] = case_details_df['case_number'].astype('Int64') + self.node_df[f'case_number_{i+1}'] = self.node_df[f'case_number_{i+1}'].astype('object') + + # Extract crimes for each case and assign to the node DataFrame + crimes_df = case_details_df['crimes'].apply(pd.Series) + for j in range(max_crimes_per_case): + self.node_df[f'crime_{i+1}_{j+1}'] = 
crimes_df[j] + + # Drop the original 'cases' column + self.node_df.drop('cases', axis=1, inplace=True) + # Replace NaN values with None + self.node_df = self.node_df.where(pd.notnull(self.node_df), None) + + #CALL LOG GENERATION + def generate_phone_numbers(self): + # Assuming self.node_df exists and has been populated + self.teledict = self.node_df['phone'].to_dict() + + def generate_and_assign_call_logs(self, start_date, end_date): + # Parse date strings + start_date = datetime.strptime(start_date, '%Y-%m-%d') \ + if isinstance(start_date, str) else start_date + + end_date = datetime.strptime(end_date, '%Y-%m-%d') \ + if isinstance(end_date, str) else end_date + + # Ensure phone numbers are generated + if not hasattr(self, 'teledict'): + self.generate_phone_numbers() + + # Define a function to generate call logs for a given edge + def generate_call_logs(edge: dict) -> list: + # Check if the edge exists in self.edge_df + if edge['src'] not in self.node_df.index or edge['target'] not in self.node_df.index: + # If the edge doesn't exist, manually set the caller and callee types to 'kingpin' + caller_type = 'kingpin' + callee_type = 'kingpin' + else: + # If the edge does exist, get the caller and callee types from self.node_df + caller_type = self.node_df.loc[edge['src'], 'type'] + callee_type = self.node_df.loc[edge['target'], 'type'] + + # Determine the number of calls for this edge (e.g., 1-10) + num_calls = np.random.randint(1, self.max_calls_per_edge) + + # Assign caller and callee phone numbers + caller = self.teledict[edge['src']] + callee = self.teledict[edge['target']] + # Determine the number of calls for this edge (e.g., 1-10) + + # Check if the call is inter-gang (caller is a kingpin and callee is a dealer from diff gang) + if caller_type == 'kingpin' and callee_type == 'dealer' and self.node_df.loc[edge['src'], 'affiliation'] != self.node_df.loc[edge['target'], 'affiliation']: + call_type = 'inter-gang' + # Check if the call is inter-gang (both nodes are 
kingpins from different gangs) + elif caller_type == 'kingpin' and callee_type == 'kingpin' and self.node_df.loc[edge['src'], 'affiliation'] != self.node_df.loc[edge['target'], 'affiliation']: + call_type = 'inter-gang' + # Check if the call is intra-gang (caller is a kingpin and callee is a dealer from the same gang) + elif caller_type == 'kingpin' and callee_type == 'dealer' and self.node_df.loc[edge['src'], 'affiliation'] == self.node_df.loc[edge['target'], 'affiliation']: + call_type = 'intra-gang' + #dealer to dealer intra-gang + elif caller_type == 'dealer' and callee_type == 'dealer' and self.node_df.loc[edge['src'], 'affiliation'] == self.node_df.loc[edge['target'], 'affiliation']: + call_type = 'intra-gang' + # All other calls are non-affiliated + else: + call_type = 'non-affiliated' + + # Return a list of call logs for this edge + return [{ + 'src': edge['src'], + 'target': edge['target'], + 'caller': caller, + 'callee': callee, + 'call_time': self.random_datetime( + year=start_date.year + np.random.randint(0, (end_date - start_date).days // 365), + month=np.random.randint(1, 13), + day=np.random.randint(1, 29), + hour_start=0 if caller_type in ['user', 'normal'] else 8, + hour_end=23 if caller_type in ['user', 'normal'] else 22 + ).strftime('%Y-%m-%d %H:%M:%S'), + 'duration_minutes': np.random.randint(5, 61) if caller_type in ['user', 'normal'] else np.random.randint(1, 16), + 'call_type': call_type + } for _ in range(num_calls)] + + # Generate call logs for each edge + call_logs = self.edge_df.apply(generate_call_logs, axis=1).tolist() + + # Generate inter-gang calls between kingpins + kingpins = self.node_df[self.node_df['type'] == 'kingpin'] + kingpin_calls = [] + + for i in range(len(kingpins)): + for j in range(i + 1, len(kingpins)): + if kingpins.iloc[i]['affiliation'] != kingpins.iloc[j]['affiliation'] and np.random.random() < self.leader_to_leader_call_chance: # 5% chance of a call + edge = {'src': kingpins.index[i], 'target': kingpins.index[j]} 
# Use index here + kingpin_calls.append(generate_call_logs(edge)) + + kg_calls = pd.DataFrame(kingpin_calls) + call_logs_df = pd.DataFrame(call_logs) + # Flatten the DataFrame + flattened_df = pd.json_normalize( + call_logs_df.apply(lambda x: x.tolist(), axis=1) + .explode() + .dropna() + .tolist() + ) + + flattened_king_df = pd.json_normalize( + kg_calls.apply(lambda x: x.tolist(), axis=1) + .explode() + .dropna() + .tolist() + ) + + # Drop rows and columns that are entirely NaN + flattened_df = flattened_df \ + .dropna(axis=0, how='all') \ + .dropna(axis=1, how='all') + + flattened_king_df = flattened_king_df \ + .dropna(axis=0, how='all') \ + .dropna(axis=1, how='all') + + # Assign the flattened DataFrame to self.edge_df + self.edge_df = pd.concat([flattened_king_df, flattened_df]) + + def to_graph( + self, + size_dict: dict = None, + edge_influence: int = 7, + icon_mapping: dict = None, + color_mapping: dict = None + ) -> graphistry.plotter.Plotter: + + ndf = self.node_df.copy() + edf = self.edge_df.copy() + + edge_counts = edf.groupby(['src', 'target', 'call_type']) \ + .size() \ + .reset_index(name='weight') + + # Default size_dict if none is provided + if size_dict is None: + size_dict = {'kingpin': 200, 'dealer': 75, 'user': 50, 'normal': 25} + + ndf['size'] = ndf['type'].map(size_dict) + + # Default icon_mapping if none is provided + if icon_mapping is None: + icon_mapping = { + 'kingpin': 'user-o', + 'dealer': 'user-md', + 'user': 'users', + 'normal': 'universal-access', + } + + # Default color_mapping if none is provided + if color_mapping is None: + color_mapping = { + 'non-affiliated': 'blue', + 'intra-gang': 'red', + 'inter-gang': 'orange' + } + + g = ( + graphistry.nodes(ndf, 'node_id') + .edges(edge_counts, 'src', 'target') + .bind(point_title='type', point_size='size') + .bind(edge_weight="weight", edge_color="call_type") + .settings(url_params={'edgeInfluence': edge_influence}) + .encode_point_icon('type', categorical_mapping=icon_mapping) + 
.encode_edge_color( + 'call_type', + categorical_mapping=color_mapping, + default_mapping='#CCC' + ) + ) + + return g + + def get_dealer_to_user_edges_and_nodes( + self, + affiliated_nodes: pd.DataFrame + ) -> tuple: + # Filter the node DataFrame to only include dealers + affiliated_dealers = affiliated_nodes[affiliated_nodes['type'] == 'dealer'] + + # Join the edges and nodes dataframes on the 'target' column + edges_with_node_types = self.edge_df.merge(self.node_df[['node_id', 'type']], left_on='target', right_on='node_id', how='left') + + # Filter the joined dataframe to only include edges from dealers to users + dealer_to_user_edges_df = edges_with_node_types[(edges_with_node_types['src'].isin(affiliated_dealers['node_id'])) & (edges_with_node_types['type'] == 'user')] + + # Create the dealer to user edges + dealer_to_user_edges = dealer_to_user_edges_df[['src', 'target']].copy() + dealer_to_user_edges['role'] = 'user' + dealer_to_user_edges['affiliation'] = dealer_to_user_edges['src'].map(affiliated_dealers['affiliation']) + + # Get the user nodes + user_nodes = self.node_df[self.node_df['node_id'].isin(dealer_to_user_edges['target'])] + + return dealer_to_user_edges, user_nodes + + def to_tree(self, affiliation: str) -> graphistry.plotter.Plotter: + # Filter the node DataFrame by the specified affiliation + affiliated_nodes = self.node_df[self.node_df['affiliation'] == affiliation].copy() + affiliated_nodes.loc[:, "node_label"] = affiliated_nodes["first_name"] + " " + affiliated_nodes["last_name"] + + dealer_to_user_edges, user_nodes = self.get_dealer_to_user_edges_and_nodes(affiliated_nodes) + + user_nodes = pd.DataFrame(user_nodes) + user_nodes.loc[:, "node_label"] = user_nodes["first_name"] + " " + user_nodes["last_name"] + + # Get the kingpin node + kingpin_node = affiliated_nodes[affiliated_nodes['type'] == 'kingpin']['node_id'].values[0] + + # Add dealer nodes and edges to the dataframes based on the affiliations + dealer_nodes = 
affiliated_nodes[affiliated_nodes['type'] == 'dealer'] + dealer_edges = pd.DataFrame({ + 'src': kingpin_node, + 'target': dealer_nodes['node_id'], + 'role': dealer_nodes['type'], + 'affiliation': dealer_nodes['affiliation'] + }) + + # Add dealer to user edges to the new_edges DataFrame + new_edges = pd.concat([dealer_edges, dealer_to_user_edges]) + + # Add user nodes to the new_nodes DataFrame + new_nodes = pd.concat([affiliated_nodes, user_nodes]) + + g = graphistry.bind( + source='src', + destination='target', + node='node_id', + point_title='node_label' + ).edges(new_edges).nodes(new_nodes) + g = g.encode_point_color('type', categorical_mapping={'kingpin': 'red', 'dealer': 'blue', 'user': 'green'}, default_mapping='gray') + g = g.encode_point_icon('type', categorical_mapping={'kingpin': 'user-o', 'dealer': 'user-md', 'user': 'users'}) + g = g.settings(url_params={'play': 0, "edgeCurvature": 0.0}) + g = g.tree_layout(width=100, height=50) + return g \ No newline at end of file diff --git a/demos/data/scripts/generator/PersonGenerator.py b/demos/data/scripts/generator/PersonGenerator.py deleted file mode 100644 index 26bf21c419..0000000000 --- a/demos/data/scripts/generator/PersonGenerator.py +++ /dev/null @@ -1,434 +0,0 @@ -import pandas as pd -import random -from faker import Faker -import random_address -from phone_gen import PhoneNumber -from datetime import timedelta -from datetime import datetime -import numpy as np -from itertools import count -from names_dataset import NameDataset - - -class PersonGenerator: - - def __init__( - self, - seed: int = 0, - country: str = 'US', - people_amt: int = 100, - affiliations: list = ['Gang Alpha', 'Cartel Beta', 'Gang Gamma', 'Cartel Delta'], - crimes: list = [ - "Armed Robbery", - "Burglary", - "Drug Trafficking", - "Vandalism", - "Assault", - "Money Laundering", - "Fraud", - "Homicide", - ] - ): - - """ - PersonGenerator: - A class to generate synthetic person records; - including basic information, addresses, call 
logs, affiliations, and criminal records. - - :param seed: Seed for random number generation. - :param country: Country code for phone number generation. - :param people_amt: Number of people to generate. - :param affiliations: List of affiliations to generate. - :param crimes: List of crimes to generate. - """ - - self.seed = seed - self.country = country - self.people_num = people_amt - Faker.seed(self.seed) - np.random.seed(self.seed) - self.fake = Faker() - self.random = random.Random(self.seed) - self.phone = PhoneNumber(self.country) - self.names = NameDataset() - self.address = random_address - self.first_names = self.names.get_top_names(n=self.people_num, country_alpha2=self.country)[self.country] - self.last_names = self.names.get_top_names(n=self.people_num, country_alpha2=self.country, use_first_names=False)[self.country] - - self.domains = pd.read_csv("domains.txt", header=None)[0].to_list() - self.affiliations = affiliations - self.crimes = crimes - - def generate_people( - self, - num_records: int = 100, - min_age: int = 15, - max_age: int = 85 - ) -> pd.DataFrame: - - """ - Generate a set of people records with basic information. - :param num_records: Number of records to generate. - :param min_age: Minimum age for date of birth generation. - :param max_age: Maximum age for date of birth generation. - :return: DataFrame of people records. 
- """ - records = [] - for _ in range(num_records): - gender = ["M", "F"] - sex = self.random.choice(gender) - record = { - "first_name": self.random.choice(self.first_names[sex]), - "last_name": self.random.choice(self.last_names), - "phone_number": self.phone.get_number(full=False), - "sex": sex, - "DOB": self.fake.date_of_birth( - minimum_age=min_age, - maximum_age=max_age - ), - } - record["email_address"] = record["first_name"] + record["last_name"] + str(self.random.randint(0, 999)) + self.random.choice(self.domains) - records.append(record) - - df = pd.DataFrame(records) - return df - - def get_address(self) -> dict: - return self.address.real_random_address() - - def get_address_by_state(self, state: str) -> dict: - return self.address.real_random_address_by_state(state) - - def get_address_by_postal_code(self, postal_code: str) -> dict: - return self.address.real_random_address_by_postal_code(postal_code) - - def generate_addresses( - self, - num_records: int = 100, - start_date: str = "-30y", - end_date: str = "today", - state: str = None, - postal_code: str = None - ) -> pd.DataFrame: - """ - Generate addresses for a set of people, simulating a history of addresses. - :param num_records: Number of addresses to generate. - :param start_date: Start date for address history. - :param end_date: End date for address history. - :param state: Optional state to generate addresses for. - :param postal_code: Optional postal code to generate addresses for. - :return: DataFrame of addresses. - :raises ValueError: If both state and postal_code are provided. - """ - - if state and postal_code: - raise ValueError("Cannot specify both state and postal code. 
Please choose one.") - - records = [] - for _ in range(num_records): - if state: - address = self.get_address_by_state(state) - elif postal_code: - address = self.get_address_by_postal_code(postal_code) - else: - address = self.get_address() - - record = { - "address1": address.get('address1', ''), - "address2": address.get('address2', ''), - "city": address.get("city", "Unknown City"), - "date": self.fake.date_between(start_date=start_date, end_date=end_date), - "state": address.get("state", "Unknown State"), - "zip": address.get("postalCode", "Unknown PostalCode"), - "lat": address.get("coordinates", {}).get("lat", 0.0), - "lon": address.get("coordinates", {}).get("lng", 0.0) - } - records.append(record) - df = pd.DataFrame(records) - return df - - def generate_call_logs( - self, - people_df: pd.DataFrame, - num_logs: int = 500, - start_date: str = '-1y' - ) -> pd.DataFrame: - """ - Generate call logs for a set of people, simulating everyday calls. - :param people_df: DataFrame of people with affiliations. - :param num_logs: Number of call logs to generate. - :param start_date: Start date for call logs. - :return: DataFrame of call logs. 
- """ - call_logs = [] - phone_numbers = people_df['phone_number'].tolist() - - for _ in range(num_logs): - caller, callee = self.random.sample(phone_numbers, 2) # Ensure caller and callee are different - call_date = self.fake.date_time_between(start_date=start_date) - call_time = call_date + timedelta(hours=self.random.randint(0, 23), minutes=self.random.randint(0, 59), seconds=self.random.randint(0, 59)) - duration = self.random.randint(1, 3600) # Call duration in seconds, from 1 sec to 1 hour - - call_logs.append({ - "caller": caller, - "callee": callee, - "call_date": call_date.strftime('%Y-%m-%d'), - "call_time": call_time.strftime('%H:%M:%S'), - "duration_sec": duration - }) - - return pd.DataFrame(call_logs) - - def generate_non_affiliated_call_logs( - self, - people_df: pd.DataFrame, - call_logs_df: pd.DataFrame, - num_calls: int = 500, - start_date: str = '-1y' - ) -> pd.DataFrame: - """ - Generate call logs for non-affiliated individuals, simulating everyday calls. - - :param people_df: DataFrame of people with affiliations. - :param call_logs_df: DataFrame of call logs to append to. - :param num_calls: Number of calls to generate among non-affiliated individuals. - :return: Updated DataFrame with non-affiliated call logs. - """ - # Filter for non-affiliated individuals - non_affiliated_people = people_df[people_df['affiliation'] == 'None'] - - # Generate call logs - for _ in range(num_calls): - if len(non_affiliated_people) > 1: - caller, callee = non_affiliated_people.sample(n=2, replace=False)['phone_number'].values - self.add_call_log(call_logs_df, caller, callee, start_date) - - return call_logs_df - - def generate_affiliated_call_logs( - self, - people_df: pd.DataFrame, - call_logs_df: pd.DataFrame, - num_affiliated_calls: int = 100, - leader_call_percentage: float = 0.05, - start_date: str = '-1y' - ) -> pd.DataFrame: - """ - Generate call logs with a focus on gang affiliations, including both intra-gang and inter-gang communications. 
def add_call_log(
    self,
    call_logs_df: pd.DataFrame,
    caller: str,
    callee: str,
    start_date: str,
) -> pd.DataFrame:
    """
    Append a single call-log entry to ``call_logs_df`` in place.

    Bug fix: the original built a new DataFrame with ``pd.concat`` and
    returned it, but the call sites in the affiliated-call generator invoked
    this method without assigning the result, so every generated call was
    silently discarded.  The entry is now written into ``call_logs_df``
    itself; the frame is still returned for backward compatibility.

    :param call_logs_df: DataFrame the new entry is appended to (mutated).
    :param caller: Phone number of the calling party.
    :param callee: Phone number of the called party.
    :param start_date: Earliest datetime (passed to Faker) for the call.
    :return: The same DataFrame, with one extra row.
    """
    call_date = self.fake.date_time_between(start_date=start_date)
    # NOTE(review): adding random hours/minutes can roll call_time past
    # midnight, so the formatted time may not belong to call_date's day —
    # kept as in the original to preserve the distribution; confirm intent.
    call_time = call_date + timedelta(
        hours=self.random.randint(0, 23),
        minutes=self.random.randint(0, 59),
        seconds=self.random.randint(0, 59),
    )
    duration = self.random.randint(1, 3600)  # seconds: 1 sec .. 1 hour

    new_row = {
        "caller": caller,
        "callee": callee,
        "call_date": call_date.strftime('%Y-%m-%d'),
        "call_time": call_time.strftime('%H:%M:%S'),
        "duration_sec": duration,
    }
    # Per-column .loc assignment enlarges the frame in place and also
    # creates the columns when the frame starts out without them.
    row_idx = len(call_logs_df)
    for column, value in new_row.items():
        call_logs_df.loc[row_idx, column] = value
    return call_logs_df


def generate_affiliations(
    self,
    people_df: pd.DataFrame,
    percentage_affiliated: float = 0.1,
    lambda_param: float = 1.5,
) -> pd.DataFrame:
    """
    Generate affiliations for a subset of the provided DataFrame of people.

    :param people_df: DataFrame of people (mutated: gains 'affiliation').
    :param percentage_affiliated: Approximate fraction of people to affiliate.
    :param lambda_param: Parameter of the exponential draw controlling how
        unevenly members are spread across groups.
    :return: Updated DataFrame with an 'affiliation' column.
    """
    num_people = len(people_df)
    num_affiliated = int(num_people * percentage_affiliated)

    # Draw a raw (unnormalised) size for every group.
    # NOTE(review): np.random.exponential takes a *scale* (= 1/lambda) and
    # bypasses self.random / the instance seed — confirm whether that is
    # intentional before relying on reproducibility.
    raw_sizes = np.random.exponential(lambda_param, len(self.affiliations))
    group_sizes = np.round(
        (raw_sizes / raw_sizes.sum()) * num_affiliated
    ).astype(int)

    # Rounding can make the total drift off target; nudge the largest /
    # smallest group until the counts add up exactly.
    while group_sizes.sum() != num_affiliated:
        if group_sizes.sum() > num_affiliated:
            group_sizes[np.argmax(group_sizes)] -= 1
        else:
            group_sizes[np.argmin(group_sizes)] += 1

    # Assign affiliations to randomly selected, not-yet-affiliated people.
    # (Loop variable renamed from 'count', which shadowed the module-level
    # itertools.count import.)
    people_df['affiliation'] = 'None'
    already_selected: set = set()
    for group_size, affiliation in zip(group_sizes, self.affiliations):
        eligible_indices = [
            i for i in range(num_people) if i not in already_selected
        ]
        selected_indices = self.random.sample(eligible_indices, int(group_size))
        people_df.loc[selected_indices, 'affiliation'] = affiliation
        already_selected.update(selected_indices)

    return people_df


def assign_whereabouts_to_people(
    self,
    people_df: pd.DataFrame,
    addresses_df: pd.DataFrame,
    percent_cohabitating: float = 0.2,
) -> pd.DataFrame:
    """
    Assign addresses to people, letting some gang-affiliated individuals
    share an address.

    :param people_df: DataFrame of people with an 'affiliation' column.
    :param addresses_df: DataFrame of addresses; assumed to carry the
        columns address1/address2/city/state/zip/lat/lon/date — TODO confirm
        against the address generator.
    :param percent_cohabitating: Fraction of each gang that shares addresses.
    :return: New DataFrame with address columns attached.
    :raises ValueError: If there are fewer addresses than people.
    """
    if len(addresses_df) < len(people_df):
        raise ValueError("Not enough addresses to assign to each person uniquely.")

    # Everyone starts out with their own randomly sampled address.
    people_df = pd.concat(
        [people_df, addresses_df.sample(len(people_df)).reset_index(drop=True)],
        axis=1,
    )

    address_cols = ['address1', 'address2', 'city', 'state', 'zip', 'lat', 'lon', 'date']
    affiliated_groups = people_df[people_df['affiliation'] != 'None']['affiliation'].unique()

    for gang in affiliated_groups:
        gang_members = people_df[people_df['affiliation'] == gang]
        # Roughly percent_cohabitating of the gang ends up sharing a roof.
        num_addresses = int(len(gang_members) * percent_cohabitating)
        shared_addresses = addresses_df.sample(num_addresses)

        for _, address in shared_addresses.iterrows():
            if gang_members.empty:
                # Guard: all members already placed; nothing left to group.
                break
            # At least 2 members per shared address when possible.
            members_to_live_together = gang_members.sample(
                n=2 if len(gang_members) > 1 else 1
            )
            for _, member in members_to_live_together.iterrows():
                people_df.loc[member.name, address_cols] = address[address_cols]
            # Drop placed members so they are not grouped twice.
            gang_members = gang_members.drop(members_to_live_together.index)

    return people_df


def expand_cases_to_columns(self, people_df: pd.DataFrame) -> pd.DataFrame:
    """
    Expand the 'cases' column into flat case_number_i / crime_i_j columns.

    :param people_df: DataFrame of people with a 'cases' column, where each
        entry is a list of {'case_number': int, 'crimes': [str, ...]} dicts.
    :return: Updated DataFrame with the expanded case detail columns.
    """
    # Caps both the number of cases expanded per person and the number of
    # crimes expanded per case; adjust to fit the dataset.
    max_crimes_per_case = 3
    for i in range(max_crimes_per_case):
        people_df[f'case_number_{i+1}'] = None
        for j in range(max_crimes_per_case):
            people_df[f'crime_{i+1}_{j+1}'] = None

    for index, row in people_df.iterrows():
        # Tolerate NaN / non-list entries from partial merges.
        cases = row['cases'] if isinstance(row['cases'], list) else []
        for i, case in enumerate(cases[:max_crimes_per_case]):
            people_df.at[index, f'case_number_{i+1}'] = case['case_number']
            for j, crime in enumerate(case['crimes'][:max_crimes_per_case]):
                people_df.at[index, f'crime_{i+1}_{j+1}'] = crime

    # The raw 'cases' column is intentionally kept for downstream use.
    return people_df


def generate_and_assign_criminal_records(
    self,
    people_df: pd.DataFrame,
    max_cases_per_person: int = 3,
) -> pd.DataFrame:
    """
    Generate criminal records for a subset of the provided people, letting
    gang members occasionally share a case with fellow members.

    :param people_df: DataFrame of people with an 'affiliation' column.
    :param max_cases_per_person: Maximum number of cases per person.
    :return: Updated DataFrame with criminal records expanded to columns.
    """
    unique_case_number = count(start=1000, step=1)  # unique case numbers
    criminal_records = []          # one {'person_id', 'cases'} dict per person
    gang_related_cases = {}        # affiliation -> cases open for sharing

    for index, person in people_df.iterrows():
        num_cases = self.random.randint(0, max_cases_per_person)
        records_for_person = {"person_id": index, "cases": []}
        affiliation = person['affiliation']

        for _ in range(num_cases):
            # ~30% of the time a gang member re-uses an existing gang case.
            if (
                affiliation != 'None'
                and gang_related_cases.get(affiliation)
                and self.random.random() < 0.3
            ):
                shared_case = self.random.choice(gang_related_cases[affiliation])
                records_for_person["cases"].append(shared_case)
            else:
                # New case with 1..3 distinct crimes.
                case_num = next(unique_case_number)
                crimes_in_case = self.random.sample(
                    self.crimes,
                    self.random.randint(1, min(3, len(self.crimes))),
                )
                new_case = {"case_number": case_num, "crimes": crimes_in_case}
                records_for_person["cases"].append(new_case)
                # Gang cases become candidates for sharing later.
                if affiliation != 'None':
                    gang_related_cases.setdefault(affiliation, []).append(new_case)

        criminal_records.append(records_for_person)

    # Merge the generated records back onto the people frame by index.
    criminal_records_df = pd.DataFrame(criminal_records)
    people_df = pd.merge(
        people_df, criminal_records_df,
        how='left', left_index=True, right_on='person_id',
    )
    people_df.drop('person_id', axis=1, inplace=True)

    # People with no record get an empty list instead of NaN.
    people_df['cases'] = people_df['cases'].apply(
        lambda x: x if isinstance(x, list) and len(x) > 0 else []
    )

    return self.expand_cases_to_columns(people_df)
import factory
from datetime import datetime, timedelta
from functools import lru_cache
from pathlib import Path

import pandas as pd
import numpy as np
import random_address


@lru_cache(maxsize=1)
def _load_domains() -> tuple:
    """Load the candidate e-mail domains once and cache them.

    Bug fix: the original re-read ``domains.txt`` from the *current working
    directory* for every generated profile.  The file is now resolved next
    to this script and parsed a single time.
    """
    path = Path(__file__).with_name('domains.txt')
    return tuple(pd.read_csv(path, header=None)[0].to_list())


class Profile:
    """Plain container for one synthetic person produced by ProfileFactory."""

    def __init__(
        self,
        firstname,
        lastname,
        phone_number,
        username,
        email,
        address,
        dob,
        whereabouts,
        num_whereabouts=None,
        postal_code=None,
        state=None,
        rand_num=None
    ):
        # Optional generation knobs (state/postal_code constrain addresses;
        # rand_num disambiguates usernames/e-mails).
        self.postal_code = postal_code
        self.state = state
        self.username = username
        self.email = email
        self.firstname = firstname
        self.lastname = lastname
        self.phone_number = phone_number
        self.address = address
        self.DOB = dob
        self.whereabouts = whereabouts
        self.rand_num = rand_num
        self.num_whereabouts = num_whereabouts

    def to_dict(self):
        """Return the public fields as a plain dict (e.g. for a DataFrame row)."""
        return {"first_name": self.firstname,
                "last_name": self.lastname,
                "user_name": self.username,
                "DOB": self.DOB,
                "email": self.email,
                "phone": self.phone_number,
                "address": self.address,
                "whereabouts": self.whereabouts
                }

    def __str__(self):
        return str(self.__dict__)


# Factory that fabricates randomized Profile instances.
class ProfileFactory(factory.Factory):
    class Meta:
        model = Profile

    # Optional parameters for address generation.
    state = None
    postal_code = None
    num_whereabouts = None
    rand_num = factory.LazyFunction(lambda: str(np.random.randint(0, 999)))
    username = factory.LazyAttribute(
        lambda obj: f"{obj.firstname}.{obj.lastname}{obj.rand_num}".lower())
    # NOTE(review): rand_num is prepended to the *domain* here, yielding
    # addresses like john.doe@123example.com — looks odd but kept as-is;
    # confirm whether it was meant to go in the local part instead.
    email = factory.LazyAttribute(
        lambda obj: f"{obj.firstname}.{obj.lastname}@"
                    f"{str(obj.rand_num) + np.random.choice(_load_domains())}".lower())
    # Uniform age between roughly 15 and 85 years.
    dob = factory.LazyFunction(
        lambda: (datetime.today()
                 - timedelta(days=np.random.randint(15 * 365, 85 * 365))
                 ).strftime('%m-%d-%Y'))
    firstname = factory.Faker('first_name')
    lastname = factory.Faker('last_name')
    phone_number = factory.Faker('basic_phone_number', locale="en_US")
    # Current address: occupied from some point in the last year until today.
    address = factory.LazyAttribute(
        lambda obj: ProfileFactory.generate_address(
            state=obj.state,
            postal_code=obj.postal_code,
            from_date=(datetime.today()
                       - timedelta(days=np.random.randint(0, 365))
                       ).strftime('%m-%d-%Y'),
            to_date=datetime.today().strftime('%m-%d-%Y')))
    # Past addresses, 1-5 years back.  Bug fix: num_whereabouts defaults to
    # None, and range(None) raised TypeError on every ProfileFactory() call
    # that did not set it explicitly; 'or 0' makes the default mean "none".
    whereabouts = factory.LazyAttribute(
        lambda obj: [ProfileFactory.generate_address(
                         state=obj.state,
                         postal_code=obj.postal_code,
                         from_date=(datetime.today()
                                    - timedelta(days=np.random.randint(365, 365 * 5))
                                    ).strftime('%m-%d-%Y'),
                         to_date=(datetime.today()
                                  - timedelta(days=np.random.randint(0, 365))
                                  ).strftime('%m-%d-%Y'))
                     for _ in range(obj.num_whereabouts or 0)])

    @staticmethod
    def generate_address(state=None, postal_code=None, from_date=None, to_date=None) -> dict:
        """
        Generate a random real-world address, optionally constrained to a
        state or a postal code (but not both), and stamp it with an
        occupancy interval.

        :param state: Two-letter state code to constrain the address to.
        :param postal_code: Postal code to constrain the address to.
        :param from_date: Start of occupancy (stored on the address dict).
        :param to_date: End of occupancy (stored on the address dict).
        :return: Address dict with 'from_date'/'to_date' keys added.
        :raises ValueError: If both state and postal_code are given.
        """
        if state and postal_code:
            raise ValueError("Cannot specify both state and postal code. Please choose one.")
        elif state:
            address = random_address.real_random_address_by_state(state)
        elif postal_code:
            address = random_address.real_random_address_by_postal_code(postal_code)
        else:
            address = random_address.real_random_address()

        # Attach the occupancy window to the address record.
        address['from_date'] = from_date
        address['to_date'] = to_date

        return address