-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathanalysis.py
More file actions
86 lines (71 loc) · 3.03 KB
/
analysis.py
File metadata and controls
86 lines (71 loc) · 3.03 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
import argparse
import os
import pandas as pd
import matplotlib.pyplot as plt
# Columns every usable row must provide; rows lacking any of them are dropped.
_REQUIRED_COLUMNS = (
    "date",
    "crop",
    "area_ha",
    "input_cost_tzs",
    "yield_kg",
    "price_per_kg_tzs",
)

# Required columns that take part in the revenue/profit arithmetic below.
_ARITHMETIC_COLUMNS = ("input_cost_tzs", "yield_kg", "price_per_kg_tzs")


def _finish_chart(title, xlabel, ylabel, path):
    """Label the current matplotlib figure, write it to *path*, then close it."""
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.tight_layout()
    plt.savefig(path)
    # Release the figure once it has been written to disk.
    plt.close()


def main(argv=None) -> None:
    """Clean a farm-sales CSV, print summary statistics and save charts.

    Reads the CSV given by ``--input``, drops rows with an unparseable date or
    a missing/non-numeric value in the required columns, removes duplicate
    rows (same date, crop and yield), derives ``revenue_tzs``, ``profit_tzs``
    and ``month``, and writes the result to ``cleaned_farm_sales.csv`` inside
    ``--outdir``. Three PNG charts are written to the same folder unless no
    rows survive cleaning.

    Args:
        argv: Optional list of command-line arguments. ``None`` (the default)
            makes argparse read ``sys.argv[1:]``, as the script always did.

    Raises:
        SystemExit: On invalid command-line arguments, or when the input CSV
            lacks one of the required columns (exit status 2, via argparse).
    """
    parser = argparse.ArgumentParser(description="Farm Sales Data Analysis (cleaning + stats + charts).")
    parser.add_argument("--input", default="data/farm_sales.csv", help="Path to input CSV")
    parser.add_argument("--outdir", default="outputs", help="Output folder for cleaned data and charts")
    args = parser.parse_args(argv)

    os.makedirs(args.outdir, exist_ok=True)

    # Load
    df = pd.read_csv(args.input)

    # Fail with a readable message instead of a bare KeyError further down.
    missing = [name for name in _REQUIRED_COLUMNS if name not in df.columns]
    if missing:
        parser.error(f"{args.input} is missing required column(s): {', '.join(missing)}")

    # Basic cleaning: unparseable dates/numbers become NaT/NaN and are dropped.
    df["date"] = pd.to_datetime(df["date"], errors="coerce")
    for name in _ARITHMETIC_COLUMNS:
        # A single stray string would otherwise leave the column object-typed
        # and break (or silently corrupt) the arithmetic below.
        df[name] = pd.to_numeric(df[name], errors="coerce")
    df = df.dropna(subset=list(_REQUIRED_COLUMNS))

    # Remove duplicates (same date + crop + yield); copy so the column
    # assignments below operate on an independent frame.
    df = df.drop_duplicates(subset=["date", "crop", "yield_kg"]).copy()

    # Feature engineering
    df["revenue_tzs"] = df["yield_kg"] * df["price_per_kg_tzs"]
    df["profit_tzs"] = df["revenue_tzs"] - df["input_cost_tzs"]
    df["month"] = df["date"].dt.to_period("M").astype(str)

    # Save cleaned data
    cleaned_path = os.path.join(args.outdir, "cleaned_farm_sales.csv")
    df.to_csv(cleaned_path, index=False)

    # Summary
    print("\n=== DATA SUMMARY ===")
    print(f"Rows: {len(df)}")
    print("\nColumns:")
    print(", ".join(df.columns))

    print("\n=== OVERALL STATS (TZS) ===")
    print(f"Total input cost: {df['input_cost_tzs'].sum():,.0f}")
    print(f"Total revenue : {df['revenue_tzs'].sum():,.0f}")
    print(f"Total profit : {df['profit_tzs'].sum():,.0f}")

    print("\n=== PROFIT BY CROP (TZS) ===")
    by_crop = (
        df.groupby("crop")[["profit_tzs", "revenue_tzs", "input_cost_tzs", "yield_kg"]]
        .sum()
        .sort_values("profit_tzs", ascending=False)
    )
    print(by_crop.to_string())

    if df.empty:
        # Nothing to draw; plotting empty data can raise inside pandas.
        print("\nNo valid rows left after cleaning; charts were skipped.")
        print(f"- Cleaned data: {cleaned_path}")
        return

    # Chart 1: Profit by crop (bar)
    chart1_path = os.path.join(args.outdir, "profit_by_crop.png")
    by_crop["profit_tzs"].plot(kind="bar")
    _finish_chart("Profit by Crop (TZS)", "Crop", "Profit (TZS)", chart1_path)

    # Chart 2: Monthly revenue trend (line)
    monthly = df.groupby("month")[["revenue_tzs", "profit_tzs"]].sum()
    chart2_path = os.path.join(args.outdir, "monthly_revenue_profit.png")
    monthly.plot(kind="line", marker="o")
    _finish_chart("Monthly Revenue & Profit (TZS)", "Month", "TZS", chart2_path)

    # Chart 3: Yield distribution (hist)
    chart3_path = os.path.join(args.outdir, "yield_distribution.png")
    df["yield_kg"].plot(kind="hist", bins=8)
    _finish_chart("Yield Distribution (kg)", "Yield (kg)", "Frequency", chart3_path)

    print("\n✅ Saved outputs:")
    print(f"- Cleaned data: {cleaned_path}")
    print(f"- Chart: {chart1_path}")
    print(f"- Chart: {chart2_path}")
    print(f"- Chart: {chart3_path}")
# Script entry point: run the analysis only when this file is executed
# directly, so that importing the module has no side effects.
if __name__ == "__main__":
    main()