In [None]:
# 시장 데이터 수집 및 분석 자동화 시스템 v1.0
# - 산업 구조 분석 확장
# - 산업별 성장률 계산, 위험도 평가, 텍스트 마이닝 추가

import datetime
import os
import warnings
from collections import defaultdict
from urllib.parse import quote_plus

import feedparser
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import seaborn as sns
import yfinance as yf
from google.colab import drive
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from statsmodels.tsa.arima.model import ARIMA
from textblob import TextBlob

# Silence every warning category for the whole run (library deprecation
# notices included), so problems will not surface as warnings.
warnings.filterwarnings("ignore")

# Mount Google Drive and make sure the output folder exists; every report,
# spreadsheet and chart written by this script goes under base_path.
drive.mount('/content/drive')
base_path = "/content/drive/MyDrive/MarketAnalysis"
os.makedirs(base_path, exist_ok=True)

# User settings
TICKERS = ["AAPL", "MSFT"]  # manual ticker list (multi-ticker support)
START_DATE = "2019-01-01"
END_DATE = "2024-12-31"
FORECAST_PERIOD = 5  # number of trailing observations held out and forecast

# API credentials. An environment variable of the same name takes precedence
# so keys can be supplied without editing the source.
# NOTE(review): the literal fallbacks are kept only so existing runs keep
# working; secrets committed to source should be rotated and removed.
BANK_API_KEY = os.environ.get("BANK_API_KEY") or "CD2MAHGKWH56WMLLHB72"
KOSIS_API_KEY = (
    os.environ.get("KOSIS_API_KEY")
    or "NWVhOTg3NjU1NTEzNjY2YzQxZmY4MzczODIyMTdhNTU="
)

# Pick the three industries with the highest growth rate and analyse every
# ticker that belongs to one of them.
#
# The lookup tables are defined here because the selection runs at import
# time: reading them before they are assigned raises NameError when the file
# is executed top to bottom.
# NOTE: the same two tables are assigned again in section 7 further down the
# file; keep both copies in sync (or drop the later one).
INDUSTRY_DB = {
    'AAPL': '반도체',
    'MSFT': 'AI',
}

INDUSTRY_GROWTH = {
    '반도체': 0.12,
    '전기차': 0.08,
    'AI': 0.20,
    '헬스케어': 0.05,
}

# (industry, growth) pairs, highest growth first, truncated to the top three.
top_industries = sorted(INDUSTRY_GROWTH.items(), key=lambda x: x[1], reverse=True)[:3]
top_industry_names = [industry for industry, _ in top_industries]

# Tickers whose industry is among the selected ones, in INDUSTRY_DB order.
ANALYSIS_TICKERS = [ticker for ticker, industry in INDUSTRY_DB.items() if industry in top_industry_names]

print(f"✅ 분석 대상 종목: {ANALYSIS_TICKERS} (기준: 성장률 상위 산업 3개)")

# Utility: clean up broken text

def clean_text(text):
    """Return *text* without characters that cannot be encoded as UTF-8.

    The string is round-tripped through UTF-8 with the ``ignore`` error
    handler, which drops unencodable code points such as lone surrogates.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string, or ``""`` when *text* cannot be converted
        (for example ``None`` or another object without ``encode``).
    """
    try:
        return text.encode('utf-8', 'ignore').decode('utf-8')
    except (AttributeError, UnicodeError):
        # Only conversion failures are treated as "no text"; anything else
        # (KeyboardInterrupt, SystemExit, ...) is allowed to propagate.
        return ""

# 1. Price data collection

def fetch_price_data(ticker, start, end):
    """Download price history for one ticker and save a raw copy to Excel.

    Args:
        ticker: Symbol passed to ``yf.download``.
        start: Start date passed through to ``yf.download``.
        end: End date passed through to ``yf.download``.

    Returns:
        The downloaded DataFrame with single-level column names. It is also
        written to ``{base_path}/{ticker}_raw_price.xlsx``.
    """
    data = yf.download(ticker, start=start, end=end)
    # Some yfinance releases return two-level columns even for a single
    # ticker. The rest of this file indexes plain names such as 'Close', so
    # collapse to the first level.
    # NOTE(review): assumes the first level holds the field names (yfinance's
    # default column grouping) — confirm if group_by is ever passed.
    if isinstance(data.columns, pd.MultiIndex):
        data.columns = data.columns.get_level_values(0)
    data.to_excel(f"{base_path}/{ticker}_raw_price.xlsx")
    return data

# 2. Cleaning and return calculation

def clean_data(df):
    """Drop incomplete rows and append simple and log return columns.

    Args:
        df: DataFrame with at least a ``Close`` column. It is not modified.

    Returns:
        A new DataFrame without rows containing NaN, plus:
        ``Return``    - percentage change of ``Close`` from the previous row;
        ``LogReturn`` - natural log of ``Close`` over the previous ``Close``.
        The first remaining row has NaN in both columns because it has no
        predecessor.
    """
    # Take an explicit copy: assigning columns to the object returned by
    # dropna() can otherwise raise pandas' SettingWithCopyWarning.
    df = df.dropna().copy()
    close = df['Close']
    df['Return'] = close.pct_change()
    df['LogReturn'] = np.log(close / close.shift(1))
    return df

# 3. Bank of Korea CPI collection

def fetch_bok_cpi(api_key, start="201901", end="202412", timeout=30):
    """Fetch a monthly series from the Bank of Korea ECOS StatisticSearch API.

    Args:
        api_key: ECOS API key inserted into the request path.
        start: First month to request, as ``YYYYMM``.
        end: Last month to request, as ``YYYYMM``.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        DataFrame with a ``Date`` column (datetime, parsed from ``TIME``) and
        a ``CPI`` column (numeric, parsed from ``DATA_VALUE``; unparsable
        values become NaN).

    Raises:
        requests.RequestException: On timeout, connection failure or a
            non-2xx HTTP status.
        KeyError: When the response has no ``StatisticSearch`` -> ``row``
            payload.
    """
    # NOTE(review): ECOS documents the path order as
    # .../{stat_code}/{cycle}/{start}/{end}/{item_code}; here "2000001" sits
    # between the table code and the cycle "M" — confirm against the API guide.
    url = (
        "https://ecos.bok.or.kr/api/StatisticSearch/"
        f"{api_key}/json/kr/1/1000/901Y009/2000001/M/{start}/{end}"
    )
    # Without a timeout a stalled connection would block the run forever.
    res = requests.get(url, timeout=timeout)
    res.raise_for_status()
    json_data = res.json()
    try:
        rows = json_data['StatisticSearch']['row']
    except (KeyError, TypeError) as err:
        # Surface what the service actually returned instead of a bare key name.
        raise KeyError(
            f"ECOS response has no StatisticSearch rows: {str(json_data)[:200]}"
        ) from err
    df = pd.DataFrame(rows)[['TIME', 'DATA_VALUE']]
    df.columns = ['Date', 'CPI']
    df['Date'] = pd.to_datetime(df['Date'], format='%Y%m')
    df['CPI'] = pd.to_numeric(df['CPI'], errors='coerce')
    return df

# 4. News collection and sentiment analysis

def fetch_news_rss(keyword, max_items=20):
    """Search Google News RSS for *keyword* and score each headline.

    Args:
        keyword: Raw (not URL-encoded) search text; it may contain spaces or
            non-ASCII characters.
        max_items: Maximum number of feed entries to process.

    Returns:
        DataFrame with columns ``title``, ``link``, ``published`` and
        ``sentiment`` (TextBlob polarity of the title). The columns are
        present even when no entry could be collected.
    """
    columns = ['title', 'link', 'published', 'sentiment']
    # Encode the query: a raw space or non-ASCII character in the URL makes
    # the request invalid, which previously left such searches empty.
    feed_url = f"https://news.google.com/rss/search?q={quote_plus(keyword)}"
    feed = feedparser.parse(feed_url)
    if feed.bozo:
        print(f"⚠️ RSS 파싱 오류 발생: {feed.bozo_exception}")
        # feedparser raises the bozo flag for any malformed feed but may still
        # have parsed entries; give up only when nothing usable came back.
        if not feed.entries:
            return pd.DataFrame(columns=columns)
    entries = feed.entries[:max_items]
    cleaned_entries = []
    for e in entries:
        try:
            title = clean_text(e.title)
            link = clean_text(e.link)
            published = clean_text(e.published)
            # NOTE(review): TextBlob's default analyzer is presumably
            # English-only, so non-English titles may score 0.0 — confirm.
            sentiment = TextBlob(title).sentiment.polarity
            cleaned_entries.append({'title': title, 'link': link, 'published': published, 'sentiment': sentiment})
        except Exception as ex:
            # Best effort: report and skip an entry that is missing a field
            # or cannot be scored, and keep the rest.
            print(f"뉴스 항목 처리 중 오류: {ex}")
            continue
    return pd.DataFrame(cleaned_entries, columns=columns)

# Industry-level news text mining

def fetch_industry_news_sentiment(industry):
    """Summarise news sentiment for one industry.

    Searches news for the industry name followed by " 산업" and aggregates
    the per-article sentiment scores.

    Args:
        industry: Industry name used as the search keyword prefix.

    Returns:
        ``(mean sentiment, number of articles)``; ``(0.0, 0)`` when the
        search returned no articles.
    """
    articles = fetch_news_rss(industry + " 산업")
    if not articles.empty:
        return articles['sentiment'].mean(), len(articles)
    return 0.0, 0

# 5. Systems-thinking archetype diagnosis

def diagnose_archetype(df):
    """Flag systems-thinking archetypes from the latest rolling statistics.

    Args:
        df: DataFrame with a ``Return`` column.

    Returns:
        Dict of three boolean flags:
        "성장의 한계"      - latest 60-row rolling mean of ``Return`` is negative;
        "지연된 반응"      - latest 30-row rolling std of ``Return`` exceeds 0.04;
        "의도와 다른 결과" - always False (no rule sets it).
    """
    returns = df['Return']
    latest_mean_60 = returns.rolling(60).mean().iloc[-1]
    latest_std_30 = returns.rolling(30).std().iloc[-1]
    # A NaN statistic (window not yet full) compares False, leaving the flag off.
    return {
        "성장의 한계": bool(latest_mean_60 < 0),
        "지연된 반응": bool(latest_std_30 > 0.04),
        "의도와 다른 결과": False,
    }

# 6. Hexagram-shift warning system

def energetic_shift_warning(df):
    """Report whether recent return volatility spikes above its usual level.

    Compares the latest 10-row rolling std of ``Return`` with 1.5 times the
    mean of the 60-row rolling std over the whole frame.

    Args:
        df: DataFrame with a ``Return`` column.

    Returns:
        A warning message when the latest short-window volatility exceeds
        the threshold, otherwise a "stable" message.
    """
    returns = df['Return']
    threshold = returns.rolling(60).std().mean() * 1.5
    latest_short_vol = returns.rolling(10).std().iloc[-1]
    # NaN on either side compares False, which falls through to "stable".
    shifted = latest_short_vol > threshold
    return "⚠️ 괘 변화 가능성: 급격한 형세 전환 징후 감지됨" if shifted else "형세 안정 상태"

# 7. Industry structure analysis (extended)

# Ticker -> industry label; the label is the key into the tables below.
INDUSTRY_DB = {'AAPL': '반도체', 'MSFT': 'AI'}

# One row per industry: (name, growth rate, risk level, market dominance).
# The three lookup dicts are derived from this single table so an industry
# cannot be present in one of them and missing from another.
_INDUSTRY_PROFILES = (
    ('반도체', 0.12, '중간', '강함'),
    ('전기차', 0.08, '높음', '중간'),
    ('AI', 0.20, '높음', '약함'),
    ('헬스케어', 0.05, '낮음', '강함'),
)

# Industry -> growth rate.
INDUSTRY_GROWTH = {name: growth for name, growth, _risk, _dom in _INDUSTRY_PROFILES}

# Industry -> risk level label.
INDUSTRY_RISK = {name: risk for name, _growth, risk, _dom in _INDUSTRY_PROFILES}

# Industry -> market dominance label.
INDUSTRY_DOMINANCE = {name: dom for name, _growth, _risk, dom in _INDUSTRY_PROFILES}

def analyze_industry(ticker):
    """Build a one-row industry profile for *ticker*.

    The ticker is mapped to an industry through ``INDUSTRY_DB`` (falling back
    to '기타'), then combined with the static growth / risk / dominance
    tables and the industry's current news sentiment.

    Args:
        ticker: Ticker symbol to look up.

    Returns:
        Single-row DataFrame with columns '산업', '성장률', '위험도',
        '시장 지배력', '감성 점수' and '기사 수'. Industries missing from a
        table get 0.00 (growth) or '불명' (risk, dominance).
    """
    industry = INDUSTRY_DB.get(ticker, '기타')
    mean_sentiment, n_articles = fetch_industry_news_sentiment(industry)
    profile = {
        '산업': industry,
        '성장률': INDUSTRY_GROWTH.get(industry, 0.00),
        '위험도': INDUSTRY_RISK.get(industry, '불명'),
        '시장 지배력': INDUSTRY_DOMINANCE.get(industry, '불명'),
        '감성 점수': mean_sentiment,
        '기사 수': n_articles,
    }
    return pd.DataFrame([profile])

# 8. Quantitative analysis

def run_statistical_analysis(df):
    """Compute a correlation matrix and a Close-on-Volume linear fit.

    Args:
        df: DataFrame with ``Close``, ``Volume`` and ``Return`` columns.
            Rows containing NaN are ignored.

    Returns:
        ``(corr, coef, intercept)`` where ``corr`` is the correlation matrix
        of Close / Volume / Return and ``coef`` / ``intercept`` come from a
        linear regression of ``Close`` on ``Volume``.
    """
    complete_rows = df.dropna()
    correlation = complete_rows[['Close', 'Volume', 'Return']].corr()
    features = complete_rows[['Volume']]
    target = complete_rows['Close']
    regression = LinearRegression()
    regression.fit(features, target)
    slope = regression.coef_[0]
    return correlation, slope, regression.intercept_

# 9. Time-series forecast + RMSE + chart

def run_arima_forecast(df, ticker):
    """Fit ARIMA(5,1,0) on ``Close``, score it on a hold-out tail and plot it.

    The last ``FORECAST_PERIOD`` observations are held out, the model is
    fitted on the rest, and the forecast for the held-out span is compared
    with the actual values.

    Args:
        df: DataFrame with a ``Close`` column.
        ticker: Symbol used in the chart title and output file name.

    Returns:
        ``(forecast, rmse)``: the forecast Series re-indexed to the held-out
        rows, and its root mean squared error against them. The chart is
        saved to ``{base_path}/{ticker}_forecast.png``.
    """
    closes = df['Close'].dropna()
    history = closes[:-FORECAST_PERIOD]
    holdout = closes[-FORECAST_PERIOD:]

    fitted = ARIMA(history, order=(5,1,0)).fit()
    forecast = fitted.forecast(steps=FORECAST_PERIOD)
    rmse = np.sqrt(mean_squared_error(holdout, forecast))
    # Align the forecast with the held-out rows so both lines share an x axis.
    forecast.index = holdout.index

    fig, ax = plt.subplots(figsize=(10, 4))
    ax.plot(closes, label='Actual')
    ax.plot(forecast.index, forecast, label='Forecast', linestyle='--')
    ax.set_title(f"{ticker} ARIMA 예측 (RMSE={rmse:.2f})")
    ax.legend()
    fig.savefig(f"{base_path}/{ticker}_forecast.png")
    plt.close(fig)
    return forecast, rmse

# Main run
# Iterates ANALYSIS_TICKERS; the manually configured TICKERS list is not
# used by this loop.
# For each ticker: download and clean prices, gather news, run the
# diagnostics, statistics and forecast, save charts, and write one Excel report.

# The CPI series does not depend on the ticker, so it is downloaded once on
# the first iteration and reused instead of hitting the API for every ticker.
cpi_data = None

for TICKER in ANALYSIS_TICKERS:
    raw_data = fetch_price_data(TICKER, START_DATE, END_DATE)
    cleaned_data = clean_data(raw_data)
    if cpi_data is None:
        cpi_data = fetch_bok_cpi(BANK_API_KEY)
    news_df = fetch_news_rss(TICKER)
    archetype_result = diagnose_archetype(cleaned_data)
    energetic_status = energetic_shift_warning(cleaned_data)
    industry_df = analyze_industry(TICKER)
    correlation_matrix, coef, intercept = run_statistical_analysis(cleaned_data)
    forecast_result, rmse = run_arima_forecast(cleaned_data, TICKER)

    # Charts: closing price trend
    plt.figure(figsize=(12, 6))
    plt.plot(cleaned_data['Close'], label='종가 추이')
    plt.title(f'{TICKER} 가격 추이')
    plt.grid()
    plt.savefig(f"{base_path}/{TICKER}_price_plot.png")
    plt.close()

    # Charts: distribution of daily returns
    sns.histplot(cleaned_data['Return'].dropna(), bins=50, kde=True)
    plt.title(f'{TICKER} 일간 수익률 분포')
    plt.savefig(f"{base_path}/{TICKER}_return_hist.png")
    plt.close()

    # Charts: per-article sentiment (skipped when no news was collected)
    if not news_df.empty:
        sns.barplot(x=news_df.index, y=news_df['sentiment'])
        plt.title(f'{TICKER} 뉴스 감성 분석')
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.savefig(f"{base_path}/{TICKER}_sentiment_bar.png")
        plt.close()

    # Report: one workbook per ticker, one sheet per analysis step
    with pd.ExcelWriter(f"{base_path}/{TICKER}_analysis_report_v1_0.xlsx") as writer:
        raw_data.to_excel(writer, sheet_name="원시 데이터")
        cleaned_data.to_excel(writer, sheet_name="정제 데이터")
        cpi_data.to_excel(writer, sheet_name="CPI 지표")
        news_df.to_excel(writer, sheet_name="뉴스 요약")
        pd.DataFrame([archetype_result]).to_excel(writer, sheet_name="시스템 사고 진단")
        pd.DataFrame({"에너지 변화": [energetic_status]}).to_excel(writer, sheet_name="괘 분석")
        industry_df.to_excel(writer, sheet_name="산업 구조 분석")
        correlation_matrix.to_excel(writer, sheet_name="상관분석")
        pd.DataFrame({"회귀 계수": [coef], "절편": [intercept]}).to_excel(writer, sheet_name="회귀분석")
        # The single RMSE value is repeated so it lines up with each forecast row.
        pd.DataFrame({"날짜": forecast_result.index, "ARIMA 예측": forecast_result.values, "RMSE": [rmse]*FORECAST_PERIOD}).to_excel(writer, sheet_name="시계열 예측", index=False)
