#!/usr/bin/env python3
"""
台股新聞數據收集 (DIKW Layer D - 用 Haiku 3.5)
"""
import requests
import yaml
import json
import os
from datetime import datetime
from bs4 import BeautifulSoup

# Watchlist: 19 unique tickers (nominally 20; 2308 Delta Electronics spans two themes but is listed once)
WATCHLIST = [
    "2382", "3231", "6669", "2356", "2308",  # AI servers
    "3680", "6196", "2404", "6139", "3413",  # Semiconductor equipment
    "2408", "2344", "8299", "3260", "2451",  # Memory
    "1513", "1503", "1514", "2371"           # Energy (Delta Electronics not repeated)
]
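
# A minimal sketch of loading the watchlist from a YAML config instead of the
# hardcoded list above, which would also put the otherwise unused `yaml` import
# to work. The path "config/watchlist.yaml" and its "tickers" key are
# assumptions for illustration, not part of the original pipeline.
def load_watchlist(path="config/watchlist.yaml"):
    """Return tickers from a YAML file, falling back to the hardcoded list."""
    try:
        with open(path, encoding="utf-8") as f:
            config = yaml.safe_load(f) or {}
        return config.get("tickers", WATCHLIST)
    except FileNotFoundError:
        return WATCHLIST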

def fetch_cnyes_news(ticker):
    """從鉅亨網抓取個股新聞"""
    try:
        url = f"https://news.cnyes.com/news/cat/tw_stock"
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        
        response = requests.get(url, headers=headers, timeout=10)
        if response.status_code == 200:
            soup = BeautifulSoup(response.content, 'html.parser')
            # Simplified approach: scan headline tags for the ticker string
            news_titles = []
            for title_elem in soup.find_all(['h1', 'h2', 'h3'], limit=20):
                title = title_elem.get_text().strip()
                if ticker in title:
                    news_titles.append(title)
            return news_titles[:3]  # Keep at most 3 matching headlines
        
    except Exception as e:
        print(f"抓取 {ticker} 新聞失敗: {e}")
    
    return []
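
# Numeric tickers rarely appear in headlines, which usually use the company
# name instead, so the substring match above misses most coverage. A minimal
# sketch of name-aware matching, not wired into fetch_cnyes_news; TICKER_NAMES
# holds two illustrative entries and is not a complete or verified mapping.
TICKER_NAMES = {
    "2382": "廣達",    # Quanta Computer
    "2308": "台達電",  # Delta Electronics
}

def title_matches(ticker, title):
    """True if a headline mentions the ticker or its company name."""
    name = TICKER_NAMES.get(ticker)
    return ticker in title or (name is not None and name in title)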

def fetch_goodinfo_data(ticker):
    """從 Goodinfo 抓取基本面資料"""
    # 簡化版：返回固定結構，實際可以解析網頁
    return {
        "pe_ratio": "N/A",
        "roe": "N/A", 
        "eps_growth": "N/A",
        "last_price": "N/A"
    }
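
# A minimal sketch of what a real Goodinfo fetch could look like. The URL
# pattern, the Chinese label strings, and the label-cell/value-cell layout are
# assumptions about goodinfo.tw's markup, not verified selectors; the site also
# tends to reject requests without a browser-like User-Agent.
def fetch_goodinfo_data_sketch(ticker):
    url = f"https://goodinfo.tw/tw/StockDetail.asp?STOCK_ID={ticker}"
    headers = {"User-Agent": "Mozilla/5.0"}
    try:
        resp = requests.get(url, headers=headers, timeout=10)
        resp.raise_for_status()
        soup = BeautifulSoup(resp.content, "html.parser")

        def value_after(label):
            # Assume each metric is a label cell followed by a value cell
            cell = soup.find("td", string=label)
            if cell and cell.find_next("td"):
                return cell.find_next("td").get_text(strip=True)
            return "N/A"

        return {
            "pe_ratio": value_after("本益比"),    # P/E ratio
            "roe": value_after("ROE"),
            "eps_growth": value_after("EPS成長"),  # assumed label
            "last_price": value_after("成交價"),   # last traded price
        }
    except requests.RequestException as e:
        print(f"Failed to fetch Goodinfo data for {ticker}: {e}")
        return fetch_goodinfo_data(ticker)  # fall back to the N/A placeholder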

def collect_all_data():
    """收集所有觀察名單的數據"""
    today = datetime.now().strftime('%Y-%m-%d')
    all_data = {
        "date": today,
        "stocks": {}
    }
    
    for ticker in WATCHLIST:
        print(f"🔍 收集 {ticker} 數據...")
        
        stock_data = {
            "ticker": ticker,
            "news": fetch_cnyes_news(ticker),
            "fundamentals": fetch_goodinfo_data(ticker),
            "collected_at": datetime.now().isoformat()
        }
        
        all_data["stocks"][ticker] = stock_data
    
    # Persist the raw data
    output_file = f"data/market-data/raw-{today}.json"
    os.makedirs(os.path.dirname(output_file), exist_ok=True)
    
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(all_data, f, ensure_ascii=False, indent=2)
    
    print(f"✅ 數據收集完成: {output_file}")
    return all_data
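
# The module docstring says Layer D feeds Haiku 3.5. A minimal sketch of that
# handoff, assuming the `anthropic` SDK is installed and ANTHROPIC_API_KEY is
# set in the environment; the prompt wording is illustrative, not taken from
# the original pipeline.
def summarize_with_haiku(all_data):
    import anthropic  # local import so the collector runs without the SDK
    client = anthropic.Anthropic()
    message = client.messages.create(
        model="claude-3-5-haiku-20241022",
        max_tokens=1024,
        messages=[{
            "role": "user",
            "content": "Summarize today's Taiwan stock news per ticker:\n"
                       + json.dumps(all_data, ensure_ascii=False),
        }],
    )
    return message.content[0].text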

if __name__ == "__main__":
    collect_all_data()