Skip to content

Browser 使用指南

模块路径: FQBase.Crawler.browser源码: [browser.py](file:///Users/A.D.189/FQuant/FQuant.Server/FQBase/FQBase/Crawler/browser.py)


一、基本使用

1.1 HTTP 请求模式

python
from FQBase.Crawler.browser import BaseCrawler, PageParser

class SimpleCrawler(BaseCrawler):
    def __init__(self):
        super().__init__(use_browser=False, delay=1.0)

    def crawl(self, url):
        html = self.fetch_url(url)
        return html

crawler = SimpleCrawler()
html = crawler.crawl('https://example.com')
crawler.close()

1.2 浏览器模式

python
from FQBase.Crawler.browser import BaseCrawler

class BrowserCrawler(BaseCrawler):
    def __init__(self):
        super().__init__(use_browser=True, timeout=120)

    def crawl(self, url):
        html = self.fetch_url_with_browser(url, wait_for='body')
        return html

with BrowserCrawler() as crawler:
    html = crawler.crawl('https://example.com')

二、上下文管理器

python
from FQBase.Crawler.browser import BaseCrawler

# 推荐:使用 with 语句自动关闭
with BaseCrawler(use_browser=True) as crawler:
    html = crawler.fetch_url_with_browser('https://example.com')
    # 自动调用 close()

# 等价于
crawler = BaseCrawler(use_browser=True)
try:
    html = crawler.fetch_url_with_browser('https://example.com')
finally:
    crawler.close()

三、页面解析

3.1 正则提取

python
from FQBase.Crawler.browser import PageParser

html = '<div class="title">Hello World</div>'

titles = PageParser.extract_by_regex(html, r'<div class="title">(.*?)</div>', group=1)
print(titles)  # ['Hello World']

3.2 CSS 选择器提取

python
html = '''
<div class="product">
    <span class="name">股票A</span>
    <span class="price">12.50</span>
</div>
'''

items = PageParser.extract_by_css(html, '.product', attrs=['class', 'name', 'price'])
print(items)
# [{'class': 'product', 'name': '股票A', 'price': '12.50'}]

3.3 表格提取

python
html = '''
<table>
    <tr><th>代码</th><th>名称</th></tr>
    <tr><td>000001</td><td>平安银行</td></tr>
    <tr><td>000002</td><td>万科A</td></tr>
</table>
'''

tables = PageParser.extract_tables(html)
print(tables)
# [[['代码', '名称'], ['000001', '平安银行'], ['000002', '万科A']]]

3.4 链接和图片

python
html = '''
<a href="/page/1">Page 1</a>
<a href="/page/2">Page 2</a>
<img src="/images/1.png"/>
'''

links = PageParser.extract_links(html, base_url='https://example.com')
print(links)
# ['https://example.com/page/1', 'https://example.com/page/2']

images = PageParser.extract_images(html, base_url='https://example.com')
print(images)
# ['https://example.com/images/1.png']

四、浏览器交互

4.1 等待并点击

python
from FQBase.Crawler.browser import BaseCrawler
from selenium.webdriver.common.by import By

crawler = BaseCrawler(use_browser=True)

crawler.fetch_url_with_browser('https://example.com')

crawler.wait_and_click('button.next-page', by=By.CSS_SELECTOR)

4.2 获取元素文本

python
title = crawler.get_element_text('h1.title', timeout=10, default='No Title')
print(title)

4.3 滚动到元素

python
crawler.scroll_to_element('div.loading-more')

五、BrowserPool

python
from FQBase.Crawler.browser import BrowserPool

pool = BrowserPool(max_browsers=3)

browser1 = pool.get_browser()
browser2 = pool.get_browser()
browser3 = pool.get_browser()
browser4 = pool.get_browser()  # 轮换回 browser1

pool.close_all()

六、量化交易场景

6.1 爬取股票列表

python
from FQBase.Crawler.browser import BaseCrawler, PageParser

class StockListCrawler(BaseCrawler):
    def __init__(self):
        super().__init__(use_browser=True, delay=2.0)

    def crawl_stock_list(self, exchange='SSE'):
        url = f'http://www.sse.com.cn/market/dealingdata/overview/c/{exchange.lower()}.shtml'
        html = self.fetch_url_with_browser(url, wait_for='table')

        tables = PageParser.extract_tables(html)
        if tables:
            header = tables[0][0]
            data = tables[0][1:]
            return {'header': header, 'data': data}
        return None

with StockListCrawler() as crawler:
    result = crawler.crawl_stock_list()
    print(result)

6.2 爬取财经新闻

python
class NewsCrawler(BaseCrawler):
    def __init__(self):
        super().__init__(use_browser=False, delay=1.0)

    def crawl_news(self, keyword='量化交易'):
        url = f'https://example.com/search?q={keyword}'
        html = self.fetch_url(url)

        titles = PageParser.extract_by_regex(
            html,
            r'<h3 class="news-title">(.*?)</h3>',
            group=1
        )

        links = PageParser.extract_links(html, pattern=r'/news/\d+')

        return [{'title': t, 'link': l} for t, l in zip(titles, links)]

with NewsCrawler() as crawler:
    news = crawler.crawl_news('AI')
    for item in news:
        print(item['title'], item['link'])

6.3 批量爬取财务报表

python
class FinanceCrawler(BaseCrawler):
    def __init__(self):
        super().__init__(use_browser=True, delay=3.0)

    def crawl_financial_report(self, stock_code):
        url = f'https://example.com/finance/{stock_code}'
        html = self.fetch_url_with_browser(url, wait_for='#financial-table')

        data = PageParser.extract_by_css(
            html,
            '#financial-table tr',
            attrs=['class', 'text']
        )

        return {'code': stock_code, 'data': data}

    def batch_crawl(self, codes):
        results = []
        for code in codes:
            try:
                result = self.crawl_financial_report(code)
                results.append(result)
            except Exception as e:
                print(f'Failed to crawl {code}: {e}')
        return results

with FinanceCrawler() as crawler:
    results = crawler.batch_crawl(['000001', '000002', '000004'])

七、完整示例

python
from FQBase.Crawler.browser import BaseCrawler, PageParser

class StockPriceCrawler(BaseCrawler):
    def __init__(self):
        super().__init__(
            use_browser=True,
            timeout=60,
            delay=2.0,
        )

    def get_stock_price(self, code):
        url = f'https://example.com/stock/{code}'
        html = self.fetch_url_with_browser(url, wait_for='.stock-price')

        price = self.get_element_text('.stock-price .price', default='N/A')
        change = self.get_element_text('.stock-price .change', default='N/A')

        return {
            'code': code,
            'price': price,
            'change': change,
        }

if __name__ == '__main__':
    with StockPriceCrawler() as crawler:
        stock = crawler.get_stock_price('000001')
        print(stock)