Today, we're going to build a script that scrapes Twitter to gather stock ticker symbols. We'll use those symbols to scrape Yahoo Finance for stock options data. To ensure we can download all the options data, we'll make each web request through a rotating pool of TOR proxies. In the end, we'll do some Pandas magic to pull the first out of the money call contract for each symbol into the final watchlist.
You should be familiar with Docker and Python 3.8, know how to set up a virtual environment with poetry, and be comfortable working with DataFrames. It also helps to know a little bit about stock options and TOR.
You will need all the above requirements properly installed to follow along with this tutorial. The script may work on another OS, but I've only tested it on Linux.
Let's prepare the virtual environment using poetry. Open the terminal and run the following commands.
$ poetry new options_bot
$ cd options_bot
.
├── options_bot
│   └── __init__.py
├── pyproject.toml
├── README.rst
└── tests
    ├── __init__.py
    └── test_options_bot.py

2 directories, 5 files
You will have a directory tree that looks like the one above. The options_bot and tests directories won't be used; poetry will only be used to manage the dependencies and virtual environment.
$ poetry add nitter-scraper requests-whaor yfs pandas
$ poetry shell
$ touch main.py
Open main.py and add the following imports.
from concurrent.futures import as_completed, ThreadPoolExecutor
from nitter_scraper import NitterScraper
import pandas
from requests_whaor import RequestsWhaor
from yfs import fuzzy_search, get_options_page
We'll be scraping the @eWhispers Twitter account for cashtags. A cashtag is similar to a hashtag, but it begins with a $ and is normally associated with a stock ticker symbol. @eWhispers tweets about upcoming stock earnings and averages about 30 to 40 cashtags per tweet. This should give you a ton of stock ticker symbols to play with.
cashtag_list = []

with NitterScraper(port=8008) as nitter:
    for tweet in nitter.get_tweets("eWhispers", pages=1):
        if tweet.is_pinned:
            continue

        if tweet.is_retweet:
            continue

        if tweet.entries.cashtags:
            cashtag_list += tweet.entries.cashtags

        print(".", end="", flush=True)  # Simple progress bar.

print()  # End progress bar with newline.

cashtag_list = sorted(set(map(lambda cashtag: cashtag.replace("$", "").strip(), cashtag_list)))
cashtag_list = []
The cashtag_list will hold all the cashtags found in the @eWhispers tweets.
with NitterScraper(port=8008) as nitter:
The context manager starts the nitter Docker container and returns a nitter object. Port 8008 ensures we start the Docker container on a unique port.
for tweet in nitter.get_tweets("eWhispers", pages=1):
Here we use the get_tweets method to scrape tweets. We only want to search the first page. Each page will yield approximately 20 tweets.
if tweet.is_pinned:
    continue

if tweet.is_retweet:
    continue

if tweet.entries.cashtags:
    cashtag_list += tweet.entries.cashtags
Skip the pinned tweet and any retweets. Then check if the tweet's content contains cashtags. If a list of cashtags is found, they are added to the cashtag_list.
print(".", end="", flush=True) # Simple progress bar.
print() # End progress bar with newline.
This will print a simple progress bar to keep us from getting bored.
cashtag_list = sorted(set(map(lambda cashtag: cashtag.replace("$", "").strip(), cashtag_list)))
Now we sort, remove duplicates, and strip dollar signs ($) from each cashtag. This leaves us with a clean list of ticker symbols, almost ready for searching option data.
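If you're curious what that one-liner does, here's a quick standalone demonstration on a few made-up cashtags (the sample values are hypothetical):

# Sample cashtags with dollar signs, stray whitespace, and a duplicate.
sample = ["$AAPL", "$TSLA ", "$AAPL", " $MSFT"]
cleaned = sorted(set(map(lambda cashtag: cashtag.replace("$", "").strip(), sample)))
print(cleaned)  # ['AAPL', 'MSFT', 'TSLA']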
valid_symbols = []
call_chains = []
MAX_THREADS = 6
MAX_PROXIES = 6
valid_symbols = []
Before downloading options data, we'll validate each ticker against Yahoo Finance's quote lookup. The yfs library provides the fuzzy_search function, which uses the quote lookup to verify each symbol is a US stock ticker. After we verify a symbol exists and is a US stock symbol, we append it to the valid_symbols list.
This is the quote lookup fuzzy_search uses to validate symbols.
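As a quick illustration, here is a minimal synchronous lookup sketch. It assumes the session argument is optional and relies only on the behavior the main script uses: a falsy return for unknown symbols and a .symbol attribute on valid results.

from yfs import fuzzy_search

result = fuzzy_search("AAPL")  # Assumes session is optional; returns a falsy value if nothing is found.

if result:
    print(result.symbol)  # AAPL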
call_chains = []
The call option chain objects found from valid symbols are stored in the call_chains list.
MAX_THREADS = 6
MAX_THREADS is the maximum number of threads the ThreadPoolExecutor and RequestsWhaor are allowed to use. RequestsWhaor can use threads to speed up starting and stopping Docker containers.
MAX_PROXIES is the size of the rotating proxy pool. Each proxy is a separate Docker container running a TOR circuit.
You can modify the MAX_THREADS and MAX_PROXIES variables to fit your system's performance.
with RequestsWhaor(onion_count=MAX_PROXIES, max_threads=MAX_THREADS) as request_whaor:
    with ThreadPoolExecutor(max_workers=MAX_THREADS) as executor:
        futures = [
            executor.submit(fuzzy_search, ticker, session=request_whaor)
            for ticker in cashtag_list
        ]

        for future in as_completed(futures):
            try:
                result = future.result(timeout=60)  # Timeout if the response takes too long.

                if result:
                    valid_symbols.append(result.symbol)

                print(".", end="", flush=True)  # Simple progress bar.

            except Exception as exc:
                # We want to pass on exceptions.
                print("\n", exc)

        print()  # End progress bar with newline.

        print("twitter cashtag count:", len(cashtag_list))
        print("validated symbol count:", len(valid_symbols))

        request_whaor.restart_onions()  # Fresh proxy pool.

        futures = [
            executor.submit(
                get_options_page,
                ticker,
                after_days=60,
                first_chain=True,
                use_fuzzy_search=False,
                session=request_whaor,
                page_not_found_ok=True,
            )
            for ticker in valid_symbols
        ]

        for future in as_completed(futures):
            try:
                result = future.result(timeout=60)  # Timeout if the response takes too long.

                if result:
                    call_chains.append(result.calls)

                print(".", end="", flush=True)  # Simple progress bar.

            except Exception as exc:
                # We want to pass on exceptions.
                print("\n", exc)

        print()  # End progress bar with newline.
with RequestsWhaor(onion_count=MAX_PROXIES, max_threads=MAX_THREADS) as request_whaor:
The context manager takes care of starting up the rotating proxy network. In this example, only two arguments are passed. MAX_PROXIES is passed to the onion_count parameter, which sets the number of Docker containers running TOR circuits. MAX_THREADS determines how many threads will be used to start and stop those containers asynchronously. We will use the yielded request_whaor object as a requests-like session, giving us a fresh pool of proxies to send requests through.
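Here's a minimal standalone sketch of the proxy pool on its own. It assumes the yielded object exposes a requests-style get method (which is how it's being used as a session here) and that max_threads has a sensible default; the test URL is just an example.

from requests_whaor import RequestsWhaor

# Spin up two TOR proxy containers and route one request through the pool.
with RequestsWhaor(onion_count=2) as request_whaor:
    response = request_whaor.get("http://httpbin.org/ip")  # Example endpoint that echoes your IP.
    print(response.text)  # Should show a TOR exit node's IP, not your own.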
with ThreadPoolExecutor(max_workers=MAX_THREADS) as executor:
The ThreadPoolExecutor is used to execute the fuzzy_search and get_options_page functions asynchronously.
futures = [
    executor.submit(fuzzy_search, ticker, session=request_whaor)
    for ticker in cashtag_list
]
A list comprehension is used to iterate over each ticker in the cashtag_list. Each ticker is passed to the fuzzy_search function as an argument. Additionally, we pass request_whaor as the session argument. This lets fuzzy_search send GET requests through requests_whaor instead of the vanilla requests module. requests_whaor will ensure requests are retried on failed responses and on connection, timeout, and proxy errors.
The executor's submit method takes care of scheduling the fuzzy_search function and returns a Future object. You can read more about Future objects in the Python concurrent.futures documentation.
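If the submit/Future pattern is new to you, here's a self-contained toy example of the same pattern using only the standard library:

from concurrent.futures import ThreadPoolExecutor, as_completed

def square(number):
    return number * number

with ThreadPoolExecutor(max_workers=2) as executor:
    futures = [executor.submit(square, number) for number in range(5)]

    for future in as_completed(futures):
        print(future.result())  # Results arrive in completion order, not submission order.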
for future in as_completed(futures):
    try:
        result = future.result(timeout=60)  # Timeout if the response takes too long.

        if result:
            valid_symbols.append(result.symbol)

        print(".", end="", flush=True)  # Simple progress bar.

    except Exception as exc:
        # We want to pass on exceptions.
        print("\n", exc)

print()  # End progress bar with newline.
We use the as_completed function to iterate over the returned futures as they complete. The result method is called on each future to get the returned value. If a valid result is returned, it is appended to the valid_symbols list.
We pass on all Exceptions to keep things running. Also, we use the same progress bar pattern as we did when we scraped tweets.
print("twitter cashtag count:", len(cashtag_list))
print("validated symbol count:", len(valid_symbols))
Here we print the count of cashtags found on Twitter and the count of symbols that passed fuzzy_search validation, so we can compare them.
request_whaor.restart_onions() # Fresh proxy pool.
Now that we have made 100-plus requests to Yahoo Finance's servers, we'll want a fresh pool of proxies before making the next round of requests. We'll use the restart_onions method to get a fresh pool of proxies to route the next round of requests through.
futures = [
    executor.submit(
        get_options_page,
        ticker,
        after_days=60,
        first_chain=True,
        use_fuzzy_search=False,
        session=request_whaor,
        page_not_found_ok=True,
    )
    for ticker in valid_symbols
]
This is similar to the fuzzy_search futures section. We are just passing the get_options_page function to the executor along with a few more arguments, and this time we iterate over the tickers in the valid_symbols list. Let's go over each argument: after_days=60 filters for option chains expiring at least 60 days out; first_chain=True returns only the first chain that matches; use_fuzzy_search=False skips symbol validation, since we already validated each ticker with fuzzy_search; session=request_whaor routes requests through the rotating proxy pool; and page_not_found_ok=True returns None instead of raising an error when a symbol has no options page.
for future in as_completed(futures):
    try:
        result = future.result(timeout=60)  # Timeout if the response takes too long.

        if result:
            call_chains.append(result.calls)

        print(".", end="", flush=True)  # Simple progress bar.

    except Exception as exc:
        # We want to pass on exceptions.
        print("\n", exc)

print()  # End progress bar with newline.
Again, this is pretty similar to the fuzzy_search section. We iterate over the returned futures as they complete and call the result method to get the values. This time, get_options_page returns an object containing both the call and put options data. After checking that the result exists, we append only the calls to the call_chains list. As before, we pass on any exceptions.
Pandas Magic
options_watchlist = []

for chain in call_chains:
    dataframe = chain.dataframe
    otm = dataframe["in_the_money"] == False
    single_contract = dataframe[otm].head(1)
    options_watchlist.append(single_contract)

final = pandas.concat(options_watchlist, ignore_index=True)

final["expiration"] = final["expiration_date"].dt.date

final.sort_values(by="implied_volatility", inplace=True)

final.reset_index(inplace=True)

final.drop(
    columns=["index", "timestamp", "contract_name", "expiration_date", "in_the_money"],
    inplace=True,
)

print(final)
That's the code for the final section, where we use Pandas to clean, concatenate, and sort the final DataFrame. Let's break it down.
options_watchlist = []
Now we have a bunch of call option chains from multiple symbols. We will store the first out of the money strike from each option chain in the options_watchlist.
for chain in call_chains:
    dataframe = chain.dataframe
    otm = dataframe["in_the_money"] == False
    single_contract = dataframe[otm].head(1)
    options_watchlist.append(single_contract)
Next, we iterate over each call option chain and convert each into a DataFrame using the dataframe property. Then we filter for rows where the in_the_money column is False and use the DataFrame's head method to get the first OTM contract. After that, the single-row OTM contract DataFrame is appended to the options_watchlist.
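Here's the same filter applied to a toy chain with made-up values, to make the boolean mask concrete:

import pandas

# Hypothetical call chain: two in-the-money strikes, two out-of-the-money strikes.
chain = pandas.DataFrame(
    {"strike": [95.0, 100.0, 105.0, 110.0], "in_the_money": [True, True, False, False]}
)

otm = chain["in_the_money"] == False
print(chain[otm].head(1))  # First out of the money row (strike 105.0).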
final = pandas.concat(options_watchlist, ignore_index=True)
We use the Pandas concat function to concatenate the options_watchlist of single-row DataFrames into one DataFrame named final.
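For example, concatenating two single-row frames with ignore_index=True produces one frame with a fresh 0..n index:

import pandas

first = pandas.DataFrame({"symbol": ["AAA"], "strike": [10.0]})
second = pandas.DataFrame({"symbol": ["BBB"], "strike": [20.0]})

print(pandas.concat([first, second], ignore_index=True))
#   symbol  strike
# 0    AAA    10.0
# 1    BBB    20.0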
final["expiration"] = final["expiration_date"].dt.date
Here we convert the expiration_date column from a full datetime to a plain date, stored in a new column named expiration. This will help shorten up the output.
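The .dt.date accessor pulls just the calendar date out of a datetime column, for example:

import pandas

timestamps = pandas.Series(pandas.to_datetime(["2020-12-18 16:00:00"]))
print(timestamps.dt.date[0])  # 2020-12-18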
final.sort_values(by="implied_volatility", inplace=True)
Now, we sort by implied_volatility because why not. ¯\_(ツ)_/¯
final.reset_index(inplace=True)
final.drop(
    columns=["index", "timestamp", "contract_name", "expiration_date", "in_the_money"],
    inplace=True,
)
We drop a few columns to shorten up the output some more.
print(final)
Run the script
$ python3 main.py
symbol contract_type strike last_price bid ask change percent_change volume open_interest implied_volatility expiration
0 CMTL call 17.5 1.50 0.00 0.00 0.00 None 6.0 318.0 3.13 2021-01-15
1 ONB call 15.0 0.75 0.00 0.00 0.00 NaN 19.0 519.0 6.25 2020-12-18
2 VZ call 60.0 1.30 1.27 1.32 -0.10 -7.14 414.0 35068.0 18.34 2021-01-15
3 PG call 145.0 4.60 4.15 4.50 0.10 2.22 70.0 1014.0 20.01 2020-12-18
4 JNJ call 150.0 3.92 3.90 4.05 0.17 4.53 73.0 3615.0 20.01 2020-12-18
.. ... ... ... ... ... ... ... ... ... ... ... ...
148 ACB call 5.0 0.45 0.45 0.46 -0.15 -25 342.0 1935.0 116.41 2020-12-18
149 QTT call 2.5 0.40 0.20 0.75 -0.10 -20 17.0 100.0 117.19 2021-01-15
150 LLNW call 6.0 1.25 1.20 1.25 -0.10 -7.41 49.0 2775.0 125.78 2020-12-18
151 SANW call 2.5 0.55 0.35 1.55 0.00 None 1.0 6.0 193.75 2021-02-19
152 BCLI call 15.0 6.25 5.70 6.50 -0.65 -9.42 11.0 918.0 296.00 2020-12-18
[153 rows x 12 columns]
I hope you had fun writing the script. It would be effortless to modify it to search multiple Twitter users or dump the option data to a database periodically. Thanks for reading.
CONTACT INFO
Discord: @dgnsrekt
Here's the complete main.py for reference.

from concurrent.futures import as_completed, ThreadPoolExecutor

from nitter_scraper import NitterScraper
import pandas
from requests_whaor import RequestsWhaor
from yfs import fuzzy_search, get_options_page

cashtag_list = []

with NitterScraper(port=8008) as nitter:
    for tweet in nitter.get_tweets("eWhispers", pages=1):
        if tweet.is_pinned:
            continue

        if tweet.is_retweet:
            continue

        if tweet.entries.cashtags:
            cashtag_list += tweet.entries.cashtags

        print(".", end="", flush=True)  # Simple progress bar.

print()  # End progress bar with newline.

cashtag_list = sorted(set(map(lambda cashtag: cashtag.replace("$", "").strip(), cashtag_list)))

valid_symbols = []
call_chains = []

MAX_THREADS = 6
MAX_PROXIES = 6

with RequestsWhaor(onion_count=MAX_PROXIES, max_threads=MAX_THREADS) as request_whaor:
    with ThreadPoolExecutor(max_workers=MAX_THREADS) as executor:
        futures = [
            executor.submit(fuzzy_search, ticker, session=request_whaor)
            for ticker in cashtag_list
        ]

        for future in as_completed(futures):
            try:
                result = future.result(timeout=60)  # Timeout if the response takes too long.

                if result:
                    valid_symbols.append(result.symbol)

                print(".", end="", flush=True)  # Simple progress bar.

            except Exception as exc:
                # We want to pass on exceptions.
                print("\n", exc)

        print()  # End progress bar with newline.

        print("twitter cashtag count:", len(cashtag_list))
        print("validated symbol count:", len(valid_symbols))

        request_whaor.restart_onions()  # Fresh proxy pool.

        futures = [
            executor.submit(
                get_options_page,
                ticker,
                after_days=60,
                first_chain=True,
                use_fuzzy_search=False,
                session=request_whaor,
                page_not_found_ok=True,
            )
            for ticker in valid_symbols
        ]

        for future in as_completed(futures):
            try:
                result = future.result(timeout=60)  # Timeout if the response takes too long.

                if result:
                    call_chains.append(result.calls)

                print(".", end="", flush=True)  # Simple progress bar.

            except Exception as exc:
                # We want to pass on exceptions.
                print("\n", exc)

        print()  # End progress bar with newline.

options_watchlist = []

for chain in call_chains:
    dataframe = chain.dataframe
    otm = dataframe["in_the_money"] == False
    single_contract = dataframe[otm].head(1)
    options_watchlist.append(single_contract)

final = pandas.concat(options_watchlist, ignore_index=True)

final["expiration"] = final["expiration_date"].dt.date

final.sort_values(by="implied_volatility", inplace=True)

final.reset_index(inplace=True)

final.drop(
    columns=["index", "timestamp", "contract_name", "expiration_date", "in_the_money"],
    inplace=True,
)

print(final)