#!/usr/bin/env python3
"""Scrape a webpage for emails or links, or both."""

import argparse
import re
import sys
import urllib.request

# Double-quoted absolute http(s)/ftp(s) URLs.  Group 1 is the whole URL and
# group 2 the scheme without its optional trailing "s"; both groups are kept
# so extract_links() continues to return (url, scheme) tuples.
_LINK_RE = re.compile(r'"((http|ftp)s?://.*?)"')

# Pragmatic (not RFC-complete) address pattern.  Commas are deliberately not
# part of either character class, so comma-separated address lists split into
# individual addresses; "+" and "-" are accepted so tagged and hyphenated
# addresses/domains are matched in full.
_EMAIL_RE = re.compile(r"[\w.+-]+@[\w.-]+\.\w+")


def get(url):
    """Fetch *url* and return the response body as text.

    The body is decoded with the charset announced in the response's
    Content-Type header, falling back to UTF-8 when none is announced or the
    announced name is not a codec Python knows.

    Raises:
        OSError: the URL could not be opened or read (includes
            urllib.error.URLError and HTTPError).
        ValueError: the URL is malformed, or the body is not valid in the
            selected charset (UnicodeDecodeError).
    """
    # The context manager closes the response even if read() raises.
    with urllib.request.urlopen(url) as page:
        raw = page.read()
        charset = page.headers.get_content_charset() or "utf-8"
    try:
        return raw.decode(charset)
    except LookupError:
        # The server announced a charset name with no matching codec.
        return raw.decode("utf-8")


def extract_links(content):
    """Return every double-quoted http(s)/ftp(s) URL found in *content*.

    Each result is a ``(url, scheme)`` tuple, where ``scheme`` is ``"http"``
    or ``"ftp"`` (any trailing "s" is not included); the URL itself is
    element 0.  Results are in order of appearance and may contain
    duplicates.
    """
    return _LINK_RE.findall(content)


def extract_emails(content):
    """Return every email-like string found in *content*.

    Results are plain strings in order of appearance and may contain
    duplicates.
    """
    return _EMAIL_RE.findall(content)


def _report(label, items):
    """Print a count line for *items* followed by one line per item."""
    if not items:
        print("found no {}".format(label))
        return
    print("found {} {}:".format(len(items), label))
    for item in items:
        print(" {}".format(item))


def main(argv=None):
    """Run the command-line interface and return a process exit status.

    *argv* is the argument list without the program name; ``None`` makes
    argparse read ``sys.argv[1:]``.  Returns 0 on success and 1 when the
    page could not be fetched or decoded.  Invalid usage exits through
    argparse (``SystemExit`` with status 2).
    """
    cli = argparse.ArgumentParser(description=__doc__)
    cli.add_argument("url", metavar="URL", type=str, help="url to fetch")
    cli.add_argument("-e", "--emails", action="store_true", help="get emails")
    cli.add_argument("-l", "--links", action="store_true", help="get links")
    args = cli.parse_args(argv)
    if not (args.emails or args.links):
        cli.error("must specify either -e or -l")

    try:
        content = get(args.url)
    except (OSError, ValueError) as err:
        # Report fetch/decode problems as a one-line message, not a traceback.
        print("failed to fetch {}: {}".format(args.url, err), file=sys.stderr)
        return 1

    if args.emails:
        _report("emails", extract_emails(content))
    if args.links:
        # extract_links() yields (url, scheme) tuples; only the URL is shown.
        _report("links", [link[0] for link in extract_links(content)])
    return 0


if __name__ == "__main__":
    sys.exit(main())