I have the following code below I am working on, in order to get the relative links to absolute links, duplicate links out and unique links as my output result into CSV file:-
import requests
from bs4 import BeautifulSoup
import csv
page = '
https://www.census.gov/programs-surveys/popest.html'
r = requests.get(page)
raw_html = r.text
soup = BeautifulSoup(raw_html, 'html.parser')
links = soup.find_all("a")
print('Number of links retrieved: ', len (links))
Myset = set()
for link in links:
hrefs = str(link.get("href"))
if hrefs.startswith('None'):
elif hrefs.startswith("#http"):
Myset.add(hrefs[1:])
elif hrefs.startswith('#'):
elif hrefs.startswith('/'):
Myset.add('
https://www.census.gov/programs-surveys/popest.html' + hrefs)
elif hrefs.endswith('.gov'):
Myset.add(hrefs + '/')
else:
Myset.add(hrefs)
f = open('Mytest.csv', 'w')
writer = csv.writer(f, delimiter='', lineterminator='\r')
Mylist = ()
ctr = 0
for x in Myset:
Mylist.append(x)
if not Mylist:
else:
writer.writerow(Mylist)
del Mylist(:)
ctr = 1
But I kept getting errors message like this ones below:
File "<tokenize>", line 6
elif hrefs.startswith("#http"):
^
IndentationError: unindent does not match any outer indentation level
&
File "<ipython-input-14-24882d8efa93>", line 5
elif hrefs.startswith("#http"):
^
SyntaxError: invalid syntax
I am stuck and getting frustrated.