from bs4 import BeautifulSoup
from urllib.request import urlretrieve
from urllib.parse import urlparse, urljoin
from os import makedirs
import os.path, time, re
proc_files = {}  # records files that have already been processed

def enum_links(html, base):
    # collect <link rel="stylesheet"> and <a href> tags and return their URLs as absolute links
    soup = BeautifulSoup(html, "html.parser")
    print("soup = ", soup)
    links = soup.select("link[rel='stylesheet']")  # CSS links
    print("links = ", links)
    links += soup.select("a[href]")                # anchor links
    print("links+ = ", links)
    result = []
    for a in links:
        # extract the href attribute and convert the link to an absolute URL
        href = a.attrs['href']
        url = urljoin(base, href)
        result.append(url)
    return result
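# Note: urljoin() is what turns a relative href into an absolute URL. For example,
# with a hypothetical relative link (interpreter session shown as comments):
#
#   >>> from urllib.parse import urljoin
#   >>> urljoin("https://docs.python.org/3.5/library/", "functions.html")
#   'https://docs.python.org/3.5/library/functions.html'
#   >>> urljoin("https://docs.python.org/3.5/library/", "../_static/classic.css")
#   'https://docs.python.org/3.5/_static/classic.css'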
def download_file(url):
    # map the URL onto a local path under the current directory
    o = urlparse(url)
    print("o = ", o)
    savepath = "./" + o.netloc + o.path
    if re.search(r"/$", savepath):      # a directory URL is saved as index.html
        savepath += "index.html"
    savedir = os.path.dirname(savepath)
    if os.path.exists(savepath):        # already downloaded, nothing to do
        return savepath
    if not os.path.exists(savedir):     # create the folder to download into
        print("mkdir = ", savedir)
        makedirs(savedir)
    try:                                # download the file
        print("download = ", url)
        urlretrieve(url, savepath)
        time.sleep(1)                   # pause between requests
        return savepath
    except Exception:
        print("download fail : ", url)
        return None
def analyze_html(url, root_url):
    # download the page, analyze its HTML, and fetch the resources it links to
    savepath = download_file(url)
    if savepath is None:
        return
    if savepath in proc_files:          # skip pages that were already processed
        return
    proc_files[savepath] = True
    print("analyze_html =", url)
    with open(savepath, "r", encoding="utf-8") as f:
        html = f.read()
    links = enum_links(html, url)
    for link_url in links:
        if link_url.find(root_url) != 0:
            # outside the root URL: only CSS files are worth fetching
            if not re.search(r"\.css$", link_url):
                continue
        if re.search(r"\.(html|htm)$", link_url):
            analyze_html(link_url, root_url)  # recurse into HTML pages
            continue
        download_file(link_url)             # everything else is downloaded as-is
if __name__ == "__main__":
    url = "https://docs.python.org/3.5/library/"
    analyze_html(url, url)
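# Note: download_file() mirrors each URL into a matching local path. With the start
# URL above, urlparse() gives netloc "docs.python.org" and path "/3.5/library/", so
# the page itself is saved as ./docs.python.org/3.5/library/index.html (the trailing
# "/" becomes "index.html"), relative to the directory the script is run from.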