
Web Development

[Python] Web Image Download

§무명소졸§ 2014. 3. 31. 17:28

This is a Python script that downloads the images posted on a website to your PC.

(It has a lot of bugs +_+;)


import requests
import re
import sys
 
 
def getHttpData(urls):
    # Resolve a relative URL against the starting URL, then fetch the page HTML.
    if urls.startswith("/"):
        realurl = requrl + urls
    else:
        realurl = urls
    if realurl not in duplicate_chk_url:  # skip pages we already visited, to avoid an infinite loop
        returnTxt = requests.get(realurl).text
        duplicate_chk_url.append(realurl)
        print(realurl)
        downloadImage(realurl)
    else:
        returnTxt = ""
    return returnTxt
 
 
def getUrlList(urls):
    # Collect every href/src link on the page so the crawl can follow them.
    urlList = []
    htmlData = getHttpData(urls)
    for msgt in htmlData.split("\n"):
        result = filterRegex.match(msgt)
        if result is not None:
            urlList.append(result.group(1))

    return urlList
 
 
def downloadImage(requrl):
    # Find every <img src="..."> on the page and save each image under c:\test.
    resulthtml = requests.get(requrl).text
    prog = re.compile(r'.*<img.* src="(.*?)("|\?).*')
    imglist = []
    for msgt in resulthtml.split("\n"):
        result = prog.match(msgt)
        if result is not None:
            imglist.append(result.group(1))
    for imgurl in imglist:
        if not imgurl.startswith('http'):
            # Relative image path: prepend the scheme and host of the page being crawled.
            m = re.search(r"https?://.*?/", requrl)
            imgurl = m.group(0) + imgurl

        print(imgurl)
        name = imgurl.split('/')[-1]
        resp = requests.get(imgurl, stream=True)
        with open("c:\\test\\" + name, 'wb') as f:
            for chunk in resp.iter_content(1024):
                f.write(chunk)
 
 
def rec(listparam):
    # Recursively crawl each collected URL until MAX_LIMIT pages have been visited.
    if len(duplicate_chk_url) < MAX_LIMIT:
        for ssurl in listparam:
            rec(getUrlList(ssurl))
    else:
        print("#####  REACHED MAX VALUE ######")
        sys.exit()
 
 
'''end of function definitions'''
filterRegex = re.compile('.*(?:(?<=href=")|(?<=src="))((http|/).*?)(?=").*', re.IGNORECASE)
MAX_LIMIT = 50  # maximum number of URLs to collect
duplicate_chk_url = []
requrl = input("input url: ")
print(" " * 30)
print(" " * 30)
print(" " * 30)
print(" >>>>>>>>>>>>>> url crawling start!!!!!!")
print(" " * 30)
print(" " * 30)
print(" " * 30)
uList = getUrlList(requrl)
rec(uList)
# print(str(duplicate_chk_url))
 
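One source of the bugs mentioned above is the line-by-line regex matching: it misses <img> tags that span multiple lines, use single quotes, or put other attributes before src. As a rough alternative sketch (not part of the original script; the class and helper names here are hypothetical), the image URLs could instead be pulled out with the standard library's html.parser and joined with urljoin:

from html.parser import HTMLParser
from urllib.parse import urljoin

import requests


class ImgSrcParser(HTMLParser):
    # Collects the src attribute of every <img> tag, regardless of line breaks or quoting.
    def __init__(self):
        super().__init__()
        self.img_urls = []

    def handle_starttag(self, tag, attrs):
        if tag == "img":
            for name, value in attrs:
                if name == "src" and value:
                    self.img_urls.append(value)


def extract_image_urls(page_url):
    # Hypothetical helper: fetch a page and return the absolute URLs of all images on it.
    parser = ImgSrcParser()
    parser.feed(requests.get(page_url).text)
    return [urljoin(page_url, src) for src in parser.img_urls]

Since urljoin handles both absolute and relative src values, the manual http:// prefixing done in downloadImage would no longer be needed.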

