diff options
author | Corentin Chary <corentin.chary@gmail.com> | 2012-09-15 23:20:30 +0200 |
---|---|---|
committer | Corentin Chary <corentin.chary@gmail.com> | 2012-09-15 23:21:30 +0200 |
commit | 9afb31fc65934e387cf93efb2b275946fa254e91 (patch) | |
tree | 1cda09864bba6747e246a0e1eecbad1272674934 /pym | |
parent | euscan: update TODO (diff) | |
download | euscan-9afb31fc65934e387cf93efb2b275946fa254e91.tar.gz euscan-9afb31fc65934e387cf93efb2b275946fa254e91.tar.bz2 euscan-9afb31fc65934e387cf93efb2b275946fa254e91.zip |
euscan: fix #435118 and #435120
Signed-off-by: Corentin Chary <corentin.chary@gmail.com>
Diffstat (limited to 'pym')
-rw-r--r-- | pym/euscan/__init__.py | 11 |
-rw-r--r-- | pym/euscan/handlers/generic.py | 1 |
-rw-r--r-- | pym/euscan/helpers.py | 27 |
3 files changed, 21 insertions, 18 deletions
```diff
diff --git a/pym/euscan/__init__.py b/pym/euscan/__init__.py
index 12c4a16..946c63c 100644
--- a/pym/euscan/__init__.py
+++ b/pym/euscan/__init__.py
@@ -77,11 +77,12 @@ ROBOTS_TXT_BLACKLIST_DOMAINS = [
     '(.*)sourceforge(.*)',
     '(.*)github.com',
     '(.*)berlios(.*)',
-    '(.*)qt.nokia.com(.*)',
-    '(.*)chromium.org(.*)',
-    '(.*)nodejs.org(.*)',
-    '(.*)download.mono-project.com(.*)',
-    '(.*)fedorahosted.org(.*)',
+    '(.*)qt\.nokia\.com(.*)',
+    '(.*)chromium\.org(.*)',
+    '(.*)nodejs\.org(.*)',
+    '(.*)download\.mono-project\.com(.*)',
+    '(.*)fedorahosted\.org(.*)',
+    '(.*)download\.tuxfamily\.org(.*)',
 ]
 
 from out import EuscanOutput
diff --git a/pym/euscan/handlers/generic.py b/pym/euscan/handlers/generic.py
index fd82c71..0795488 100644
--- a/pym/euscan/handlers/generic.py
+++ b/pym/euscan/handlers/generic.py
@@ -70,6 +70,7 @@ def scan_html(data, url, pattern):
                  match.group(0))
             )
 
+
     return results
 
 
diff --git a/pym/euscan/helpers.py b/pym/euscan/helpers.py
index 5e3e6ea..3271811 100644
--- a/pym/euscan/helpers.py
+++ b/pym/euscan/helpers.py
@@ -373,27 +373,28 @@ def tryurl(fileurl, template):
 
 def regex_from_template(template):
     # Escape
-    template = re.escape(template)
+    regexp = re.escape(template)
 
     # Unescape specific stuff
-    template = template.replace('\$\{', '${')
-    template = template.replace('\}', '}')
-    template = template.replace('}\.$', '}.$')
+    regexp = regexp.replace('\$\{', '${')
+    regexp = regexp.replace('\}', '}')
+    regexp = regexp.replace('}\.$', '}.$')
 
     # Replace ${\d+}
-    #template = template.replace('${0}', r'([\d]+?)')
-    template = re.sub(r'(\$\{\d+\}(\.?))+', r'([\w\.]+?)', template)
+    #regexp = regexp.replace('${0}', r'([\d]+?)')
+    regexp = re.sub(r'(\$\{\d+\}(\.?))+', r'([\w\.]+?)', regexp)
 
-    #template = re.sub(r'(\$\{\d+\}\.?)+', r'([\w]+?)', template)
-    #template = re.sub(r'(\$\{\d+\}\.+)+', '(.+?)\.', template)
-    #template = re.sub(r'(\$\{\d+\})+', '(.+?)', template)
+    #regexp = re.sub(r'(\$\{\d+\}\.?)+', r'([\w]+?)', regexp)
+    #regexp = re.sub(r'(\$\{\d+\}\.+)+', '(.+?)\.', regexp)
+    #regexp = re.sub(r'(\$\{\d+\})+', '(.+?)', regexp)
 
     # Full version
-    template = template.replace('${PV}', _v)
+    regexp = regexp.replace('${PV}', _v)
 
     # End
-    template = template + r'/?$'
-    return template
+    regexp = regexp + r'/?$'
+
+    return regexp
 
 
 def basedir_from_template(template):
@@ -417,7 +418,7 @@ def generate_scan_paths(url):
     path = prefix + ":/"
     for chunk in chunks:
         if '${' in chunk:
-            steps.append((path, regex_from_template(chunk)))
+            steps.append((path, '^(?:|.*/)' + regex_from_template(chunk)))
             path = ""
         else:
             path += "/"
```