#!/usr/bin/python3

#	cve-manager : CVE management tool
#	Copyright (C) 2017-2026 Alexey Appolonov
#
#	This program is free software: you can redistribute it and/or modify
#	it under the terms of the GNU General Public License as published by
#	the Free Software Foundation, either version 3 of the License, or
#	(at your option) any later version.
#
#	This program is distributed in the hope that it will be useful,
#	but WITHOUT ANY WARRANTY; without even the implied warranty of
#	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#	GNU General Public License for more details.
#
#	You should have received a copy of the GNU General Public License
#	along with this program.  If not, see <http://www.gnu.org/licenses/>.

# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #

import argparse
import requests
from collections     import defaultdict
from os              import path
from re              import sub as re_sub
from cve_manager.url import CutSchemeAndTrSlash, ParseURL, URL_API_PATH, \
	URL_SPEC, FMT_GITHUB, FMT_SOURCEFG, FMT_PYTHON

# Program description handed to argparse; it doubles as the legend for the
# special symbols ("!", "?", "-") that may appear in the output columns
DESCRIPTION = '''Collect homepage-URLs and used programming languages
of specified projects. There are special symbols used in the output:
"!" - One of the two things happened: a) The parsing of a specified URL has
failed or b) Received the 404 HTTP response to an initially requested URL;
"?" - Some of the requests were blocked by the hosts (for example,
due to exceeding the request limit);
"-" - The request was executed without errors, but the information corresponding
to this column was not found.
'''
# Supported hosts, keyed by the URL-format id. Each value is a pair:
#   - the API base path (without the scheme);
#   - a tuple indexed by RES_URL / RES_LANG whose items are tuples of
#     alternative '/'-separated JSON key paths, tried in the listed order
#     (an empty tuple means there is nothing to look up for that column)
HOSTS = {
	FMT_GITHUB: (
		'api.github.com/repos',
		(('homepage',), ('language',)),
		),
	FMT_SOURCEFG: (
		'sourceforge.net/rest/p',
		(('external_homepage', 'moved_to_url'), ('categories/language/fullname',)),
		),
	FMT_PYTHON: (
		'pypi.python.org/pypi',
		(('info/home_page',), ()),
		),
	}

# Indexes of the result columns (also used to index the per-host key paths)
RES_URL, RES_LANG = 0, 1
RES_KEYS = (RES_URL, RES_LANG)

# Language that is reported for every project hosted on the Python index
LANG_PYTHON = 'Python'

# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
# Parsing the arguments

# NOTE: the arguments are parsed at import time, the functions below read the
# resulting module-level "args" object
argparser = argparse.ArgumentParser(description=DESCRIPTION)
argparser.add_argument(
	'-u', '--urls',
	metavar='URLs', type=str, nargs='+', required=True,
	help='URLs that specify the projects'
	)
# A non-positive value (the default is -1) means that hosts are never blocked
argparser.add_argument(
	'-l', '--lim',
	metavar='CONSECUTIVE_FAILS_LIMIT', type=int, default=-1,
	help='Number of failed requests to a host before data from that host '
	'is no longer requested'
	)
argparser.add_argument(
	'-v', '--verbose',
	action='store_true',
	help='Run in verbose mode'
	)
args = argparser.parse_args()

# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
# If requests to a supported host fail consecutively the specified number of
# times, then this host is excluded from subsequent processing

# Per-host state, keyed by the same ids as HOSTS: the current number of
# counted failed requests and the "host is blocked" flag
count_consecutive_failed_requests = dict.fromkeys(HOSTS, 0)
blocked_hosts = dict.fromkeys(HOSTS, False)


def BlockHostIfLimitReached(host_id):
	"""Count one more failed request to a host and block it if needed.

	The host gets blocked once its failure counter exceeds the "--lim" value;
	a non-positive limit disables blocking altogether.
	"""

	fails = count_consecutive_failed_requests[host_id] + 1
	count_consecutive_failed_requests[host_id] = fails

	if 0 < args.lim < fails:
		blocked_hosts[host_id] = True


def ResetCounterIfHostNotBlocked(host_id):
	"""Zero the failure counter of a host unless the host is already blocked."""

	if blocked_hosts[host_id]:
		# A blocked host stays blocked, its counter is left untouched
		return

	count_consecutive_failed_requests[host_id] = 0

# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #

def GetProjectApiPath(url):
	"""Translate a project URL into the API request URL of its host.

	Returns a 4-tuple (host_id, project_api_path, params, warning):
	- on success: the key of the matching HOSTS entry, the full "https://..."
	  API URL, the key paths stored for that host in HOSTS and an empty
	  warning;
	- on failure: (-1, '', '', warning), where the warning is non-empty only
	  if the host is supported but the URL gives no path to form the query.
	"""

	failure = (-1, '', '', '')

	parsed_url = ParseURL(url)
	if not parsed_url:
		return failure

	# Compare with "is None" so that falsy ids are still accepted
	host_id = parsed_url.get(URL_SPEC)
	if host_id is None:
		return failure

	host_api_path, params = HOSTS.get(host_id, (None, None))
	if not host_api_path:
		return failure

	project_path = parsed_url.get(URL_API_PATH)
	if not project_path:
		return -1, '', '', f'Can\'t get the path to form the query, URL: "{url}"'

	# Exactly one slash between the host API path and the project path
	separator = '' if project_path.startswith('/') else '/'
	project_api_path = f'https://{host_api_path}{separator}{project_path}'

	return host_id, project_api_path, params, ''

# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #

def ParseUrls(urls):
	"""Group the given URLs by the API request they translate to.

	Returns a 3-tuple:
	- dict that maps (host_id, project_api_path, params) to the set of URLs
	  that lead to this API request;
	- set of URLs for which no API path was obtained;
	- set of warnings produced while translating the URLs.
	"""

	grouped = defaultdict(set)
	failed = set()
	warnings = set()

	for url in urls:
		host_id, api_path, params, warning = GetProjectApiPath(url)
		if api_path:
			grouped[(host_id, api_path, params)].add(url)
			continue
		failed.add(url)
		if warning:
			warnings.add(warning)

	return grouped, failed, warnings

# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
# Get the value of a specified field from a given JSON document (arg
# "fields_variations" is a sequence of alternative key paths, each path being
# a sequence of keys separated by the '/' symbol)

def GetValFromJson(fields_variations, json):
	"""Look up a value in a parsed JSON document by alternative key paths.

	fields_variations -- sequence of '/'-separated key paths; they are tried
		in order and the first one that yields a truthy value wins.
	json -- parsed JSON document (a dict, or a list of documents).

	When a list is met on the way, the rest of the path is applied to every
	element and the results are joined with spaces. Every run of whitespace
	inside a string value is replaced with '-', so a joined result can be
	split back on whitespace.

	Returns a pair (value, err). On success "err" is ''. On failure "value"
	is falsy and "err" is either a message (a key of the last tried path is
	missing) or a falsy value (the key exists but holds an empty value).
	An empty "fields_variations" gives ('', '').
	"""

	if not fields_variations:
		return '', ''

	err = ''

	for fields in fields_variations:
		res = json
		fields_list = fields.split('/')
		for i, field in enumerate(fields_list):
			if isinstance(res, list):
				# The remaining part of the path is resolved per element
				sub_path = '/'.join(fields_list[i:])
				sum_res = []
				for el in res:
					r, err = GetValFromJson([sub_path], el)
					if err:
						sum_res = []
						break
					# Values can be non-strings (or missing without an error),
					# so convert before normalizing the whitespace
					sum_res.append(re_sub(r'\s+', '-', str(r)) if r else '')
				res = ' '.join(sum_res) if sum_res else None
				break
			res = res.get(field) if isinstance(res, dict) else None
			if isinstance(res, str):
				res = re_sub(r'\s+', '-', res)
			if not res:
				# Only a missing key is an error, an empty value is not
				err = f'Can\'t get value of the "{field}" key ' \
					'of the JSON document' if res is None else None
				break
		if res:
			break

	return res, '' if res else err

# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #

def GetProjectInfo(host_id, api_path, params, extracted_urls, extracted_langs,
		level=0):
	"""Request info on a project and follow the chain of its homepage URLs.

	host_id -- key of the HOSTS entry the request is addressed to.
	api_path -- full URL of the API request.
	params -- key paths for this host, indexed by RES_URL / RES_LANG.
	extracted_urls, extracted_langs -- sets that accumulate the findings;
		they are updated in place and shared by all the recursion levels.
	level -- recursion depth, 0 for an initially requested URL.

	If the found homepage URL belongs to a supported host and leads to
	another API path, that project is requested recursively.

	Returns a 3-tuple (urls, langs, err), "err" being a falsy value or a
	string with the warnings joined by '; '. Special results:
	- ({'!'}, {'!'}, err) -- 404 response to an initially requested URL;
	- (set(), set(), err) -- 404 response on a deeper level, or the request
	  or the JSON decoding raised an exception;
	- ({'?'}, {'?'}, err) -- any other non-200 response (such a response is
	  also counted as a failed request to the host).
	"""

	if host_id == FMT_PYTHON:
		extracted_langs.add(LANG_PYTHON)

	try:
		resp = requests.get(api_path, timeout=(3.05, 15))
		if resp.status_code != 200:
			err = f'Response {resp.status_code} from {api_path}'
			if resp.status_code == 404:
				if level == 0:
					return set('!'), set('!'), err
				return set(), set(), err
			BlockHostIfLimitReached(host_id)
			return set('?'), set('?'), err
		doc = resp.json()
	except Exception as exc:
		# Best effort: any failure here only means "no data for this URL"
		err = f'Can\'t get JSON doc, {exc}'
		return set(), set(), err

	# The host has answered properly, so the sequence of consecutive failures
	# is interrupted no matter what the document contains
	ResetCounterIfHostNotBlocked(host_id)

	current_values = {}
	warnings = set()
	for k in RES_KEYS:
		val, err = GetValFromJson(params[k], doc)
		if err:
			warnings.add(f'{api_path}: {err}')
			continue
		# These symbols are reserved for the output, see DESCRIPTION
		if val in ('-', '?', '!'):
			val = ''
		current_values[k] = val

	# If there was a redirect then save the URL to which the redirect occurred
	# (the history is empty when the request was not redirected)
	if isinstance(doc, dict) and any(r.is_redirect for r in resp.history):
		redirect_url = doc.get('html_url')
		if redirect_url:
			extracted_urls.add(redirect_url)

	lang = current_values.get(RES_LANG)
	if lang:
		extracted_langs.add(lang)

	url = current_values.get(RES_URL)
	# Nothing to follow, or the URL was already seen (prevents an endless loop)
	if not url or url in extracted_urls:
		return extracted_urls, extracted_langs, '; '.join(sorted(warnings))
	extracted_urls.add(url)

	new_host_id, new_api_path, new_params, err = GetProjectApiPath(url)
	if err:
		warnings.add(err)
	# Stop if the homepage is not hosted on a supported platform or if it
	# points back to the project that has just been requested
	if err or not new_api_path or new_api_path == api_path:
		return extracted_urls, extracted_langs, '; '.join(sorted(warnings))

	extracted_urls_next, extracted_langs_next, err = \
		GetProjectInfo(new_host_id, new_api_path, new_params, extracted_urls,
			extracted_langs, level + 1)
	if err:
		# Keep what was gathered so far, the special results of the nested
		# call ('!', '?') are not propagated
		warnings.add(err)
		return extracted_urls, extracted_langs, '; '.join(sorted(warnings))

	return extracted_urls | extracted_urls_next, \
		extracted_langs | extracted_langs_next, \
		'; '.join(sorted(warnings))

# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #

if __name__ == '__main__':

	# Group the URLs by the API request they lead to, so that each project
	# is requested only once
	parsed_urls, parsed_with_err_urls, warn = ParseUrls(args.urls)

	# res[initial URL][RES_URL or RES_LANG] -> set of strings to print
	res = defaultdict(lambda: defaultdict(set))
	skipped = set()

	for (host_id, api_path, params), initial_urls in parsed_urls.items():
		if blocked_hosts[host_id]:
			skipped |= initial_urls
			continue
		extracted_urls, extracted_langs, err = \
			GetProjectInfo(host_id, api_path, params, set(), set())
		if err:
			# Sorted, so that the text of the warning is reproducible
			warn.add(err + ", URL(s): " + ", ".join(sorted(initial_urls)))
		for k, values in ((RES_URL, extracted_urls - initial_urls),
				(RES_LANG, extracted_langs)):
			for url in initial_urls:
				for v in values:
					# Don't report a URL that differs from the initial one
					# only by the scheme or by the trailing slash
					if k == RES_URL and \
							CutSchemeAndTrSlash(url) == CutSchemeAndTrSlash(v):
						continue
					# A value can hold several space-separated items
					res[url][k].update(v.split())

	# Fill the columns that are still empty: "?" if the host was blocked
	# before the request, "-" if nothing was found
	for initial_urls in parsed_urls.values():
		for url in initial_urls:
			for k in RES_KEYS:
				if not res[url][k]:
					res[url][k].add('?' if url in skipped else '-')

	# URLs that could not be translated to an API request
	for url in parsed_with_err_urls:
		for k in RES_KEYS:
			res[url][k] = {'!'}

	# Print in the order the URLs were given on the command line
	for url in args.urls:
		columns = [" ".join(sorted(res[url][k])) for k in RES_KEYS]
		if any(columns):
			print(f'{url} >> {" | ".join(columns)}')

	if args.verbose:
		for w in warn:
			print(f'[WARNING: {w}]')

	# The "exit" helper of the "site" module is meant for interactive use
	raise SystemExit(0)
