#!/usr/bin/env python3
import argparse
import base64
import json
import logging
import os
import re
import sys
from urllib.parse import urlparse

import openai  # uses the legacy (pre-1.0) openai SDK interface (ChatCompletion/Completion)
import requests
try:
import jsonschema
from jsonschema import validate
except ImportError:
print("jsonschema module not found. Please install it via pip install jsonschema")
sys.exit(1)
# -------------------------------------------------------------------
# Hard-coded finite artifact (your categories, header, title, etc.)
HARDCODED_DATA = {
"title": "Awesome Video",
"header": "# Awesome Video\n\nProjects are organized into detailed categories with extensive subcategories covering all aspects of video processing, streaming, encoding, and more.",
"header_contributing": "Please refer to the contribution guidelines before adding new projects.",
"categories": [
{
"name": "Video Players & Playback Libraries",
"subcategories": [
{"name": "Web Players"},
{"name": "Mobile Players"},
{"name": "Desktop Players"},
{"name": "Smart TV Players"},
{"name": "Set-top Box Players"},
{"name": "Embedded Players"},
{"name": "Frameworks & UI Components"},
{"name": "Browser Extensions"}
]
},
{
"name": "Video Editing & Processing Tools",
"subcategories": [
{"name": "Trimming & Cutting Tools"},
{"name": "Conversion & Format Tools"},
{"name": "Repair & Recovery Tools"},
{"name": "Non-linear Editing Suites"},
{"name": "Effects & Compositing Tools"},
{"name": "Color Grading & Correction Tools"},
{"name": "Subtitle & Caption Tools"},
{"name": "Batch Processing & Automation"}
]
},
{
"name": "Video Encoding, Transcoding & Packaging Tools",
"subcategories": [
{"name": "FFmpeg-Based Tools"},
{"name": "Hardware Accelerated Transcoding"},
{"name": "Software Transcoding Tools"},
{"name": "Scripting & Automation Tools"},
{"name": "Containerization & Packaging Tools"},
{"name": "Cloud-Based Encoding Solutions"},
{"name": "Multi-format Packaging Tools"},
{"name": "Real-Time Encoding Solutions"}
]
},
{
"name": "Video Streaming & Distribution Solutions",
"subcategories": [
{"name": "Live Streaming Servers"},
{"name": "VOD Streaming Servers"},
{"name": "CDN Integration & Distribution"},
{"name": "RTMP/RTSP/HTTP Protocol Servers"},
{"name": "Peer-to-Peer Streaming Solutions"},
{"name": "Multi-CDN Management"},
{"name": "Edge Computing & Caching Solutions"},
{"name": "Streaming Analytics & Monitoring"}
]
},
{
"name": "Adaptive Streaming & Manifest Tools",
"subcategories": [
{"name": "HLS Tools"},
{"name": "DASH Tools"},
{"name": "CMAF & fMP4 Packaging"},
{"name": "HLS Manifest Parsers & Generators"},
{"name": "DASH Manifest Tools"},
{"name": "Encryption & DRM for Adaptive Streaming"},
{"name": "Low-Latency Streaming Tools"},
{"name": "Adaptive Bitrate Algorithms & Tools"}
]
},
{
"name": "Media Analysis, Quality Metrics & AI Tools",
"subcategories": [
{"name": "Quality Analysis & Metrics"},
{"name": "Scene Detection & Segmentation"},
{"name": "AI & Machine Learning Tools"},
{"name": "Video Analytics & Benchmarking"},
{"name": "Audio Analysis & Processing"},
{"name": "VMAF, PSNR, SSIM Tools"},
{"name": "Color Science & Histogram Analysis"},
{"name": "Metadata Extraction & Management"}
]
},
{
"name": "Build Tools, Deployment & Utility Libraries",
"subcategories": [
{"name": "Docker & Containerization Tools"},
{"name": "Build Scripts & Automation"},
{"name": "Command-line Utilities & Wrappers"},
{"name": "API Libraries & SDKs"},
{"name": "Performance & Monitoring Tools"},
{"name": "CI/CD Pipelines for Media"},
{"name": "Logging & Debugging Tools"},
{"name": "Infrastructure as Code for Video"}
]
},
{
"name": "Standards, Specifications & Industry Resources",
"subcategories": [
{"name": "Video Codec Specifications"},
{"name": "Adaptive Streaming Standards"},
{"name": "DRM & Content Protection Standards"},
{"name": "Closed Captioning & Subtitling Standards"},
{"name": "Industry Forums & Standards Bodies"},
{"name": "Regulatory & Compliance Resources"},
{"name": "Best Practices & Guidelines"},
{"name": "Open Source Licensing & Patents"}
]
},
{
"name": "Learning, Tutorials & Documentation",
"subcategories": [
{"name": "Video Streaming Tutorials"},
{"name": "Encoding & Transcoding Guides"},
{"name": "Player Development Documentation"},
{"name": "Subtitle & Caption Tutorials"},
{"name": "Books & Courses"},
{"name": "Case Studies & Whitepapers"},
{"name": "Webinars & Conference Talks"},
{"name": "Community Blogs & Forums"}
]
},
{
"name": "Transcoding, Codecs & Hardware Acceleration",
"subcategories": [
{"name": "Software Codecs"},
{"name": "Hardware Codecs & Acceleration"},
{"name": "Open Source Encoder Projects"},
{"name": "GPU Transcoding Libraries"},
{"name": "Benchmarking & Performance Tools for Codecs"},
{"name": "Comparative Analysis of Codecs"},
{"name": "Multi-format Transcoding Solutions"},
{"name": "Next-Generation Codecs (AV1, VVC)"}
]
},
{
"name": "DRM, Security & Content Protection",
"subcategories": [
{"name": "DRM Solutions & Implementations"},
{"name": "Encryption Tools for Streaming"},
{"name": "License Management Systems"},
{"name": "Widevine, FairPlay, PlayReady Integrations"},
{"name": "Secure Packaging & Manifest Encryption"},
{"name": "Content Watermarking & Fingerprinting"},
{"name": "DRM Testing & Validation Tools"},
{"name": "Case Studies & Best Practices in DRM"}
]
},
{
"name": "Miscellaneous, Experimental & Niche Tools",
"subcategories": [
{"name": "Test Content & Sample Streams"},
{"name": "Experimental Projects & Prototypes"},
{"name": "Community & Collaboration Platforms"},
{"name": "Legacy & Obsolete Tools"},
{"name": "Research Projects & Academic Resources"},
{"name": "Independent & Hobbyist Projects"},
{"name": "Cross-Platform Media Tools"},
{"name": "Specialized Utility Scripts"}
]
}
]
}
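
# Note: project records elsewhere in this script combine these names into a single
# category string of the form "Main – Subcategory" (en dash), e.g.:
#   "Video Streaming & Distribution Solutions – Live Streaming Servers"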
# -------------------------------------------------------------------
# JSON schema for validation
SCHEMA = {
"$schema": "http://json-schema.org/draft-04/schema#",
"type": "object",
"properties": {
"title": {"type": "string"},
"header": {"type": "string"},
"header_contributing": {"type": "string"},
"categories": {
"type": "array",
"items": {
"type": "object",
"properties": {
"name": {"type": "string"},
"subcategories": {
"type": "array",
"items": {"type": "object", "properties": {"name": {"type": "string"}}, "required": ["name"]}
}
},
"required": ["name", "subcategories"]
}
},
"projects": {
"type": "array",
"items": {
"type": "object",
"properties": {
"title": {"type": "string"},
"category": {"type": "string"},
"description": {"type": "string"},
"homepage": {"type": ["string", "null"]},
"tags": {"type": "array"}
},
"required": ["title", "category", "description", "homepage"],
"additionalProperties": False
}
}
},
"required": ["title", "header", "header_contributing", "categories"],
"additionalProperties": False
}
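
# For reference, a minimal project record that validates against SCHEMA
# (the title, homepage, and category below are illustrative, not real data):
#
#   {
#       "title": "example-player",
#       "category": "Video Players & Playback Libraries – Web Players",
#       "description": "A hypothetical web video player.",
#       "homepage": "https://github.com/example/example-player",
#       "tags": []
#   }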
# -------------------------------------------------------------------
# Utility functions
def load_json_file(path):
logging.debug(f"Loading JSON file from: {path}")
with open(path, "r", encoding="utf-8") as f:
data = json.load(f)
logging.debug("JSON file loaded successfully.")
return data

def write_json_file(data, path):
logging.debug(f"Writing JSON data to: {path}")
with open(path, "w", encoding="utf-8") as f:
json.dump(data, f, indent=2)
logging.info(f"Updated JSON written to {path}")

def validate_json_structure(data, schema):
try:
validate(instance=data, schema=schema)
logging.debug("JSON validated successfully against the schema.")
return True
except jsonschema.exceptions.ValidationError as e:
logging.error(f"JSON validation error: {e}")
return False
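
# Note: jsonschema.validate() selects its validator class from the schema's
# "$schema" key, so SCHEMA above is checked with draft-04 semantics.
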
def get_allowed_categories(data):
"""
Returns three lists:
- allowed main categories,
- allowed subcategories,
- combined allowed categories.
"""
main_categories = []
subcategories = []
for cat in data.get("categories", []):
if "name" in cat:
main_categories.append(cat["name"])
for sub in cat.get("subcategories", []):
if "name" in sub:
subcategories.append(sub["name"])
combined = main_categories + subcategories
logging.debug(f"Allowed main categories: {main_categories}")
logging.debug(f"Allowed subcategories: {subcategories}")
return main_categories, subcategories, combined

def fetch_readme(repo_url):
"""
Fetch the README content for a given GitHub repository URL using the GitHub API.
Returns the decoded README text or an empty string if not found.
"""
logging.info(f"Fetching README for: {repo_url}")
parsed = urlparse(repo_url)
path_parts = parsed.path.strip("/").split("/")
if len(path_parts) < 2:
logging.error(f"Invalid GitHub URL: {repo_url}")
return ""
owner, repo = path_parts[0], path_parts[1]
api_url = f"https://api.github.com/repos/{owner}/{repo}/readme"
headers = {"Accept": "application/vnd.github.v3+json"}
    response = requests.get(api_url, headers=headers, timeout=30)  # avoid hanging indefinitely on a stalled connection
logging.debug(f"GitHub API URL: {api_url} | Status code: {response.status_code}")
if response.status_code == 200:
data = response.json()
if "content" in data:
try:
content = base64.b64decode(data["content"]).decode("utf-8", errors="replace")
logging.debug(f"Fetched README content of length {len(content)} for {repo_url}")
return content
except Exception as e:
logging.error(f"Error decoding README for {repo_url}: {e}")
return ""
else:
logging.warning(f"No content found in README for {repo_url}")
return ""
else:
logging.error(f"Failed to fetch README for {repo_url}: HTTP {response.status_code}")
return ""
# -------------------------------------------------------------------
# Backend wrappers
def call_ollama_completion(prompt, model, ollama_url, temperature=0.5, max_tokens=200):
    """
    Calls the Ollama backend via its non-streaming /api/generate endpoint,
    using a configurable base URL.
    """
    url = f"{ollama_url}/api/generate"
    payload = {
        "model": model,
        "prompt": prompt,
        "stream": False,
        "options": {
            "temperature": temperature,
            "num_predict": max_tokens
        }
    }
    try:
        response = requests.post(url, json=payload, timeout=120)
        if response.status_code == 200:
            result = response.json()
            # The non-streaming /api/generate call returns the full completion
            # under the "response" key.
            return result.get("response", "")
        else:
            logging.error(f"Ollama API error: HTTP {response.status_code}")
            return ""
    except Exception as e:
        logging.error(f"Error calling Ollama API: {e}")
        return ""

def call_model_for_recategorization(readme_content, allowed_main, allowed_sub, allowed_combined,
model, call_type, backend, ollama_url):
"""
Generate a refined summary and suggest both a main category and a subcategory using the selected backend.
"""
prompt = (
"You are an assistant that helps recategorize GitHub repositories based on their README contents. "
"Given the following README content, provide a refined summary (1-2 sentences) describing the project. "
"Then, based on the project's content, suggest the most appropriate main category and a relevant subcategory "
"from the lists provided below.\n\n"
"Allowed Main Categories:\n" + ", ".join(allowed_main) + "\n\n"
"Allowed Subcategories:\n" + ", ".join(allowed_sub) + "\n\n"
"Return the result in valid JSON format with three keys: \"summary\", \"category\", and \"subcategory\". "
"For example: {\"summary\": \"A refined description.\", \"category\": \"Video Streaming & Distribution Solutions\", \"subcategory\": \"Live Streaming Servers\"}.\n\n"
"README:\n" + readme_content
)
logging.debug(f"Prompt (first 300 chars): {prompt[:300]}")
message = ""
if backend.lower() == "openai":
try:
if call_type.lower() == "chat":
logging.info(f"Calling OpenAI ChatCompletion with model {model}")
response = openai.ChatCompletion.create(
model=model,
messages=[
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": prompt}
],
temperature=0.5,
max_tokens=200,
n=1
)
message = response["choices"][0]["message"]["content"]
elif call_type.lower() == "completion":
logging.info(f"Calling OpenAI Completion with model {model}")
response = openai.Completion.create(
model=model,
prompt=prompt,
temperature=0.5,
max_tokens=200,
n=1,
stop=None,
)
message = response["choices"][0]["text"]
else:
logging.error(f"Invalid call type: {call_type}")
return "", "", ""
except Exception as e:
logging.error(f"Error calling OpenAI API: {e}")
sentences = re.split(r"\. ", readme_content)
fallback_summary = sentences[0].strip()
if fallback_summary and not fallback_summary.endswith("."):
fallback_summary += "."
return fallback_summary, allowed_main[0] if allowed_main else "", ""
elif backend.lower() == "ollama":
logging.info(f"Calling Ollama for model {model} at {ollama_url}")
message = call_ollama_completion(prompt, model, ollama_url, temperature=0.5, max_tokens=200)
if not message:
logging.error("Ollama returned an empty response.")
sentences = re.split(r"\. ", readme_content)
fallback_summary = sentences[0].strip()
if fallback_summary and not fallback_summary.endswith("."):
fallback_summary += "."
return fallback_summary, allowed_main[0] if allowed_main else "", ""
else:
logging.error(f"Invalid backend: {backend}")
return "", "", ""
logging.debug(f"Raw response: {message}")
try:
result = json.loads(message)
except Exception as e:
logging.error(f"Error parsing JSON response: {e}")
return "", "", ""
summary = result.get("summary", "").strip()
main_cat = result.get("category", "").strip()
sub_cat = result.get("subcategory", "").strip()
if main_cat not in allowed_main:
logging.warning(f"Suggested main category '{main_cat}' not allowed. Defaulting.")
main_cat = allowed_main[0] if allowed_main else ""
if sub_cat and sub_cat not in allowed_sub:
logging.warning(f"Suggested subcategory '{sub_cat}' not allowed. Ignoring subcategory.")
sub_cat = ""
logging.info(f"Summary: {summary} | Category: {main_cat} | Subcategory: {sub_cat}")
return summary, main_cat, sub_cat

def extract_repo_name(repo_url):
"""
Extract the repository name from a GitHub URL.
"""
parsed = urlparse(repo_url)
path_parts = parsed.path.strip("/").split("/")
if len(path_parts) >= 2:
return path_parts[1]
return repo_url
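
# Example: extract_repo_name("https://github.com/FFmpeg/FFmpeg") returns "FFmpeg";
# URLs without an owner/repo path fall through and are returned unchanged.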
# -------------------------------------------------------------------
# Mode functions
def add_new_links(data, repo_file_path, model, call_type, backend, ollama_url):
"""
In add mode:
- For each GitHub repo URL in repo_file_path, fetch its README and use the selected backend
to get a refined summary, main category, and subcategory.
- Combine the main category and subcategory (if available) into a single string.
- Update an existing project (if homepage already exists) or add a new project.
"""
allowed_main, allowed_sub, allowed_combined = get_allowed_categories(data)
with open(repo_file_path, "r", encoding="utf-8") as f:
repo_urls = [line.strip() for line in f if line.strip()]
logging.info(f"Found {len(repo_urls)} repo URLs to process.")
if "projects" not in data:
data["projects"] = []
for repo_url in repo_urls:
logging.info(f"Processing repo: {repo_url}")
readme_content = fetch_readme(repo_url)
if not readme_content:
logging.warning(f"Skipping {repo_url} due to missing README.")
continue
summary, main_cat, sub_cat = call_model_for_recategorization(
readme_content, allowed_main, allowed_sub, allowed_combined, model, call_type, backend, ollama_url
)
category_str = f"{main_cat} – {sub_cat}" if sub_cat else main_cat
project_exists = False
for project in data.get("projects", []):
if project.get("homepage", "").rstrip("/") == repo_url.rstrip("/"):
logging.info(f"Updating existing project for {repo_url}.")
project["description"] = summary
project["category"] = category_str
project_exists = True
break
if not project_exists:
project_title = extract_repo_name(repo_url)
new_project = {
"title": project_title,
"category": category_str,
"description": summary,
"homepage": repo_url,
"tags": []
}
data["projects"].append(new_project)
logging.info(f"Added new project: {project_title}")
return data

def recategorize_projects(data, model, call_type, backend, ollama_url):
"""
In recategorize mode:
- For each project with a GitHub URL in its homepage, fetch its README and use the selected backend
to get a refined summary, main category, and subcategory.
- Update the project's description and category (formatted as "Main – Subcategory" if applicable).
"""
allowed_main, allowed_sub, allowed_combined = get_allowed_categories(data)
if "projects" not in data:
logging.warning("No projects found to recategorize.")
return data
for project in data.get("projects", []):
homepage = project.get("homepage", "")
if "github.com" not in homepage:
continue
logging.info(f"Recategorizing project '{project.get('title', 'Unknown')}' at {homepage}...")
readme_content = fetch_readme(homepage)
if not readme_content:
logging.warning(f"Skipping {homepage} due to missing README.")
continue
summary, main_cat, sub_cat = call_model_for_recategorization(
readme_content, allowed_main, allowed_sub, allowed_combined, model, call_type, backend, ollama_url
)
category_str = f"{main_cat} – {sub_cat}" if sub_cat else main_cat
project["description"] = summary
project["category"] = category_str
logging.info(f"Updated project '{project.get('title', 'Unknown')}' with category '{category_str}'.")
return data
# -------------------------------------------------------------------
# Main
def main():
parser = argparse.ArgumentParser(
description="Manage and categorize GitHub repos using a finite, hard-coded categories artifact. "
"Use mode 'add' to add new repo URLs or 'recategorize' to update existing projects."
)
parser.add_argument("--mode", required=True, choices=["add", "recategorize"],
help="Operation mode: 'add' to add new repo URLs, 'recategorize' to update existing projects.")
parser.add_argument("--json-file", required=True, help="Path to the existing JSON file.")
parser.add_argument("--output-file", required=True, help="Path to save the updated JSON file.")
parser.add_argument("--repo-file", help="Path to the text file containing GitHub repo URLs (required for add mode).")
parser.add_argument("--backend", default="openai", choices=["openai", "ollama"],
help="Backend to use for completions (default: openai).")
parser.add_argument("--model", default="gpt-3.5-turbo",
help="Model to use (for OpenAI or Ollama). For Ollama, ensure the model is available at the endpoint.")
parser.add_argument("--call-type", default="chat", choices=["chat", "completion"],
help="Type of API call to use (for OpenAI; ignored for Ollama).")
parser.add_argument("--ollama-url", default="http://localhost:11434",
help="Base URL for the Ollama backend (default: http://localhost:11434).")
parser.add_argument("--openai-api-key", help="OpenAI API key (or set the OPENAI_API_KEY environment variable).")
parser.add_argument("--log-level", default="DEBUG", choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"],
help="Set the logging level.")
args = parser.parse_args()
numeric_level = getattr(logging, args.log_level.upper(), logging.DEBUG)
logging.basicConfig(level=numeric_level, format="%(asctime)s - %(levelname)s - %(message)s")
if args.backend.lower() == "openai":
openai_api_key = args.openai_api_key or os.getenv("OPENAI_API_KEY")
if not openai_api_key:
logging.error("OpenAI API key must be provided via --openai-api-key or the OPENAI_API_KEY environment variable.")
sys.exit(1)
openai.api_key = openai_api_key
try:
data = load_json_file(args.json_file)
except Exception as e:
logging.error(f"Error loading JSON file {args.json_file}: {e}")
sys.exit(1)
# Override title, header, and categories with the hard-coded artifact.
data["title"] = HARDCODED_DATA["title"]
data["header"] = HARDCODED_DATA["header"]
data["header_contributing"] = HARDCODED_DATA["header_contributing"]
data["categories"] = HARDCODED_DATA["categories"]
if not validate_json_structure(data, SCHEMA):
logging.error("Initial JSON does not conform to the schema. Exiting.")
sys.exit(1)
if args.mode == "add":
if not args.repo_file:
logging.error("In 'add' mode, --repo-file must be provided.")
sys.exit(1)
data = add_new_links(data, args.repo_file, args.model, args.call_type, args.backend, args.ollama_url)
else:
data = recategorize_projects(data, args.model, args.call_type, args.backend, args.ollama_url)
if not validate_json_structure(data, SCHEMA):
logging.error("Updated JSON does not conform to the schema. Exiting.")
sys.exit(1)
try:
write_json_file(data, args.output_file)
except Exception as e:
logging.error(f"Error writing JSON to {args.output-file}: {e}")
sys.exit(1)
if __name__ == "__main__":
main()

# Usage Examples
# Add Mode using OpenAI (default):
# python manage_links.py \
# --mode=add \
# --json-file=contents.json \
# --repo-file=repo_urls.txt \
# --output-file=updated_contents.json \
# --backend=openai \
# --model=gpt-3.5-turbo \
# --call-type=chat \
# --openai-api-key=sk-XYZ \
# --log-level=DEBUG
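#
# The repo file is plain text with one GitHub URL per line (blank lines are
# skipped), e.g.:
#   https://github.com/video-dev/hls.js
#   https://github.com/shaka-project/shaka-player
#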
# Recategorize Mode using Ollama with a custom URL:
# python manage_links.py \
# --mode=recategorize \
# --json-file=updated_contents.json \
# --output-file=recategorized_contents.json \
# --backend=ollama \
# --model=your-ollama-model-name \
# --call-type=chat \
# --ollama-url=http://your-custom-ollama-url:port \
# --log-level=DEBUG
# Explanation
# 1. Backend Selection: the --backend flag chooses between the OpenAI API and a
#    local Ollama server; --call-type and --openai-api-key apply only to OpenAI.