codegen.py
#!/usr/bin/env python
# coding: utf-8
# In[1]:
import requests
import warnings
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from concurrent.futures import ThreadPoolExecutor
# Set of visited URLs to avoid duplicate scraping
visited_urls = set()
# Set of seen document content to avoid repeating output
seen_doc_content = set()
def scrape_page(url, max_depth=1, current_depth=0):
    # Get the HTML content of the URL
    response = requests.get(url)
    html_content = response.content

    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(html_content, 'html.parser')

    # Find the main content section of the page
    main_content = soup.find('div', {'class': 'section'})
    if main_content is None:
        # No main content section found, skip scraping this page
        return

    # Extract the relevant content from the main content section
    doc_content = '\n'.join(element.text.strip() for element in main_content.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'p']))

    # Display the extracted content to the user
    if doc_content not in seen_doc_content:
        print(doc_content)
        seen_doc_content.add(doc_content)

    # Find all in-site links within the page, skipping navigation links
    # (BeautifulSoup's get('class') returns a list, so check membership)
    urls = [
        urljoin(url, link.get('href'))
        for link in soup.find_all('a')
        if link.get('href')
        and link.parent.name != 'div'
        and 'wy-nav-content' not in (link.parent.get('class') or [])
        and link['href'].startswith(url)
    ]

    # Recursively scrape each link
    with ThreadPoolExecutor() as executor:
        futures = []
        for link in urls:
            if current_depth < max_depth and link not in visited_urls:
                visited_urls.add(link)
                future = executor.submit(scrape_page, link, max_depth, current_depth + 1)
                futures.append(future)
        # Wait for all tasks to complete
        for future in futures:
            future.result()
# Get the initial URL
initial_url = input("Enter the URL: ")
# Start scraping from the initial URL
scrape_page(initial_url, max_depth=3)
# Open the text file for writing
with open('output.txt', 'w') as file:
    file.write('\n'.join(seen_doc_content))
# In[4]:
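# Summarize the scraped documentation with the facebook/bart-large-cnn summarization model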
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large-cnn")
with open("output.txt", "r") as file:
text = file.read()
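# Tokenize the scraped text (truncated to BART's 1024-token input limit) and
# generate a summary with beam search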
inputs = tokenizer([text], truncation=True, padding=True, max_length=1024, return_tensors="pt")
summary_ids = model.generate(inputs["input_ids"], num_beams=4, max_length=100, early_stopping=True)
summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
print(summary)
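# Save the summary to summ.txt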
with open("summ.txt", "w") as file:
file.write(summary)
# In[5]:
from transformers import AutoTokenizer, AutoModelForCausalLM
# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("codeparrot/codeparrot-small-code-to-text")
model = AutoModelForCausalLM.from_pretrained("codeparrot/codeparrot-small-code-to-text")
def gencode(txt):
    # Tokenize the prompt
    input_ids = tokenizer.encode(txt, return_tensors="pt")

    # Generate code with beam search
    output = model.generate(input_ids, max_length=100, num_return_sequences=1, num_beams=5)

    # Decode and return the generated code
    generated_code = tokenizer.decode(output[0], skip_special_tokens=True)
    return generated_code
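
# Example usage (illustrative sketch, not part of the original script): feed the
# saved summary to gencode() and print the result. Assumes summ.txt was written by
# the summarization cell above.
with open("summ.txt", "r") as file:
    prompt = file.read()

print(gencode(prompt))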