#This will not run on online IDE
import requests
import io,os
from bs4 import BeautifulSoup
import json

def remove_tags(html):
   # parse html content
   soup = BeautifulSoup(html, "html5lib")
   for data in soup(['script']):
      data.decompose()
   return ' '.join(soup.stripped_strings)

file1 = open('Tutorials List.html', 'r')
Lines = file1.readlines()
count = 0
i = 0
json_data = []
jdata = {}
# Strips the newline character
for line in Lines:
   count += 1

   tutorial = line.strip()

   elist = ['questions_answers.htm','quick_guide.htm','quick-guide.htm','useful_resources.htm','resources.htm','discussion.htm']

   URL = "https://www.tutorialspoint.com/"+tutorial+"/index.htm"
   r = requests.get(URL)
     
   soup = BeautifulSoup(r.content, 'html5lib') # If this line causes an error, run 'pip install html5lib' 

   toc = soup.find_all("ul", class_="toc chapters")
   
   for toc_elements in toc:
       li_elements = toc_elements.find_all("a",href=True)   
       for a in li_elements:
         title = a.text
         title = title.replace(" - Home", "")
         ahref = a['href']

         filename, file_extension = os.path.splitext(ahref)
         filename = os.path.basename(ahref)
         fname = filename.replace(tutorial+"_", "")
         
         if file_extension == '.htm' and fname not in elist:
            k = 0
            for lst in elist:
               if lst in fname:
                  k = 1
            if k == 1:
               continue

            page_url = ahref 

            if 'www.tutorialspoint.com' not in ahref:    
               page_url = "https://www.tutorialspoint.com"+ahref     

            pr = requests.get(page_url)  
            if pr.status_code != 200:
               continue   
            psoup = BeautifulSoup(pr.content, 'html5lib') 
            h1_element = psoup.find("h1")
            h1_text = title
            if h1_element:
               h1_text = h1_element.text

            obj = {
               'name' : h1_text,
               'url' : page_url
            }
            json_data.append(obj)  
            #print(h1_text+' ### '+page_url+"\n")
            json_object = json.dumps(obj, indent = 4)
            with open("tp_links_json_data.json", "a") as outfile:
               outfile.write(json_object)

            # check for sub pages            
            psoup = BeautifulSoup(pr.content, 'html5lib') 
            for s in psoup.select('script'):
               s.extract()
            #psoup = remove_tags(pr.content)              
            #psoup = BeautifulSoup(psoup, 'html5lib') 
            main_element = psoup.find(id="mainContent")
            #main_element = psoup.find("div", attrs = {"id":"mainContent"})
            #print(psoup)
            if main_element:
               main_td_elements = main_element.find_all("a")      
               for main_a_element in main_td_elements:
                  #main_a_element = main_td_element.find("a",href=True) 
                  if not main_a_element:
                     continue
                  main_a_element_text = main_a_element.text
                  main_a_element_text = main_a_element_text.replace(" - Home", "")
                  
                  if 'Previous' in main_a_element_text:
                     continue
                  if 'Next' in main_a_element_text:
                     continue                  
                  if 'Print' in main_a_element_text:
                     continue                                  
                  if 'Discussion' in main_a_element_text:
                     continue                                                   
                  if 'Click here' in main_a_element_text:
                     continue                                                                      
                  if 'click here' in main_a_element_text:
                     continue                                                                 
                  if 'Quick Guide' in main_a_element_text:
                     continue                                                                  
                  if 'PDF Version' in main_a_element_text:
                     continue                                                                                   
                  if 'Live Demo' in main_a_element_text:
                     continue    
                  if 'Resources' in main_a_element_text:
                     continue
                  if 'New Quiz' in main_a_element_text:
                     continue                  
                  if 'Start Test' in main_a_element_text:
                     continue                 
                  if 'Finish Test' in main_a_element_text:
                     continue                
                  if 'Contact Us' in main_a_element_text:
                     continue                               
                  if 'docs.cloud' in main_a_element_text:
                     continue
                  
                  if 'href' not in main_a_element:
                     continue  
                  main_a_element_href = main_a_element['href']
                  #print(main_a_element_text+' --- '+main_a_element_href+"\n")
                  filename, file_extension = os.path.splitext(main_a_element_href)
                  filename = os.path.basename(main_a_element_href)
                  fname = filename.replace(tutorial+"_", "")
                  
                  if file_extension == '.htm' and fname not in elist:
                     k = 0
                     for lst in elist:
                        if lst in fname:
                           k = 1
                     if k == 1:
                        continue

                     page_url = main_a_element_href  
                     if 'www.tutorialspoint.com' not in main_a_element_href:    
                        page_url = "https://www.tutorialspoint.com"+main_a_element_href     
                     
                     #print(' $$$ '+page_url+"\n")

                     pr = requests.get(page_url)     
                     if pr.status_code != 200:
                        continue   
                     psoup = BeautifulSoup(pr.content, 'html5lib') 
                     h1_element = psoup.find("h1")
                     h1_text = title
                     if h1_element:
                        h1_text = h1_element.text

                     obj = {
                        'name' : h1_text,
                        'url' : page_url
                     }
                     json_data.append(obj)  
                     json_object = json.dumps(obj, indent = 4)
                     with open("tp_links_json_data.json", "a") as outfile:
                        outfile.write(json_object)
            i = i+1
           
            #exit()
jdata['data'] = json_data
# Serializing json 
json_object = json.dumps(jdata, indent = 4)
  
# Writing to sample.json
with open("tp_links_json_data_all_subpages_28_05_2024.json", "w") as outfile:
    outfile.write(json_object)
   
    
