require 'fileutils'
require 'csv'
require File.expand_path(File.dirname(__FILE__) + "/mbox.rb")
require File.expand_path(File.dirname(__FILE__) + "/base.rb")

=begin

PREREQUISITES and INFORMATION:

COOKIES: in order to be able to extract users' email addresses correctly from the
Google Group, you will need to have access to a Manager account of the Google Group.
Having logged into Google Groups with this Manager account, export the cookies.txt
from your browser (I used a Chrome extension to get the cookies.txt file; the link
to it is missing from this copy of the script).
(Without this step, the email addresses cannot be harvested from the Google Group,
and this will mess up creation of new users on Discourse)

Once you have the cookies.txt file, the easiest way to get it into your Docker
container is to upload it as an attachment to a post in your discourse forum.
You can get the URL from the post, and you need to prepend
'/var/www/discourse/public' to the URL, which will be something like
'/uploads/default/original/1X/245aa0cdc6847cf59647e1c7102e253e99d40b69.txt'

INSTRUCTIONS:

**run this script from INSIDE your Discourse Docker container**

$ ssh <your-discourse-server>
$ cd /var/discourse
$ ./launcher enter app
# apt install sqlite3 libsqlite3-dev git
# gem install sqlite3
# cd /var/www/discourse/script/import_scripts
# open mbox.rb, and comment out the very last line that calls "perform"
# download this script from the internet and save as "googlegroups.rb"
# su discourse
# ruby googlegroups.rb <name-of-your-google-group-goes-here>

=end

# Imports a Google Group into Discourse: scrapes the group into mbox files with
# the google-group-crawler shell scripts, indexes the messages in a SQLite
# database, and then hands over to the generic mbox importer it inherits from.
class ImportScripts::GoogleGroups < ImportScripts::Mbox

  CATEGORY_MAPPINGS = {
    "default" => "googlegroup"
  }

  # Working directory the crawler is cloned into and run from.
  CRAWLER_DIR = "/tmp/google-group-crawler"

  # NOTE(review): the clone URL was missing from the copy of the script this was
  # restored from; this is the "icy" crawler the log message refers to -- confirm.
  CRAWLER_REPO = "https://github.com/icy/google-group-crawler.git"

  # google_group_name - name of the Google Group to import (String).
  #
  # Raises RuntimeError when the name is blank.
  def initialize(google_group_name)
    raise "No google group name specified!" if google_group_name.blank?

    @google_group_name = google_group_name # your google group name
    @first_time = true   # true: scrape everything, false: update using the crawler's -rss option
    @use_cookies = false # use manager cookie to recover email
    @load_users = true   # use exported user list to recover email
    # @copy_from = "/shared/google-group-crawler" # copy from a mounted folder
    @users = []          # store users from the csv file

    setup_google_group
    load_users if @load_users
    super()
  end

  # Scrapes (first run) or updates (later runs) the mbox files, then runs the
  # inherited import.
  def execute
    if @first_time
      scrape_google_group_to_mbox
    else
      update_google_group_to_mbox
    end
    super
  end

  # a valid csv file called #{@google_group_name}.csv from google groups
  # is expected in the /tmp folder if @load_users = true
  #
  # Columns of the export:
  #  0 Email address
  #  1 Nickname
  #  2 Group status
  #  3 Email status
  #  4 Email preference
  #  5 Posting permissions
  #  6 Join year
  #  7 Join month
  #  8 Join day
  #  9 Join hour
  # 10 Join minute
  # 11 Join second
  # 12 Time zone
  #
  # Only the first two columns are kept, as {"email" => ..., "nickname" => ...}.
  def load_users
    csv_path = "/tmp/#{@google_group_name}.csv"
    puts "Loading users from #{csv_path}"

    users = []
    CSV.foreach(csv_path) do |record|
      users << { "email" => record[0], "nickname" => record[1] }
    end
    @users = users
    puts
  end

  # cross match users to recover email
  #
  # users - Array of {"email", "nickname"} hashes as built by load_users.
  # email - address to look up; a run of dots directly before "@"
  #         (e.g. "jo...@example.org") is treated as a wildcard.
  #
  # Returns the matching user Hash, or +email+ itself (possibly nil) when
  # nothing matches.
  def match_user(users, email)
    return email if email.nil?
    lookup_csv_user(users, email) || email
  end

  # a valid cookie file called cookies.txt from google groups
  # is expected in the /tmp folder if @use_cookies = true
  #
  # Exports the settings the crawler reads from the environment, prints the
  # effective configuration, and optionally copies a previous crawl into /tmp.
  def setup_google_group
    ENV['_GROUP'] = @google_group_name
    ENV['_WGET_OPTIONS'] = "--load-cookies /tmp/cookies.txt --keep-session-cookies" if @use_cookies

    puts ""
    puts "Your Google Group name is #{@google_group_name}"
    # NOTE(review): the URL in this message was missing from the copy this was
    # restored from (only "!forum/" survived) -- confirm the intended wording.
    puts "So I'm expecting the Google Group URL to be https://groups.google.com/forum/#!forum/#{@google_group_name}"
    puts "First time importing? #{@first_time}"
    puts "Use /tmp/cookies.txt? #{@use_cookies}"
    puts "Use /tmp/#{@google_group_name}.csv? #{@load_users}"

    if @copy_from.present?
      puts "Copying existing scrapper from #{@copy_from} to /tmp"
      system "rm -rf /tmp/google-group-crawler"
      # argv form: paths are passed verbatim instead of being re-parsed by a shell
      system "cp", "-r", @copy_from, "/tmp"
      system "cp", File.join(@copy_from, "cookies.txt"), "/tmp" if @use_cookies
      system "cp", File.join(@copy_from, "#{@google_group_name}.csv"), "/tmp" if @load_users
    end
    system "chmod -R 777 /tmp/google-group-crawler"
  end

  # scrape content of the Google Group using the google-group-crawler scripts
  # do everything in /tmp/
  def scrape_google_group_to_mbox
    FileUtils.rm_rf(CRAWLER_DIR) # idempotent
    puts "Clone the Google Group Crawler from icy ..."
    system "git clone #{CRAWLER_REPO} #{CRAWLER_DIR}"

    # perform the scrape
    # NOTE(review): "wget.sh" is a restored name for the generated script; it is
    # only ever written and then run by run_crawler_pass.
    run_crawler_pass("-sh", "wget.sh", "Start the first pass collection of topics")
  end

  # Incremental run: reuses the existing clone and asks the crawler for an
  # update script via its -rss option.
  def update_google_group_to_mbox
    # perform the scrape
    # NOTE(review): "update.sh" is a restored name, see scrape_google_group_to_mbox.
    run_crawler_pass("-rss", "update.sh", "Update topics")
  end

  # override
  # Opens the SQLite index kept alongside the crawler output.
  def open_db
    SQLite3::Database.new("#{CRAWLER_DIR}/index.db")
  end

  # override
  # Yields (mail, path) for every file in the group's mbox directory.
  def all_messages
    puts "Loading all messages"
    files = Dir["#{CRAWLER_DIR}/#{@google_group_name}/mbox/*"]

    files.each_with_index do |f, idx|
      raw = File.read(f)
      mail = Mail.read_from_string(raw)
      yield mail, f
      print_status(idx, files.size)
    end
  end

  # override
  # Creates one category per CATEGORY_MAPPINGS value (except 'uncategorized').
  def import_categories
    mappings = CATEGORY_MAPPINGS.values - ['uncategorized']

    create_categories(mappings) do |c|
      { id: c, name: c }
    end
  end

  # override
  # Returns [from_email, from_name] for a message. The address is cleaned up
  # ("a at b" -> "a@b", trailing " (Name)" removed) and, when it matches a user
  # from the CSV export, replaced by that user's full address.
  def extract_name(mail)
    from_name = nil
    from = mail[:from]
    from_email = nil

    if mail.from.present?
      from_email = mail.from.dup
      from_email = from_email.first.dup if from_email.kind_of?(Array)
      from_email.gsub!(/ at /, '@')
      from_email.gsub!(/ \(.*$/, '')

      # Only overwrite the address when a CSV user was actually found; an
      # unmatched sender keeps the cleaned-up address instead of ending up nil.
      user = lookup_csv_user(@users, from_email)
      if user
        from_email = user["email"]
        from_name = user["nickname"] unless user["nickname"].blank?
      end
    end

    # A display name in the header takes precedence over the CSV nickname.
    display_names = from.try(:display_names)
    from_name = display_names.first if display_names.present?

    if from_name.blank? && from.to_s =~ /\(([^\)]+)\)/
      from_name = Regexp.last_match[1]
    end
    from_name = from.to_s if from_name.blank?

    [from_email, from_name]
  end

  # Returns a Hash of title => msg_id holding, for each distinct title, a
  # message whose email_date is the earliest for that title.
  def find_all_topics
    db = open_db

    rows = db.execute <<-SQL
      SELECT msg_id, title
        FROM emails AS f
       WHERE datetime(f.email_date) = (
               SELECT min(datetime(e.email_date))
                 FROM emails AS e
                WHERE e.title = f.title
             );
    SQL

    title2id = {}
    rows.each do |row|
      title2id[row[1]] = row[0]
    end
    title2id
  ensure
    db.close if db # db is nil when open_db itself failed
  end

  # override
  # Rebuilds threading from titles: the earliest message of each title becomes
  # the topic (reply_to cleared) and every other message with that title is
  # pointed at it.
  def massage_indices
    db = open_db
    mappings = find_all_topics

    puts "#{mappings.size} topics in total"

    # One transaction for all updates; message ids come from mail headers, so
    # they are bound as parameters rather than spliced into the SQL text.
    db.transaction do
      mappings.each_value do |msg_id|
        db.execute "UPDATE emails SET reply_to = null WHERE msg_id = ?", [msg_id]
      end

      puts "wiring up replies for these topics"
      idx = 0
      mappings.each do |title, msg_id|
        db.execute "UPDATE emails SET reply_to = ? WHERE title = ? AND msg_id <> ?", [msg_id, title, msg_id]
        print_status(idx, mappings.size)
        idx += 1
      end
    end
  ensure
    db.close if db
  end

  # override
  # Drops and recreates the emails table, then inserts one row per message
  # yielded by all_messages (rows with a duplicate msg_id are ignored).
  def create_email_indices
    db = open_db
    db.execute "DROP TABLE IF EXISTS emails"
    db.execute <<-SQL
      CREATE TABLE emails (
        msg_id VARCHAR(995) PRIMARY KEY,
        from_email VARCHAR(255) NOT NULL,
        from_name VARCHAR(255) NOT NULL,
        title VARCHAR(255) NOT NULL,
        reply_to VARCHAR(955) NULL,
        email_date DATETIME NOT NULL,
        message TEXT NOT NULL,
        category VARCHAR(255) NOT NULL
      );
    SQL

    db.execute "CREATE INDEX by_title ON emails (title)"
    db.execute "CREATE INDEX by_email ON emails (from_email)"

    puts "", "creating indices"

    all_messages do |mail, filename|
      category = CATEGORY_MAPPINGS['default'] || 'uncategorized'
      msg_id = mail['Message-ID'].to_s
      from_email, from_name = extract_name(mail)
      title = clean_title(mail['Subject'].to_s)
      reply_to = mail['In-Reply-To'].to_s
      email_date = normalize_email_date(mail['date'].to_s)

      db.execute "INSERT OR IGNORE INTO emails (msg_id, from_email, from_name, title, reply_to, email_date, message, category)
                  VALUES (?, ?, ?, ?, ?, ?, ?, ?)",
                 [msg_id, from_email, from_name, title, reply_to, email_date, mail.to_s, category]
    end
  ensure
    db.close if db
  end

  private

  # Finds the CSV user whose address matches +email+; returns the user Hash or nil.
  def lookup_csv_user(users, email)
    return nil if email.nil?

    # Escape first so "." and "+" in the address stay literal, then turn the
    # run of dots before "@" into a wildcard. Built once, outside the scan.
    pattern = Regexp.escape(email).gsub(/(?:\\\.)+@/, ".*@")
    matcher = /\A#{pattern}\z/i

    users.find { |user| user["email"].to_s.strip =~ matcher }
  end

  # Runs one crawler pass inside CRAWLER_DIR: generates +script+ with
  # `crawler.sh <flag>`, executes it, then opens up the group's output folder.
  def run_crawler_pass(flag, script, announcement)
    Dir.chdir CRAWLER_DIR do
      system 'chmod +x ./crawler.sh'
      puts announcement
      system "./crawler.sh #{flag} > #{script}"
      system "chmod +x ./#{script}"
      puts "Iterate through topics to get messages"
      system "./#{script}"
      system "chmod", "-R", "777", @google_group_name
    end
  end

  # Returns the Date header as DateTime#to_s text; blank input is passed through
  # and an unparseable header is stored as "" so one bad message cannot abort
  # the whole run.
  def normalize_email_date(date)
    return date if date.blank?
    DateTime.parse(date).to_s
  rescue ArgumentError
    puts "Could not parse date #{date.inspect}; storing it as blank"
    ""
  end
end

ImportScripts::GoogleGroups.new(ARGV[0]).perform
