/* **********************************************************************
    Copyright 2003 Rensselaer Polytechnic Institute.

    All worldwide rights reserved. A license to use, copy, modify and
    distribute this software for noncommercial research purposes only is
    hereby granted, provided that this copyright notice and accompanying
    disclaimer is not modified or removed from the software.

    DISCLAIMER: The software is distributed" AS IS" without any express or
    implied warranty, including but not limited to, any implied warranties
    of merchantability or fitness for a particular purpose or any warrant)'
    of non-infringement of any current or pending patent rights. The authors
    of the software make no representations about the suitability of this
    software for any particular purpose. The entire risk as to the quality
    and performance of the software is with the user. Should the software
    prove defective, the user assumes the cost of all necessary servicing,
    repair or correction. In particular, neither Rensselaer Polytechnic
    Institute, nor the authors of the software are liable for any indirect,
    special, consequential, or incidental damages related to the software,
    to the maximum extent the law permits.
*/

package edu.rpi.sss.util;

import org.apache.log4j.Logger;

import java.util.Vector;

/**
 *
 * @author  Mike Douglass
 * @version
 * ParseURL.java
 *
 * Created on January 12, 2000, 10:01 AM
 * I'd really like to use the URL stuff here but we're unable to do that
 * because the java code enforces a number of security restrictions that
 * complicate matters.
 *
 * In any case, the needs are somewhat specialized so maybe it won't matter.

      A URL looks like:
        <scheme>://<host>:<port>/<path>?<query>[?query]...#ref
      Each part is optional and is signalled by the presence of certain
      character strings.
      <scheme>  presence signalled by "://"
      <host>      presence signalled by other than one of "/", "?", "#"
      <port>      presence signalled by ":" in host string
      <path>      indicated by "/"
      <query>     indicated by "?"
      <ref>       indicated by "#"

   One problem is that of distinguishing between a non-fully specified path
   and a host. For example
      luwakChannel://RPInews
   Is RPInews a host or is it a channel name?
   We need to unambiguously distinguish the two. I think we will require a
   leading "/" to indicate the presence or otherwise of the host. So the above
   will be interpreted as a host name and
      luwakChannel:///RPInews
   means it's a path.
  */

public class ParseURL {
  static final String schemeSpec = "://";
  static final int schemeSpecLength = schemeSpec.length();

  public static final String pathSpec = "/";
  static final int pathSpecLength = pathSpec.length();

  public static final String querySpec = "?";
  static final int querySpecLength = querySpec.length();

  public static final String refSpec = "#";
  static final int refSpecLength = refSpec.length();

  /** This is a bit of a kludge to allow us to distinguish simple names from a
      hostname.
     If we don't find any of the usual elements we see if there is this
     char in the string. If there is it's a host, otherwise it's a simple name.
  */
  private static final String hostChar = ".";
  private static boolean allowSimpleNames = true;

  /** These are the parsed elements of the url */
  private String scheme = null;

  private String host = null;
  private int port = -1;

  private DelimitedString path = null;
  private DelimitedString query = null;
  private String ref = null;
  /** end of url properties */

  private Logger log;

  /** Parse the url and break it into its components, logging to a logger
      obtained for this class.

      @param url  the url to parse; if null nothing is parsed
      @throws Exception on an invalid port or invalid %hh encoding
   */
  public ParseURL(String url) throws Exception {
    this(url, null);
  }

  /** Parse the url and break it into its components.

      @param url  the url to parse; if null nothing is parsed
      @param log  logger to use; if null one is obtained for this class
      @throws Exception on an invalid port or invalid %hh encoding
   */
  public ParseURL(String url, Logger log) throws Exception {
    if (log != null) {
      this.log = log;
    } else {
      this.log = Logger.getLogger(getClass());
    }
    if (url != null) {
      parse(url);
    }
  }

  /** Break the given url into scheme, host, port, path, query and ref.
      Any components left over from a previous call are discarded first.

      The path, query and ref parts are passed through decode().

      @param url  the url to parse, must not be null
      @throws Exception on an invalid port or invalid %hh encoding
   */
  public void parse(String url) throws Exception {
    // Only build trace messages when debug output will actually be emitted.
    boolean trace = ((log != null) && (log.isDebugEnabled()));
    int len = url.length();
    int pos = 0;
    int temp;
    int queryI = -1;
    int refI = -1;

    // Forget the results of any earlier parse so they cannot leak into
    // this one (path is recreated below).
    scheme = null;
    host = null;
    port = -1;
    query = null;
    ref = null;

    if (trace) {
      log.debug("ParseURL: about to parse " + url);
    }

    // Look for a scheme
    temp = url.indexOf(schemeSpec);
    if (temp > 0) {
      // We have one.
      scheme = url.substring(0, temp);
      pos = temp + schemeSpecLength;
    }

    // The host will be found when we look for the other parts.

    // Now look for the path.
    path = new DelimitedString(pathSpec);

    // NOTE: a "/" at index 0 is deliberately not treated as a path start
    // here; such a string falls through to the simple name/host handling.
    temp = url.indexOf(pathSpec, pos);
    if (temp > 0) {
      if (temp > pos) {
        // Skipped over host+port
        doHost(url.substring(pos, temp));
      }

      temp += pathSpecLength;

      // We have a path - look for the end part.
      queryI = url.indexOf(querySpec, temp);

      if (queryI > 0) {
        path.setStr(decode(url.substring(temp, queryI)));
        if (trace) {
          log.debug("ParseURL: path(q): " + url.substring(temp, queryI));
        }
        // Leave pos on the "?" - the query code below steps over it.
        pos = queryI;
      } else {
        refI = url.indexOf(refSpec, temp);

        if (refI > 0) {
          path.setStr(decode(url.substring(temp, refI)));
          if (trace) {
            log.debug("ParseURL: path(r): " + url.substring(temp, refI));
          }
          // Leave pos on the "#" - the ref code below steps over it.
          pos = refI;
        } else {
          // It's the remainder.
          path.setStr(decode(url.substring(temp)));
          if (trace) {
            log.debug("ParseURL: path: " + url.substring(temp));
          }
          pos = len;
        }
      }
    }

    if (pos != len) {
      // There's more to come.
      if (queryI < 0) queryI = url.indexOf(querySpec, pos);
      if (queryI > 0) {
        if (queryI > pos) {
          // Skipped over host+port
          doHost(url.substring(pos, queryI));
        }

        // We have a query. May be followed by ref.
        query = new DelimitedString(querySpec);
        queryI += querySpecLength;

        refI = url.indexOf(refSpec, queryI);

        if (refI > 0) {
          query.setStr(decode(url.substring(queryI, refI)));
          pos = refI;
        } else {
          // It's the remainder.
          query.setStr(decode(url.substring(queryI)));
          pos = len;
        }
      }
    }

    if (pos != len) {
      // There's more to come.
      if (refI < 0) refI = url.indexOf(refSpec, pos);
      if (refI > 0) {
        if (refI > pos) {
          // Skipped over host+port
          doHost(url.substring(pos, refI));
        }

        // We have a ref. Will be remainder
        refI += refSpecLength;

        ref = decode(url.substring(refI));
        pos = len;
      }
    }

    if ((allowSimpleNames) && (pos == 0)) {
      // Nothing recognised so far.
      // See if there is a host separator char in the string.
      if (url.indexOf(hostChar, 0) < 0) {
        // Assume a simple name.
        path.setStr(decode(url));
        pos = len;
      }
    }

    if (pos != len) {
      // Must be host + port
      doHost(url.substring(pos));
    }

    if (trace) {
      StringBuffer sb = new StringBuffer("ParseURL: ");
      sb.append("scheme=");
      sb.append(scheme);
      sb.append(" host=");
      sb.append(host);
      sb.append(" port=");
      sb.append(port);
      sb.append(" path=");
      sb.append(path.toString());
      if (query != null) {
        sb.append(" query=");
        sb.append(query.toString());
        sb.append(" querysize=");
        sb.append(query.size());
      }
      if (ref != null) {
        sb.append(" ref=");
        sb.append(ref);
      }

      log.debug(sb.toString());
    }
  }

  /** Replace the logger. A null logger disables tracing in parse().
   */
  public void setLog(Logger log) {
    this.log = log;
  }

  public Logger getLog() {
    return log;
  }

  /** Methods returning url components
   */

  /** Return the scheme, or null if none was found.
   */
  public String getScheme() {
    return scheme;
  }

  /** Return the host, or null if none was found.
   */
  public String getHost() {
    return host;
  }

  /** Return the port, or -1 if none was found.
   */
  public int getPort() {
    return port;
  }

  /** Return the path as a DelimitedString broken into components
   */
  public DelimitedString getPath() {
    return path;
  }

  /** Return the complete path as a string.
      Note that path is null until parse() has been called.
   */
  public String getPathStr() {
    return path.getStr();
  }

  /** Return the query as a DelimitedString broken into components,
      or null if there is no query.
   */
  public DelimitedString getQuery() {
    return query;
  }

  /** Return the complete query as a string, or null if there is no query.
   */
  public String getQueryStr() {
    if (query == null) return null;
    return query.getStr();
  }

  /** Return the number of query elements, 0 if there is no query.
   */
  public int getQuerySize() {
    if (query == null) return 0;
    return query.size();
  }

  /** Return the i'th query element.

      @param i  zero based index, must be less than getQuerySize()
      @throws Exception if there is no query or the index is out of range
   */
  public String getQueryElement(int i) throws Exception {
    if (query == null) throw new Exception("ParseURL: No query defined");
    if ((i < 0) || (i >= query.size())) {
      throw new Exception("ParseURL: Illegal query index");
    }
    return query.element(i);
  }

  /** Return the ref, or null if none was found.
   */
  public String getRef() {
    return ref;
  }

  /** toString return a string representing the URL
      We watch for the case of nothing but a simple name, in which case no
      leading path separator is emitted.

      @return the url, or null if there is no path
   */
  public String toString() {
    if ((path == null) || (path.isNull())) return null;

    boolean simpleName = true;
    StringBuffer sb = new StringBuffer();

    if (scheme != null) {
      sb.append(scheme);
      sb.append(schemeSpec);
      simpleName = false;
    }

    if (host != null) {
      sb.append(host);

      if (port >= 0) {
        sb.append(':');
        sb.append(port);
      }
      simpleName = false;
    }

    if ((query != null) || (ref != null)) {
      simpleName = false;
    }

    if (!simpleName) {
      sb.append(pathSpec);
    }

    path.makeStr();
    sb.append(path.getStr());

    if (query != null) {
      sb.append(querySpec);
      query.makeStr();
      sb.append(query.getStr());
    }

    if (ref != null) {
      sb.append(refSpec);
      sb.append(ref);
    }

    return sb.toString();
  }

  /**
    decode: Method to decode an RFC 1738 encoded string.
    Strings of the form %hh, where 'hh' is the two digit hex representation of
    the character are decoded into the character.

    Each %hh is turned into the single char with that value; no multi-byte
    character set decoding is attempted.

    @param str  the string to decode, must not be null
    @return the decoded string; str itself if it contains no '%'
    @throws Exception if a '%' is not followed by exactly two hex digits
  */
  public static final String decode(String str)	throws Exception {
    if (str.indexOf('%') < 0) return str;

    int len = str.length();
    StringBuffer res = new StringBuffer(len);

    int i = 0;
    while (i < len) {
      char ch = str.charAt(i);

      if (ch != '%') {
        res.append(ch);
        i++;
        continue;
      }

      // Need two more characters, both hex digits. Checking the digits
      // individually rejects signed forms such as "%+1" which
      // Integer.parseInt would accept.
      if (i + 2 >= len) {
        throw new Exception("Invalid URL encoding: " + str);
      }

      int hi = hexValue(str.charAt(i + 1));
      int lo = hexValue(str.charAt(i + 2));

      if ((hi < 0) || (lo < 0)) {
        throw new Exception("Invalid URL encoding: " + str);
      }

      res.append((char)((hi << 4) | lo));
      i += 3;
    }

    return res.toString();
  }

  /** Return the value of an ASCII hex digit or -1 if ch is not one.
   */
  private static int hexValue(char ch) {
    if ((ch >= '0') && (ch <= '9')) return ch - '0';
    if ((ch >= 'A') && (ch <= 'F')) return ch - 'A' + 10;
    if ((ch >= 'a') && (ch <= 'f')) return ch - 'a' + 10;
    return -1;
  }

  /** From rfc 1738
   Octets must be encoded if they have no corresponding graphic
   character within the US-ASCII coded character set, if the use of the
   corresponding character is unsafe, or if the corresponding character
   is reserved for some other interpretation within the particular URL
   scheme.

   No corresponding graphic US-ASCII:

   URLs are written only with the graphic printable characters of the
   US-ASCII coded character set. The octets 80-FF hexadecimal are not
   used in US-ASCII, and the octets 00-1F and 7F hexadecimal represent
   control characters; these must be encoded.

   Unsafe:

   Characters can be unsafe for a number of reasons.  The space
   character is unsafe because significant spaces may disappear and
   insignificant spaces may be introduced when URLs are transcribed or
   typeset or subjected to the treatment of word-processing programs.
   The characters "<" and ">" are unsafe because they are used as the
   delimiters around URLs in free text; the quote mark (""") is used to
   delimit URLs in some systems.  The character "#" is unsafe and should
   always be encoded because it is used in World Wide Web and in other
   systems to delimit a URL from a fragment/anchor identifier that might
   follow it.  The character "%" is unsafe because it is used for
   encodings of other characters.  Other characters are unsafe because
   gateways and other transport agents are known to sometimes modify
   such characters. These characters are "{", "}", "|", "\", "^", "~",
   "[", "]", and "`".

   All unsafe characters must always be encoded within a URL. For
   example, the character "#" must be encoded within URLs even in
   systems that do not normally deal with fragment or anchor
   identifiers, so that if the URL is copied into another system that
   does use them, it will not be necessary to change the URL encoding.

   Reserved:

   Many URL schemes reserve certain characters for a special meaning:
   their appearance in the scheme-specific part of the URL has a
   designated semantics. If the character corresponding to an octet is
   reserved in a scheme, the octet must be encoded.  The characters ";",
   "/", "?", ":", "@", "=" and "&" are the characters which may be
   reserved for special meaning within a scheme. No other characters may
   be reserved within a scheme.

   Usually a URL has the same interpretation when an octet is
   represented by a character and when it encoded. However, this is not
   true for reserved characters: encoding a character reserved for a
   particular scheme may change the semantics of a URL.

   Thus, only alphanumerics, the special characters "$-_.+!*'(),", and
   reserved characters used for their reserved purposes may be used
   unencoded within a URL.

   The encode method below assumes that any of the above characters must be
   encoded, that is, the string parameter is a URL fragment, not a complete
   URL to be encoded.
  */

  // Unsafe and reserved characters, other than '%' and the non-printables
  // which encode(String) handles separately.
  final static char[] specials =
  {' ', '<', '>', '"', '#', '{', '}', '|', '\\', '^', '~', '[', ']', '`',
   '\'', ';', '/', '?', ':', '@', '=', '&'};
  final static int slen = specials.length;

  /** Encode a URL fragment as %hh sequences: '%' itself, the octets 00-1F
      and 7F-FF, and every character in specials are replaced.
      Characters above 0xFF are copied through unchanged.

      @param str  the fragment to encode, must not be null
      @return the encoded fragment
      @throws Exception declared for interface compatibility
   */
  public static final String encode(String str) throws Exception {
    int len = str.length();
    StringBuffer res = new StringBuffer(len + 16);

    // A single pass is equivalent to encoding '%' first and then every
    // other character in turn: the text emitted for an encoded character
    // ('%' plus hex digits) is never itself re-encoded.
    for (int i = 0; i < len; i++) {
      char ch = str.charAt(i);

      if (mustEncode(ch)) {
        appendEncoded(res, ch);
      } else {
        res.append(ch);
      }
    }

    return res.toString();
  }

  /** Return true if encode(String) must replace ch with a %hh sequence.
   */
  private static boolean mustEncode(char ch) {
    if (ch == '%') return true;

    // Non-printables: 00-1F inclusive, and 7F-FF.
    if (ch <= 0x1F) return true;
    if ((ch >= 0x7F) && (ch <= 0xFF)) return true;

    for (int i = 0; i < slen; i++) {
      if (specials[i] == ch) return true;
    }

    return false;
  }

  private static final char[] hex = {'0', '1', '2', '3', '4', '5', '6', '7', '8',
                              '9', 'A', 'B', 'C', 'D', 'E', 'F'};

  /** Append the %hh form of ch, which must be in the range 00-FF.
   */
  private static void appendEncoded(StringBuffer buf, char ch) {
    buf.append('%');
    buf.append(hex[(ch >> 4) & 0x0F]);
    buf.append(hex[ch & 0x0F]);
  }

  /** Used to encode the given character. Call if any character outside of the
      normal requirements must be encoded.

      Every occurrence of ch in str is replaced by its %hh form; all other
      characters are copied unchanged.

      @param str  the string to encode, must not be null
      @param ch   the character to replace
      @return the encoded string; str itself if it does not contain ch
      @throws Exception if ch occurs in str but cannot be represented as a
                        single %hh octet (ch greater than 0xFF)
   */
  public static final String encode(String str, char ch) throws Exception {
    if (str.indexOf(ch) < 0) return str;

    if (ch > 0xFF) {
      throw new Exception("ParseURL: Cannot encode character 0x" +
                          Integer.toHexString(ch) + " as a single octet");
    }

    int len = str.length();
    StringBuffer res = new StringBuffer(len + 16);
    for (int i = 0; i < len; i++) {
      char cur = str.charAt(i);

      if (cur == ch) {
        appendEncoded(res, ch);
      } else {
        res.append(cur);
      }
    }

    return res.toString();
  }

  /** Private methods */

  /** Split a host[:port] string into the host and port fields.
      The port is only assigned when a ":" is present.

      @param hostStr  the host part of the url, possibly with ":port"
      @throws Exception if the text after the ":" is not an integer
   */
  private void doHost(String hostStr) throws Exception {
    // If there's a ":" the port follows and should be integer.
    int portI = hostStr.indexOf(':');

    if (portI < 0) {
      this.host = hostStr;
      return;
    }

    this.host = hostStr.substring(0, portI);
    String portStr = hostStr.substring(portI + 1);
    try {
      port = Integer.parseInt(portStr);
    } catch (NumberFormatException e) {
      // Keep the original failure as the cause for diagnosis.
      throw new Exception("ParseURL: Invalid port " + portStr, e);
    }
  }

}
