Skip to content


Migrating to Magento

Recently, I’ve been helping a friend migrate his web “store” from a collection of static HTML pages (2000+) to Magento.  The old store worked but wasn’t really functional: everything was hardcoded and there was no shopping cart.  After doing some research, I decided to give Magento a try.  This meant we had to migrate the existing static HTML pages, both “product” pages and “expository” pages.

I started by writing a Perl script to parse the pages and pull out the relevant product data.  The original pages were done in one of the WYSIWYG editors, so there was a somewhat standard format.  I wrote a small IdentityParse package (a subclass of HTML::Parser) to parse and, where necessary, preserve the original HTML. I was able to pull out ~750 products with their attributes (e.g. price, description, SKU, etc.) and, in some places, the images associated with the product.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
#!/usr/bin/perl -w
use strict;
# Lexically disables every warning category for the rest of this file; inside
# this scope the pragma takes precedence over the -w switch on the shebang.
no warnings "all";
# HTML::Parser subclass whose text/start/end handlers (defined below) rebuild
# a cleaned-up copy of the parsed markup in $plainContent.
package IdentityParse;
use base "HTML::Parser";
 
# File-scoped state. These lexicals are visible to the handlers in this
# package and to the driver code in package main further down the file.
my $plainContent;          # markup accumulated by the text/start/end handlers
my $description;           # reset per input file by the driver
my $format="";             # reset per input file by the driver
my $price="";              # reset per input file by the driver
my $itemNum="";            # reset per input file by the driver
my $length="";             # reset per input file by the driver
my $author="";             # reset per input file by the driver
my $allLines = "";         # every line of the current input file, concatenated
my $inMainTable = 0;       # set when a <table> with width 583 is opened
my $inSecondaryTable=0;    # set when any other <table> is opened
my $inHeader=0;            # set while inside an h1/p of class hdrPage
my $inUL=0;                # set when a <ul> is opened
my $navTrailText = "";     # cleared per input file by the driver
my %categoryHierarcy;      # not referenced by the code visible in this file
 
# Returns a copy of the single scalar argument with leading and trailing
# whitespace removed. The caller's value is left untouched.
sub trim($)
{
	my ($text) = @_;
	for ($text) {
		s/^\s+//;
		s/\s+$//;
	}
	return $text;
}
 
 
# Text handler: appends a chunk of document text to $plainContent after
# collapsing every run of two or more whitespace characters into one space.
sub text {
	my $self  = shift;
	my $chunk = shift;

	$chunk =~ s/\s{2,}/ /g;
	# As written this transliteration maps '"' onto '"' (the surplus
	# replacement character is ignored), so the text is left unchanged.
	$chunk =~ tr/"/""/;

	$plainContent = $plainContent . $chunk;
}
 
# Start-tag handler: decides, per opening tag, what (if anything) is appended
# to $plainContent and updates the table/header/list state flags.
#
#   $tag      - tag name
#   $attr     - hash ref of the tag's attributes
#   $attrseq  - unused here
#   $origtext - the tag exactly as it appeared in the input
sub start {
	my ($self, $tag, $attr, $attrseq, $origtext) = @_;

	# Replacement markup emitted in place of a page-header h1/p.
	my $headingOpen = "<div class=\"page-head\"><h3>";

	if ($tag =~ m/^table$/i) {
		if ($attr->{'width'} =~ m/^583$/i) {
			# The table of width 583 is the page wrapper: track it, emit nothing.
			$inMainTable = 1;
		}
		else {
			# Any other table is kept verbatim.
			$inSecondaryTable = 1;
			$plainContent .= $origtext;
		}
	}
	elsif ($inMainTable && ($tag =~ m/^tr$/i || $tag =~ m/^td$/i)) {
		# Rows/cells of the wrapper table are dropped; those of an inner
		# table are kept.
		$plainContent .= $origtext if $inSecondaryTable;
	}
	elsif (($tag =~m/^h1$/i || $tag =~m/^p$/i) && $attr->{'class'} =~ m/^hdrPage$/i) {
		$inHeader = 1;
		$plainContent .= $headingOpen;
	}
	elsif ($tag=~m/^img$/i) {
		my $imagePath = $attr->{'src'};
		# Spacer images are discarded; every other image is re-pointed at the
		# skin image directory, keeping only the file name.
		unless ($imagePath =~ m/spacer\.gif$/) {
			my $filename = (split(/\//, $imagePath))[-1];
			$plainContent .= "<img src=\"/skin/frontend/4ulr/blue/images/$filename\" align=\"right\">";
		}
	}
	elsif ($tag=~/^ul$/i) {
		$inUL = 1;
		$plainContent .= "<ul class=\"disc\">";
	}
	else {
		# Everything else passes through untouched.
		$plainContent .= $origtext;
	}
}
# End-tag handler: mirrors start() for closing tags.
#
# In HTML::Parser's subclassing interface the end handler is called as
# $self->end($tagname, $origtext). It receives no attribute arguments, so the
# second value is the literal closing tag, and it is named accordingly here.
#
#   $tag      - tag name
#   $origtext - the closing tag exactly as it appeared in the input
#
# Appends to $plainContent and clears the table/header/list state flags.
sub end {
	my ($self, $tag, $origtext) = @_;

	if ($tag =~ m/^table$/i && $inMainTable) {
		if ($inSecondaryTable) {
			# Closing an inner table: keep the tag.
			$inSecondaryTable = 0;
			$plainContent .= $origtext;
		}
		else {
			# Closing the wrapper table: emit nothing.
			$inMainTable = 0;
		}
	}
	elsif ($tag =~ m/^t[rd]$/i && $inMainTable) {
		# Rows/cells of the wrapper table are dropped; inner ones are kept.
		$plainContent .= $origtext if $inSecondaryTable;
	}
	elsif ($tag =~ m/^(?:h1|p)$/i && $inHeader) {
		# Close the replacement heading and open the content container.
		$inHeader = 0;
		$plainContent .= "</h3></div><div class=\"content\">";
	}
	else {
		# Leaving a list clears the flag that start() set.
		$inUL = 0 if $tag =~ m/^ul$/i;
		$plainContent .= $origtext;
	}
}
 
 
package main;

# Driver: for every HTML file found under the counseling products directory,
# extract the lines between the "tbl main content" marker comments, run them
# through the IdentityParse handlers, and write one tab-separated
# "filename<TAB>cleaned markup" row per file to counselinglistHTML.csv.
my $p = IdentityParse->new;
my $categories={};
my @files = get_htmls("/home/juice/4ulr/parsing/products/counseling/");
my $outputName = "counselinglistHTML.csv";
open(my $output, '>', $outputName)
	or die "Unable to open $outputName for writing: $!";
print {$output} "filename\tbodyContent\n";


foreach my $file (@files){
	print "$file\n";

	# Reset the per-file shared state.
	$plainContent="";
	$description="";
	$allLines="";
	$format="";
	$price="";
	$itemNum="";
	$length="";
	$author="";

	my $input;
	unless (open($input, '<', $file)) {
		# An unreadable file is reported and skipped instead of silently
		# producing an empty row.
		warn "Unable to open $file: $!";
		next;
	}
	my @lines = <$input>;
	close $input;
	chomp(@lines);

	my $title="";
	my $contentStart=0;
	my @contentLines;
	foreach my $line (@lines){
		$line =~ s/\r//g;
		$allLines .= $line;
		if ($line =~ m/<title>(.*)<\/title>/i){
			$title = trim($1);
		}

		# Both marker lines themselves are included in the captured content.
		if ($line =~ m/<!---------- tbl main content ---------->/i){
			$contentStart = 1;
		}
		if ($contentStart){
			push(@contentLines, $line);
		}
		if ($line =~ m/<!-------- \/ tbl main content ---------->/i){
			$contentStart = 0;
		}
	}
	print "\n";

	my $content = join('', @contentLines);

	# Parse to get the content. eof must be signalled before $plainContent is
	# read: the parser may hold back trailing text until then, and eof also
	# readies the parser for the next document.
	$plainContent="";
	$p->parse($content);
	$p->eof;

	my $vanillaContent = $plainContent;
	$vanillaContent =~ s/\s{2,}/ /g;
	$vanillaContent .= "</div>";

	print "title = $title\n";
	$navTrailText="";
	print {$output} "$file\t$vanillaContent\n";
}

# Buffered write errors only surface at close, so check it.
close($output)
	or die "Unable to finish writing $outputName: $!";
 
# Strips whitespace from both ends of the given scalar and returns the
# result; the argument itself is not modified.
sub trim($)
{
	(my $trimmed = $_[0]) =~ s/^\s+//;
	$trimmed =~ s/\s+$//;
	return $trimmed;
}
 
# Recursively collects the HTML files beneath a directory.
#
#   $path - directory to scan
#
# Returns a list of "$path/name" strings for every entry, at any depth, whose
# name ends in ".htm" or ".html" and that is not a symbolic link.
# Dies if a directory cannot be opened.
sub get_htmls {
    my $path    = shift;

    # A lexical handle keeps each recursion level's directory handle private
    # and lets it be closed as soon as the listing has been read.
    opendir (my $dh, $path)
        or die "Unable to open $path: $!";

    my @entries =
        map { $path . '/' . $_ }
        grep { !/^\.{1,2}$/ }
        readdir ($dh);

    closedir ($dh);

    # Rather than using a for() loop, we can just
    # return a directly filtered list.
    return
        grep { (/\.html?$/) && (! -l $_) }
        map { -d $_ ? get_htmls ($_) : $_ }
        @entries;
}

I then started using Magento’s web service to load the data.  Unfortunately, it was slow and I couldn’t get some of the custom attributes to load.  I found some relevant posts in the Magento forums and switched over to a PHP script that uses the Magento classes to load the data.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
#!/usr/bin/php
<?php
/**
 * Bulk-loads simple products into Magento from a CSV file.
 *
 * Each CSV row is expected to hold, in order: title, description,
 * bodyContent, format, length, author, itemNum, price, rental, keywords,
 * drop. Rows without an item number or a numeric price are reported and
 * skipped. For every product that saves successfully, a stock item with
 * quantity 0 and "in stock" set is created or updated.
 */
define('MAGENTO', realpath('/var/www/magento'));
ini_set('memory_limit', '128M');

require_once MAGENTO . '/app/Mage.php';

Mage::app();

//$ourFileName = "listshort.csv"
$ourFileName = "Catalog_short.csv";
$fh = fopen($ourFileName, 'r') or die("Can't open file");

// fgetcsv() returns false at end of file; looping on its result (instead of
// feof()) avoids destructuring a non-array after the last row.
while (($row = fgetcsv($fh)) !== false) {
	// Pad short/blank rows so every expected column exists as a string.
	$row = array_map('strval', array_pad($row, 11, ''));
	list($title, $description, $bodyContent, $format, $length, $author, $itemNum, $price, $rental, $keywords, $drop) = $row;

	// Keep only digits and the decimal point (drops currency symbols etc.).
	$price = preg_replace('/[^0-9.]/', '', $price);

	// Image import is currently disabled: the CSV carries no image column.
	$imageSet  = false;
	$imageName = null;
/*	if (strlen($imageName)){
		$imageName = "/var/www/magento/media/catalog/product" . $imageName;
		$imageSet = true;
	}*/

	if (strlen($itemNum) == 0 || strlen($price) == 0) {
		echo "$price, $itemNum not added\n";
		continue;
	}

	$product = Mage::getModel('catalog/product');
	$product->setTypeId('simple');
	$product->setTaxClassId(0); //none
	$product->setWebsiteIds(array(1));  // store id
	$product->setAttributeSetId(4);
//	$product->setMediaFormat(6);
//	$product->setMetaCategory(array(3,4));
	$product->setSku(str_replace("\n", "", $itemNum));
	$product->setName(str_replace("\n", "", $title));
	$product->setDescription($bodyContent);
	$product->setInDepth($bodyContent);
	$product->setPrice($price);
	$product->setShortDescription(str_replace("\n", "", $description));
	$product->setWeight(0);
	$product->setStatus(1);
	// Was misspelled "setVisibilty", which stored the value under a different
	// data key and left the real visibility attribute unset.
	// NOTE(review): 4 is presumably "Catalog, Search" - confirm.
	$product->setVisibility(4);
	$product->setMetaDescription(str_replace("\n", "", $description));
	$product->setMetaTitle(str_replace("\n", "", $title));
	$product->setMetaKeywords($keywords);
//	$product->setUrlKey($urlKey);

	$visibility = array (
		'thumbnail',
		'small_image',
		'image'
	);
	if ($imageSet){
		try {
			$product->addImageToMediaGallery($imageName, $visibility, false, false);
		}
		catch (Exception $e) {echo "no image\n";}
	}

	try {
		$product->save();
		echo "$price, $itemNum added\n";
	}
	catch (Exception $e) {
		echo "$price, $itemNum not added\n";
		fwrite(STDERR, "  save failed: " . $e->getMessage() . "\n");
		// Without a saved product there is no id to attach stock data to.
		continue;
	}

	$stockItem = Mage::getModel('cataloginventory/stock_item');
	$stockItem->loadByProduct($product->getId());

	if (!$stockItem->getId()) {
		$stockItem->setProductId($product->getId())->setStockId(1);
	}
	$stockItem->setData('qty', 0);
	$stockItem->setData('is_in_stock', 1);

	$stockItem->save();
}

fclose($fh);

?>

I then went back and began parsing the “expository” pages in the site.  Again using the magento classes I was able to create “CMS pages”.  Now I just need to change all the links so that they work 8).

Posted in magento.

Tagged with , , .


0 Responses

Stay in touch with the conversation, subscribe to the RSS feed for comments on this post.



Some HTML is OK

or, reply to this post via trackback.

Spam Protection by WP-SpamFree